Hi,
why does my kernel die for WIDTH_PX not equal to the pitch of the allocation?
Can anyone reproduce this?
#include <stdio.h>
//height and width
#define HEIGHT_PX 437
#define WIDTH_PX 645
//NOTE: Choose WIDTH_PX an arbitrary value not equal to the pitch
//of the allocation (e.g. 645) and the kernel dies on my machine with an access
//violation. Why?
#define BLOCK_SIDE_LENGTH 16
//Test texture fetch
__global__ void testKernel(unsigned char* arr1, size_t pitch_arr1)
{
//Calculate x and y coordinate
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < WIDTH_PX && y < HEIGHT_PX)
{
unsigned char* currentpos = (unsigned char*) ((char*) arr1 + y * pitch_arr1) + x;
*currentpos = 255;
}
}
int main()
{
//Allocate array
unsigned char arr1[HEIGHT_PX * WIDTH_PX] = {0};
unsigned char* d_pointer_arr1;
size_t pitch_arr1;
//Upload array
cudaMallocPitch((void**) &d_pointer_arr1, &pitch_arr1, WIDTH_PX, HEIGHT_PX);
cudaMemcpy2D(d_pointer_arr1, pitch_arr1, arr1, WIDTH_PX, WIDTH_PX, HEIGHT_PX, cudaMemcpyHostToDevice);
//print pitch
printf("pitch: %i\n", pitch_arr1);
//Threads per block
dim3 threadsPerBlock(BLOCK_SIDE_LENGTH, BLOCK_SIDE_LENGTH);
//Calculate number of blocks
unsigned int blocksx = (WIDTH_PX + BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;
unsigned int blocksy = (HEIGHT_PX + BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;
dim3 numBlocks(blocksx, blocksy);
//Start Kernel
testKernel<<<numBlocks, threadsPerBlock>>>(arr1, pitch_arr1);
//Copy back to array
cudaMemcpy2D(arr1, WIDTH_PX, d_pointer_arr1, pitch_arr1, WIDTH_PX, HEIGHT_PX, cudaMemcpyDeviceToHost);
}
REgards,
Kwyjibo