Why does this kernel die? Accessing pitched memory

Hi,

why does my kernel die for WIDTH_PX not equal to the pitch of the allocation?

Can anyone reproduce this?

#include <stdio.h>

//height and width

#define HEIGHT_PX  437

#define WIDTH_PX  645

//NOTE: Choose WIDTH_PX an arbitrary value not equal to the pitch 

//of the allocation (e.g. 645) and the kernel dies on my machine with an access 

//violation. Why?

#define BLOCK_SIDE_LENGTH  16

//Test texture fetch

__global__ void testKernel(unsigned char* arr1, size_t pitch_arr1)

{

	//Calculate x and y coordinate

	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	

	if (x < WIDTH_PX && y < HEIGHT_PX)

	{

		unsigned char* currentpos = (unsigned char*) ((char*) arr1 + y * pitch_arr1) + x;

		*currentpos = 255;

	}

}

int main()

{

	//Allocate array

	unsigned char arr1[HEIGHT_PX * WIDTH_PX] = {0};

	unsigned char* d_pointer_arr1;

	size_t pitch_arr1;

	

	//Upload array

	cudaMallocPitch((void**) &d_pointer_arr1, &pitch_arr1, WIDTH_PX, HEIGHT_PX);

	cudaMemcpy2D(d_pointer_arr1, pitch_arr1, arr1, WIDTH_PX, WIDTH_PX, HEIGHT_PX, cudaMemcpyHostToDevice);

	

	//print pitch

	printf("pitch: %i\n", pitch_arr1);

	

	//Threads per block

	dim3 threadsPerBlock(BLOCK_SIDE_LENGTH, BLOCK_SIDE_LENGTH);

	

	//Calculate number of blocks

	unsigned int blocksx = (WIDTH_PX		+ BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;

	unsigned int blocksy = (HEIGHT_PX		+ BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;

	dim3 numBlocks(blocksx, blocksy);

	//Start Kernel

	testKernel<<<numBlocks, threadsPerBlock>>>(arr1, pitch_arr1);

	

	//Copy back to array

	cudaMemcpy2D(arr1, WIDTH_PX, d_pointer_arr1, pitch_arr1, WIDTH_PX, HEIGHT_PX, cudaMemcpyDeviceToHost);

}

REgards,

Kwyjibo

Hi,

why does my kernel die for WIDTH_PX not equal to the pitch of the allocation?

Can anyone reproduce this?

#include <stdio.h>

//height and width

#define HEIGHT_PX  437

#define WIDTH_PX  645

//NOTE: Choose WIDTH_PX an arbitrary value not equal to the pitch 

//of the allocation (e.g. 645) and the kernel dies on my machine with an access 

//violation. Why?

#define BLOCK_SIDE_LENGTH  16

//Test texture fetch

__global__ void testKernel(unsigned char* arr1, size_t pitch_arr1)

{

	//Calculate x and y coordinate

	unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

	

	if (x < WIDTH_PX && y < HEIGHT_PX)

	{

		unsigned char* currentpos = (unsigned char*) ((char*) arr1 + y * pitch_arr1) + x;

		*currentpos = 255;

	}

}

int main()

{

	//Allocate array

	unsigned char arr1[HEIGHT_PX * WIDTH_PX] = {0};

	unsigned char* d_pointer_arr1;

	size_t pitch_arr1;

	

	//Upload array

	cudaMallocPitch((void**) &d_pointer_arr1, &pitch_arr1, WIDTH_PX, HEIGHT_PX);

	cudaMemcpy2D(d_pointer_arr1, pitch_arr1, arr1, WIDTH_PX, WIDTH_PX, HEIGHT_PX, cudaMemcpyHostToDevice);

	

	//print pitch

	printf("pitch: %i\n", pitch_arr1);

	

	//Threads per block

	dim3 threadsPerBlock(BLOCK_SIDE_LENGTH, BLOCK_SIDE_LENGTH);

	

	//Calculate number of blocks

	unsigned int blocksx = (WIDTH_PX		+ BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;

	unsigned int blocksy = (HEIGHT_PX		+ BLOCK_SIDE_LENGTH - 1) / BLOCK_SIDE_LENGTH;

	dim3 numBlocks(blocksx, blocksy);

	//Start Kernel

	testKernel<<<numBlocks, threadsPerBlock>>>(arr1, pitch_arr1);

	

	//Copy back to array

	cudaMemcpy2D(arr1, WIDTH_PX, d_pointer_arr1, pitch_arr1, WIDTH_PX, HEIGHT_PX, cudaMemcpyDeviceToHost);

}

REgards,

Kwyjibo

Just realized, that I m passing a host pointer to a kernel.

However, the problem still exists.

Just realized, that I m passing a host pointer to a kernel.

However, the problem still exists.