cudaMalloc3D and friends: proper use for any data type

Hi all.

After spending a lot of time figuring out how to properly use 3D pitched (linear) allocation in CUDA, I finally got it working and I’m posting my findings.

Most people have problems with pitched sizes and consequently memory corruption:

http://forums.nvidia.com/index.php?showtopic=84371

http://forums.nvidia.com/index.php?showtopic=157812

http://forums.nvidia.com/index.php?showtopic=101833

but they’re a good read and I learned a lot from them.

REVIEWED AND WORKING FOR ANY SIZE (>64K PITCH) (thanks CapJo)

The main point is to properly specify both the cudaExtent and cudaPitchedPtr structs:

Allocate device memory:

(Note: my first version of this post said to disregard everyone and set extent.width = x without the sizeof(datatype). That was wrong. In line with the reference manual: only cudaArrays take the width in elements; everything else needs the physical size in bytes.)

cudaExtent extent;
extent.width  = x * sizeof(datatype);	// width of a row in BYTES for linear (non-array) memory
extent.height = y;
extent.depth  = z;

cudaPitchedPtr mem_device;
cudaMalloc3D(&mem_device, extent);
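A quick sanity check that was not in my original code: the pitch cudaMalloc3D picks is the padded row size in bytes and is always at least extent.width, so it can be worth printing once while debugging:

// not in the original code: print the pitch cudaMalloc3D chose for each row
printf("requested row: %u bytes, actual pitch: %u bytes\n",
       (unsigned)extent.width, (unsigned)mem_device.pitch);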

Copy parameters (host to device):

cudaMemcpy3DParms p = { 0 };
p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x * sizeof(float3), x, y);	// host buffer of x*y*z float3
p.dstPtr = mem_device;
p.extent = extent;
p.kind   = cudaMemcpyHostToDevice;

cudaError_t status = cudaMemcpy3D(&p);
if (status != cudaSuccess) { fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status)); }

MyKernel<<<1,1>>>(mem_device, extent);	// single thread for the demo; see the discussion further down

Access the data as you like (in whatever way makes sense):

__device__ void teste(float3* row, size_t width);	// forward declaration; defined below

__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
	char*  devPtr     = (char*)devPitchedPtr.ptr;
	size_t pitch      = devPitchedPtr.pitch;	// padded row size in bytes
	size_t slicePitch = pitch * extent.height;	// size of one z-slice in bytes

	for (int k = 0; k < extent.depth; k++) {
		char* slice = devPtr + k * slicePitch;
		for (int j = 0; j < extent.height; j++) {
			float3* row = (float3*)(slice + j * pitch);
			teste(row, extent.width / sizeof(float3));	// width is in bytes, divide to get elements
		}
	}
}

__device__ void teste(float3* row, size_t width)
{
	for (int x = 0; x < width; x++) {
		row[x].x += 2;
		row[x].y += 2;
		row[x].z += 2;
		cuPrintf("x:%f y:%f z:%f\n", row[x].x, row[x].y, row[x].z);	// needs cuPrintf.cu (attached further down)
	}
}
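The triple loop above walks the whole volume from a single thread, but the same pitched addressing works for touching one element at (x, y, z). A small helper sketch (not in the attached file, the name element_at is just for illustration), assuming a float3 volume:

// hypothetical helper, not part of the original code
__device__ float3* element_at(cudaPitchedPtr p, cudaExtent extent, int x, int y, int z)
{
	char*  base       = (char*)p.ptr;
	size_t pitch      = p.pitch;
	size_t slicePitch = pitch * extent.height;
	// x indexes float3 elements inside a row; y selects the row; z selects the slice
	return (float3*)(base + z * slicePitch + y * pitch) + x;
}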

THANKS CapJo ( http://forums.nvidia.com/index.php?showtop…st&p=990000 )
pitchf.cu (2.46 KB)

That’s clearly wrong: how should cudaMalloc3D know how much memory you actually need without knowing the size of the datatype, i.e. the extent of memory per line? To allocate memory you need to know the amount!

You must specify the size of your data per line in the x direction, and that means x*sizeof(datatype); that’s exactly why ALL the posts say you need to do it.

Your example only works because the smallest amount of memory allocated per line by cudaMalloc[2D | 3D] is a multiple of 64 bytes, and your data, 5 elements of float3, is 60 bytes and fits into 64 bytes. In other examples it won’t work.
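To see this in isolation, here is a tiny standalone sketch (not from my original thread, and the exact pitch depends on the device and driver) that requests rows of 5 and 20 float3 elements and prints the pitch cudaMalloc3D returns; a width given as a bare element count only happens to survive while the real row still fits inside the padded pitch:

// standalone sketch: compare requested row width to the pitch cudaMalloc3D returns
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
	size_t widths[2] = { 5 * sizeof(float3), 20 * sizeof(float3) };	// 60 and 240 bytes per row
	for (int i = 0; i < 2; i++) {
		cudaExtent e = make_cudaExtent(widths[i], 4, 4);	// width in bytes, height, depth
		cudaPitchedPtr p;
		cudaMalloc3D(&p, e);
		printf("row of %u bytes -> pitch of %u bytes\n", (unsigned)widths[i], (unsigned)p.pitch);
		cudaFree(p.ptr);
	}
	return 0;
}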

And a link without any explanation in the thread I opened doesn’t help people much. As I said in that thread, there is a bug in cudaMemset3D, which has been confirmed by NVIDIA and will be fixed in the next release. cudaMalloc3D works correctly and I used it correctly.

http://forums.nvidia.com/index.php?showtopic=157812

Sorry about what I said, it looks like I’ve offended you. I didn’t mean to. Just bad English, I guess…

I’d spent so much time on this that when I finally saw correct output I wanted to share it, so I replied in several threads about 3D allocation in CUDA (and just pasted the link to this post…).

Back on topic: what modifications do I need to properly allocate, write, modify, and read the data back to the host?

Sorry again…

Corrected!

Hey, thanks tiotempestade. I think this will really help me figure out what’s happening.

Do you by any chance have the cuPrintf.cu file?

cuprintf_2.3_5105025.zip (22.1 KB)

Thanks for the post, this has helped me loads, but as a newbie I have a question…

So, having copied a 3D matrix to the device, why on earth would you run a single thread to access every element? Or have I missed something? I mean, this is how it’s done in the CUDA Programming Guide too.

Based on the code in this post, here is my version, which makes a 3D array, modifies it with a 2D grid of blocks of N threads each (hence 3D), and finally copies the result back.

I hope I haven’t missed something obvious, but let me know if I have…

//3d_array.cu

#include <stdlib.h>

#include <stdio.h>

// Device code

__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
	char*  devPtr     = (char*)devPitchedPtr.ptr;
	size_t pitch      = devPitchedPtr.pitch;
	size_t slicePitch = pitch * extent.height;

	// one thread per element: x from the thread index, y and z from the 2D block index
	int x = threadIdx.x;
	int y = blockIdx.x;
	int z = blockIdx.y;

	char*  slice = devPtr + z * slicePitch;
	float* row   = (float*)(slice + y * pitch);

	row[x] += x + (y * 10) + (z * 100);
}

int main(void)

{

	cudaSetDevice(0);

	const int N = 10;	// constant so the fixed-size host arrays below are valid C++

	float array[N][N][N];

	float result[N][N][N];

	int i,j,k;

	cudaError_t status = cudaSuccess;

	//initialise array

	for(i=0;i<N;i++) {

		for(j=0;j<N;j++) {

			for(k=0;k<N;k++) {

				array[i][j][k] = 0.0;

				result[i][j][k] = 0.0;

			}

		}

	}

	//allocate memory on device for a 3D matrix

	cudaExtent extent;
	extent.width  = N * sizeof(float);
	extent.height = N;
	extent.depth  = N;

	cudaPitchedPtr mem_device;

	status=cudaMalloc3D(&mem_device,extent);

	if(status != cudaSuccess){fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));}

	//copy memory to device

	cudaMemcpy3DParms p = { 0 };

	p.srcPtr = make_cudaPitchedPtr((void*)array, N*sizeof(float),N,N);

	p.dstPtr = mem_device;

	p.extent = extent;

	p.kind = cudaMemcpyHostToDevice;

	status=cudaMemcpy3D(&p);

	if(status != cudaSuccess){fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));}

	//run 3d kernel!

	dim3 blocks_per_grid(N,N);

	MyKernel <<<blocks_per_grid, N>>> (mem_device, extent);

	//copy result array back to host

	cudaMemcpy3DParms q = {0};

	q.srcPtr = mem_device;

	q.dstPtr = make_cudaPitchedPtr((void*)result,N*sizeof(float),N,N);

	q.extent=extent;

	q.kind = cudaMemcpyDeviceToHost;

	status=cudaMemcpy3D(&q);

	if(status != cudaSuccess){fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));}

	//print the result

	for(i=0;i<N;i++) {

		for(j=0;j<N;j++) {

			for(k=0;k<N;k++) {

				printf("%f ", result[i][j][k]);

			}

			printf("\n");

		}

		printf("\n");

	}

	cudaFree(mem_device.ptr);

}

EDIT: actually this only works if the x and y dimensions are the same. See here: http://forums.nvidia.com/index.php?showtopic=174233
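One way around that restriction (a sketch based on the code above, not taken from the linked thread) is to keep the three dimensions distinct, which makes it explicit that extent.width is the contiguous x direction in bytes, extent.height is the number of rows, and extent.depth the number of slices:

// sketch with distinct dimensions; not from the linked thread
#include <stdio.h>
#include <cuda_runtime.h>

#define NX 7	// elements per row (contiguous x direction)
#define NY 5	// rows per slice (y)
#define NZ 3	// slices (z)

__global__ void AddCoords(cudaPitchedPtr dev, cudaExtent extent)
{
	char*  base       = (char*)dev.ptr;
	size_t pitch      = dev.pitch;
	size_t slicePitch = pitch * extent.height;
	int x = threadIdx.x;	// 0..NX-1
	int y = blockIdx.x;	// 0..NY-1
	int z = blockIdx.y;	// 0..NZ-1
	float* row = (float*)(base + z * slicePitch + y * pitch);
	row[x] += x + y * 10 + z * 100;
}

int main(void)
{
	static float host[NZ][NY][NX] = {{{0}}};	// z slowest, x fastest, matching the extent below

	cudaExtent extent = make_cudaExtent(NX * sizeof(float), NY, NZ);	// width in BYTES
	cudaPitchedPtr dev;
	cudaMalloc3D(&dev, extent);

	cudaMemcpy3DParms p = { 0 };
	p.srcPtr = make_cudaPitchedPtr((void*)host, NX * sizeof(float), NX, NY);	// pitch, width (elements), height
	p.dstPtr = dev;
	p.extent = extent;
	p.kind   = cudaMemcpyHostToDevice;
	cudaMemcpy3D(&p);

	dim3 blocks_per_grid(NY, NZ);	// one block per (row, slice), NX threads per block
	AddCoords<<<blocks_per_grid, NX>>>(dev, extent);

	cudaMemcpy3DParms q = { 0 };
	q.srcPtr = dev;
	q.dstPtr = make_cudaPitchedPtr((void*)host, NX * sizeof(float), NX, NY);
	q.extent = extent;
	q.kind   = cudaMemcpyDeviceToHost;
	cudaMemcpy3D(&q);

	printf("host[2][3][4] = %f (expected 4 + 3*10 + 2*100 = 234)\n", host[2][3][4]);

	cudaFree(dev.ptr);
	return 0;
}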