Hello,
I noticed that a cudaArray_t seems to use a different memory layout than a plain linear (row-major) buffer. For example:
// On cpu side, create matrix[width][height][depth] and assign matrix[3][5][7] = 1
int width=10, height=20, depth=30;
size_t size = width* height * depth * sizeof(float);
float* mat = (float*)malloc(size);
for (int i = 0; i < width * height * depth; i++) {
    mat[i] = 2.0f;
}
mat[3 * height * depth + 5 * depth + 7] = 1.0f;
The code above should create a matrix of shape [10, 20, 30] filled with 2s and assign 1 to mat[3][5][7] only.
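Just to make my mental model explicit, this is how I think of the flat host layout (a tiny helper with my own naming, not from any CUDA sample; depth is the fastest-varying dimension here):
// My own helper, spelling out the row-major [width][height][depth] layout:
// depth varies fastest, then height, then width.
inline size_t hostIndex(int i, int j, int k, int height, int depth) {
    return ((size_t)i * height + j) * depth + k;
}
// mat[hostIndex(3, 5, 7, height, depth)] = 1.0f;  // equivalent to the assignment above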
After that, I copy the data to a CUDA array and create a texture object and a surface object from it:
cudaExtent extent = make_cudaExtent(width, height, depth);
cudaArray_t cuArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&cuArray, &channelDesc, extent, cudaArraySurfaceLoadStore);
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr(mat, width * sizeof(float), width, height);
copyParams.dstArray = cuArray;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams);
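For completeness, this is how I check return codes around these calls (the CHECK macro is just my own shorthand, not part of the CUDA API):
// My own shorthand for checking CUDA return codes.
#define CHECK(call) do { cudaError_t e = (call); if (e != cudaSuccess) \
    printf("CUDA error: %s (%s:%d)\n", cudaGetErrorString(e), __FILE__, __LINE__); } while (0)
// e.g. CHECK(cudaMemcpy3D(&copyParams));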
cudaResourceDesc ResDesc;
memset(&ResDesc, 0, sizeof(ResDesc));
ResDesc.resType = cudaResourceTypeArray;
ResDesc.res.array.array = cuArray;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &ResDesc, &texDesc, NULL);
cudaSurfaceObject_t surfObj;
cudaCreateSurfaceObject(&surfObj, &ResDesc);
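As a sanity check on the dimension question, I believe the array's own extent can be queried like this (debugging only; the variable names are mine):
// Debug-only: ask the array what extent it was actually created with.
cudaChannelFormatDesc checkDesc;
cudaExtent checkExtent;
unsigned int checkFlags;
cudaArrayGetInfo(&checkDesc, &checkExtent, &checkFlags, cuArray);
printf("cuArray extent: width=%zu height=%zu depth=%zu\n",
       checkExtent.width, checkExtent.height, checkExtent.depth);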
Here is my kernel function, which prints the value at mat[3][5][7], followed by my launch procedure:
__global__ void readValue(cudaTextureObject_t tex, cudaSurfaceObject_t surfObj, int width, int height, int depth)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    if (x == 3 && y == 5 && z == 7)
    {
        printf("Value at (%d, %d, %d): %f\n", x, y, z, tex3D<float>(tex, z, y, x));
    }
}
...
dim3 blockSize(8, 8, 8);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, (depth + blockSize.z - 1) / blockSize.z);
readValue<<<gridSize, blockSize>>>(tex, surfObj, width, height, depth);
cudaDeviceSynchronize();
...
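For the surface-write side, this is roughly what I intend to do; the coordinate order and the byte offset on x are my guesses and may be exactly where I am going wrong:
__global__ void writeValue(cudaSurfaceObject_t surfObj)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    if (x == 3 && y == 5 && z == 7)
    {
        // surf3Dwrite expects the x coordinate in bytes; the (x, y, z)
        // ordering here is my assumption, not something I have verified.
        surf3Dwrite(3.0f, surfObj, x * sizeof(float), y, z);
    }
}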
However, I am seeing 2.0 instead of the expected 1.0 as the result from the kernel. I have also noticed that many code examples warn that I should be very careful about how the dimensions of the CUDA array are ordered. Could you explain why the CUDA array needs a different memory layout, how I should read/write the data with a texture fetch / surface write for a given coordinate (x, y, z), and how this affects the thread block / grid setup? Thank you so much!