CUDA array memory layout question

Hello,

I noticed that a cudaArray_t object seems to have a different memory layout than I expect. For example:

// On cpu side, create matrix[width][height][depth] and assign matrix[3][5][7] = 1
int width=10, height=20, depth=30;
size_t size = width* height * depth * sizeof(float);
float* mat = (float*)malloc(size);
for (int i = 0; i < width * height * depth; i++) {
    mat[i] = 2.0f;
}
mat[3 * height * depth + 5 * depth + 7] = 1.0f;

The code above should create a matrix of shape [10, 20, 30] filled with 2s and assign 1 only to mat[3][5][7].

After that, I copy the data to a CUDA array and create a texture object and a surface object from it:

cudaExtent extent = make_cudaExtent(width, height, depth);
cudaArray_t cuArray;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&cuArray, &channelDesc, extent);
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr(mat, width * sizeof(float), width, height);
copyParams.dstArray = cuArray;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyHostToDevice;

cudaMemcpy3D(&copyParams);

cudaResourceDesc ResDesc;
memset(&ResDesc, 0, sizeof(ResDesc));
ResDesc.resType = cudaResourceTypeArray;
ResDesc.res.array.array = cuArray;

cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModePoint;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 0;
    
cudaTextureObject_t tex;
cudaCreateTextureObject(&tex, &ResDesc, &texDesc, NULL);

cudaSurfaceObject_t surfObj;
cudaCreateSurfaceObject(&surfObj, &ResDesc);

My kernel function to print the value at mat[3][5][7] and my launch procedure:

__global__ void readValue(cudaTextureObject_t tex, cudaSurfaceObject_t surfObj, int width, int height, int depth)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;

    if (x == 3 && y == 5 && z == 7)
    {
        printf("Value at (%d, %d, %d): %f\n", x, y, z, tex3D<float>(tex, z, y, x));
    }
}

...
dim3 blockSize(8, 8, 8);
dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, (depth + blockSize.z - 1) / blockSize.z);
readValue<<<gridSize, blockSize>>>(tex, surfObj, width, height, depth);
cudaDeviceSynchronize(); 
...

However, the kernel prints 2.0 instead of the expected 1.0. I have also noticed that many code examples warn about being careful with the dimension ordering of a CUDA array. I would like an explanation of why a different memory layout is needed, how I should read/write the data with a texture fetch / surface write given a coordinate set (x, y, z), and what impact this has on the thread block / grid setup. Thank you so much!

In C and C++, the storage order of a multiply-subscripted array is such that the rapidly varying index, as we move linearly through memory, is the last index.

In CUDA C++, for a texture, the rapidly varying index is x.
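
To make that concrete with the extents from your code, here is a minimal host-only sketch (plain C++, no CUDA calls; the variable names just mirror your post) showing that the last C subscript and the texture x coordinate both play the role of the fastest-varying index:

#include <cstdio>

int main() {
    const int width = 10, height = 20;      // same extents as the post (depth = 30)
    static float mat[30][20][10];           // i.e. mat[depth][height][width]

    int x = 7, y = 5, z = 3;
    // Row-major rule: the last subscript (x) varies fastest in memory, so
    // mat[z][y][x] sits at linear offset (z * height + y) * width + x.
    long from_subscripts = (long)(&mat[z][y][x] - &mat[0][0][0]);
    long computed        = (long)(z * height + y) * width + x;
    printf("%ld %ld\n", from_subscripts, computed);   // both print 657
    // The matching texture fetch for that same element is tex3D<float>(tex, x, y, z),
    // because x is the rapidly varying coordinate of a CUDA texture.
    return 0;
}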

When I see this:

mat[3 * height * depth + 5 * depth + 7] = 1.0f;

the code I would expect to see is:

mat[3 * width * height + 5 * width + 7] = 1.0f;

(I am associating “width” with the number of elements in the last subscript, which is the distance, in elements, of a step of 1 in the second-to-last subscript, and “height” with the number of positions in the second-to-last subscript, which likewise is the step factor for the third-to-last subscript. I find it convenient to associate x with the last dimension/subscript, y with the second-to-last, and z with the third-to-last.)

And the way I would look for that in the kernel is:

if (x == 7 && y == 5 && z == 3)
{
    printf("Value at (%d, %d, %d): %f\n", z y, x, tex3D<float>(tex, x, y, z));
}

That methodology seems to work for me:

# cat t207.cu
#include <cstdio>
__global__ void readValue(cudaTextureObject_t tex, cudaSurfaceObject_t surfObj, int width, int height, int depth)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;

    if (x == 7 && y == 5 && z == 3)
    {
        printf("Value at mat[%d][%d][%d]: %f\n", z, y, x, tex3D<float>(tex, x, y, z));
    }
}


int main(){

  int width=10, height=20, depth=30;
  size_t size = width* height * depth * sizeof(float);
  float* mat = (float*)malloc(size);
  for (int i = 0; i < width * height * depth; i++) {
    mat[i] = 2.0f;
  }
  mat[3 * height * width + 5 * width + 7] = 1.0f;
  cudaExtent extent = make_cudaExtent(width, height, depth);
  cudaArray_t cuArray;
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
  cudaMalloc3DArray(&cuArray, &channelDesc, extent);
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr = make_cudaPitchedPtr(mat, width * sizeof(float), width, height);
  copyParams.dstArray = cuArray;
  copyParams.extent = extent;
  copyParams.kind = cudaMemcpyHostToDevice;

  cudaMemcpy3D(&copyParams);

  cudaResourceDesc ResDesc;
  memset(&ResDesc, 0, sizeof(ResDesc));
  ResDesc.resType = cudaResourceTypeArray;
  ResDesc.res.array.array = cuArray;

  cudaTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = cudaAddressModeClamp;
  texDesc.addressMode[1] = cudaAddressModeClamp;
  texDesc.addressMode[2] = cudaAddressModeClamp;
  texDesc.filterMode = cudaFilterModePoint;
  texDesc.readMode = cudaReadModeElementType;
  texDesc.normalizedCoords = 0;

  cudaTextureObject_t tex;
  cudaCreateTextureObject(&tex, &ResDesc, &texDesc, NULL);

  cudaSurfaceObject_t surfObj;
  cudaCreateSurfaceObject(&surfObj, &ResDesc);

  dim3 blockSize(8, 8, 8);
  dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, (depth + blockSize.z - 1) / blockSize.z);
  readValue<<<gridSize, blockSize>>>(tex, surfObj, width, height, depth);
  cudaDeviceSynchronize();
}
# nvcc -o t207 t207.cu
# compute-sanitizer ./t207
========= COMPUTE-SANITIZER
Value at mat[3][5][7]: 1.000000
========= ERROR SUMMARY: 0 errors
#
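
Regarding the surface-write part of your question: the coordinate convention is the same (x is the fast axis), with two wrinkles. First, surf3Dwrite takes the x coordinate in bytes rather than elements. Second, the array must be created with the cudaArraySurfaceLoadStore flag to be usable for surface stores, which the code above does not do. A minimal kernel sketch (the writeValue name and the value 3.0f are just placeholders for illustration):

// Sketch only: assumes the array was allocated for surface load/store, e.g.
//   cudaMalloc3DArray(&cuArray, &channelDesc, extent, cudaArraySurfaceLoadStore);
__global__ void writeValue(cudaSurfaceObject_t surfObj, int width, int height, int depth)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;   // runs over width  (fast axis)
    int y = blockIdx.y * blockDim.y + threadIdx.y;   // runs over height
    int z = blockIdx.z * blockDim.z + threadIdx.z;   // runs over depth

    if (x < width && y < height && z < depth)
    {
        // surf3Dwrite takes the x coordinate in bytes; y and z are in elements.
        surf3Dwrite(3.0f, surfObj, x * sizeof(float), y, z);
    }
}

Used with the same blockSize/gridSize as above (x sized by width, y by height, z by depth), the (x, y, z) you write with here is the same (x, y, z) you would later pass to tex3D to read the element back, so the thread block / grid setup does not need to change.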

Does this mean my matrix is actually of shape [depth, height, width] instead of shape [width, height, depth]? If so, everything makes sense. Thanks.

“Shape [depth, height, width]” is an ordering convention that you have not explicitly specified. That is, I have no idea what the significance of the first position is precisely, nor of the 2nd or 3rd. With respect to C++, I can explicitly specify the ordering associated with syntactically correct array subscripting of the form [p][q][r]. The language defines how that works.

But you have now proposed:

[p, q, r]

I don’t know how that works or what it means. As far as I know, C++ makes no formal syntax definition for that construct, nor for the use of the word “shape”. Attempting to use that construct in C++ will give you no sensible utility of any kind. Nevertheless, I am convinced I have given you enough information to demonstrate how to associate the memory order of a multiply-subscripted array in C++ with the order you should expect when doing texturing operations.
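
For concreteness, the ordering the language does define can be checked directly; a minimal host-only sketch (the dimensions D, H, W here are arbitrary illustration values, unrelated to your extents):

#include <cassert>

int main() {
    // For a C++ array T a[D][H][W], a[p][q][r] is the element at linear offset
    // (p*H + q)*W + r from a[0][0][0]; the last subscript varies fastest.
    constexpr int D = 4, H = 3, W = 2;
    static float a[D][H][W];
    for (int p = 0; p < D; ++p)
        for (int q = 0; q < H; ++q)
            for (int r = 0; r < W; ++r)
                assert(&a[p][q][r] - &a[0][0][0] == (p * H + q) * W + r);
    return 0;
}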
