Suq.*.b32 other than suq.widht.b32 and suq.height.b32 causes cudaError 801/500

Hi Everybody,

when I try to query surface properties other than suq.width.b32 and suq.height.b32, I get

  • cudaErrorNotSupported = 801 in CUDA Toolkit v. >= 11.3, and
  • cudaErrorNotFound = 500 in CUDA Toolkit v. <= 11.2

I obtain the same errors on different devices, i.e.

  • GeForce GTX 1050 Ti,
  • TITAN RTX,
  • Tesla V100-SXM2, and
  • A100-PCIE-40GB,

with different sub-versions of the 450 driver running on different GNU/Linux distributions.

Are these instructions supported just for particular devices?

Am I missing something? See the simplified reproducer below.

Thanks in advance!

#include<stdio.h>

// Simple copy kernel
__global__ void copyKernel(cudaSurfaceObject_t inputSurfObj,
                           cudaSurfaceObject_t outputSurfObj,
                           int width, int height) 
{
    // Calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x==0 && y==0)
    {
      int w = -1, h = -1, d = -1, c = -1, o = -1, s = -1, l = -1;
      asm("suq.width.b32  %0, [%1];" : "=r"(w) : "l"(inputSurfObj));
      asm("suq.height.b32  %0, [%1];" : "=r"(h) : "l"(inputSurfObj));

      // the followings are producing ERROR 801/500
//      asm("suq.depth.b32  %0, [%1];" : "=r"(d) : "l"(inputSurfObj));
//      asm("suq.channel_data_type.b32  %0, [%1];" : "=r"(c) : "l"(inputSurfObj));
//      asm("suq.channel_order.b32  %0, [%1];" : "=r"(o) : "l"(inputSurfObj));
//      asm("suq.array_size.b32  %0, [%1];" : "=r"(s) : "l"(inputSurfObj));
//      asm("suq.memory_layout.b32  %0, [%1];" : "=r"(l) : "l"(inputSurfObj));

      printf("Surface width=%i, height=%i, depth=%i, channel_data_type=%i, channel_order=%i, array_size=%i memory_layout=%i\n", w, h, d, c, o, s, l);
    }

    if (x < width && y < height) {
        uchar4 data;
        // Read from input surface
        surf2Dread(&data,  inputSurfObj, x * 4, y);
        // Write to output surface
        surf2Dwrite(data, outputSurfObj, x * 4, y);
    }
}

// Host code
int main()
{

    const size_t height = 16;
    const size_t width = 16;

    // Allocate and set some host data
    unsigned char *h_data =
        (unsigned char *)std::malloc(sizeof(unsigned char) * width * height * 4);
    for (size_t i = 0; i < height * width * 4; ++i)
        h_data[i] = (unsigned char) i;

    // Allocate CUDA arrays in device memory
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaArray_t cuInputArray;
    cudaMallocArray(&cuInputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);
    cudaArray_t cuOutputArray;
    cudaMallocArray(&cuOutputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);

    // Set pitch of the source (the width in memory in bytes of the 2D array
    // pointed to by src, including padding), we dont have any padding
    const size_t spitch = 4 * width * sizeof(unsigned char);
    // Copy data located at address h_data in host memory to device memory
    cudaMemcpy2DToArray(cuInputArray, 0, 0, h_data, spitch,
                        4 * width * sizeof(unsigned char), height,
                        cudaMemcpyHostToDevice);

    // Specify surface
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;

    // Create the surface objects
    resDesc.res.array.array = cuInputArray;
    cudaSurfaceObject_t inputSurfObj = 0;
    cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
    resDesc.res.array.array = cuOutputArray;
    cudaSurfaceObject_t outputSurfObj = 0;
    cudaCreateSurfaceObject(&outputSurfObj, &resDesc);

    // Invoke kernel
    dim3 threadsperBlock(16, 16);
    dim3 numBlocks((width + threadsperBlock.x - 1) / threadsperBlock.x,
                    (height + threadsperBlock.y - 1) / threadsperBlock.y);
    copyKernel<<<numBlocks, threadsperBlock>>>(inputSurfObj, outputSurfObj, width,
                                                height);

    cudaDeviceSynchronize();

    // check for error
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess)
    {
      // print the CUDA error message and exit
      printf("CUDA ERROR %i: %s\n", error, cudaGetErrorString(error));
      exit(-1);
    }

    unsigned char *h_data_out =
        (unsigned char *)std::malloc(sizeof(unsigned char) * width * height * 4);
 
    // Copy data from device back to host
    cudaMemcpy2DFromArray(h_data_out, spitch, cuOutputArray, 0, 0,
                            4 * width * sizeof(unsigned char), height,
                            cudaMemcpyDeviceToHost);
    int err = 0;

    for (size_t i = 0; i < height * width * 4; ++i)
        err += (h_data_out[i] != (unsigned char) i);

    printf("Errors: %i\n", err);

    // Destroy surface objects
    cudaDestroySurfaceObject(inputSurfObj);
    cudaDestroySurfaceObject(outputSurfObj);

    // Free device memory
    cudaFreeArray(cuInputArray);
    cudaFreeArray(cuOutputArray);

    // Free host memory
    free(h_data);

    return 0;
}

Thank you for your prompt reply!

In my case the operand is a u64 representing an opaque surface object, I thought it was supported since the documentation states

Operand a is a .surfref variable or a .u64 register.

OK my suggestion is to file a bug.