Hi Everybody,
when I try to query surface properties other than suq.width.b32 and suq.height.b32, I get
- cudaErrorNotSupported = 801 in CUDA Toolkit v. >= 11.3, and
- cudaErrorNotFound = 500 in CUDA Toolkit v. <= 11.2
I obtain the same errors on different devices, i.e.
- GeForce GTX 1050 Ti,
- TITAN RTX,
- Tesla V100-SXM2, and
- A100-PCIE-40GB,
with different sub-versions of the 450 driver running on different GNU/Linux distributions.
Are these instructions supported just for particular devices?
Am I missing something? See the simplified reproducer below.
Thanks in advance!
#include<stdio.h>
// Simple copy kernel
__global__ void copyKernel(cudaSurfaceObject_t inputSurfObj,
cudaSurfaceObject_t outputSurfObj,
int width, int height)
{
// Calculate surface coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x==0 && y==0)
{
int w = -1, h = -1, d = -1, c = -1, o = -1, s = -1, l = -1;
asm("suq.width.b32 %0, [%1];" : "=r"(w) : "l"(inputSurfObj));
asm("suq.height.b32 %0, [%1];" : "=r"(h) : "l"(inputSurfObj));
// the followings are producing ERROR 801/500
// asm("suq.depth.b32 %0, [%1];" : "=r"(d) : "l"(inputSurfObj));
// asm("suq.channel_data_type.b32 %0, [%1];" : "=r"(c) : "l"(inputSurfObj));
// asm("suq.channel_order.b32 %0, [%1];" : "=r"(o) : "l"(inputSurfObj));
// asm("suq.array_size.b32 %0, [%1];" : "=r"(s) : "l"(inputSurfObj));
// asm("suq.memory_layout.b32 %0, [%1];" : "=r"(l) : "l"(inputSurfObj));
printf("Surface width=%i, height=%i, depth=%i, channel_data_type=%i, channel_order=%i, array_size=%i memory_layout=%i\n", w, h, d, c, o, s, l);
}
if (x < width && y < height) {
uchar4 data;
// Read from input surface
surf2Dread(&data, inputSurfObj, x * 4, y);
// Write to output surface
surf2Dwrite(data, outputSurfObj, x * 4, y);
}
}
// Host code
int main()
{
const size_t height = 16;
const size_t width = 16;
// Allocate and set some host data
unsigned char *h_data =
(unsigned char *)std::malloc(sizeof(unsigned char) * width * height * 4);
for (size_t i = 0; i < height * width * 4; ++i)
h_data[i] = (unsigned char) i;
// Allocate CUDA arrays in device memory
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
cudaArray_t cuInputArray;
cudaMallocArray(&cuInputArray, &channelDesc, width, height,
cudaArraySurfaceLoadStore);
cudaArray_t cuOutputArray;
cudaMallocArray(&cuOutputArray, &channelDesc, width, height,
cudaArraySurfaceLoadStore);
// Set pitch of the source (the width in memory in bytes of the 2D array
// pointed to by src, including padding), we dont have any padding
const size_t spitch = 4 * width * sizeof(unsigned char);
// Copy data located at address h_data in host memory to device memory
cudaMemcpy2DToArray(cuInputArray, 0, 0, h_data, spitch,
4 * width * sizeof(unsigned char), height,
cudaMemcpyHostToDevice);
// Specify surface
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
// Create the surface objects
resDesc.res.array.array = cuInputArray;
cudaSurfaceObject_t inputSurfObj = 0;
cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
resDesc.res.array.array = cuOutputArray;
cudaSurfaceObject_t outputSurfObj = 0;
cudaCreateSurfaceObject(&outputSurfObj, &resDesc);
// Invoke kernel
dim3 threadsperBlock(16, 16);
dim3 numBlocks((width + threadsperBlock.x - 1) / threadsperBlock.x,
(height + threadsperBlock.y - 1) / threadsperBlock.y);
copyKernel<<<numBlocks, threadsperBlock>>>(inputSurfObj, outputSurfObj, width,
height);
cudaDeviceSynchronize();
// check for error
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
// print the CUDA error message and exit
printf("CUDA ERROR %i: %s\n", error, cudaGetErrorString(error));
exit(-1);
}
unsigned char *h_data_out =
(unsigned char *)std::malloc(sizeof(unsigned char) * width * height * 4);
// Copy data from device back to host
cudaMemcpy2DFromArray(h_data_out, spitch, cuOutputArray, 0, 0,
4 * width * sizeof(unsigned char), height,
cudaMemcpyDeviceToHost);
int err = 0;
for (size_t i = 0; i < height * width * 4; ++i)
err += (h_data_out[i] != (unsigned char) i);
printf("Errors: %i\n", err);
// Destroy surface objects
cudaDestroySurfaceObject(inputSurfObj);
cudaDestroySurfaceObject(outputSurfObj);
// Free device memory
cudaFreeArray(cuInputArray);
cudaFreeArray(cuOutputArray);
// Free host memory
free(h_data);
return 0;
}