Does CUDA support 16-bit surfaces?

Can we use 16-bit surfaces in CUDA kernels on the Jetson Nano?

Reading 8 and 16-bit (CU_AD_FORMAT_UNSIGNED_INT8 / 16) textures in a CUDA kernel seems to work fine.
Reading 8-bit surfaces also works fine, but I cannot seem to read 16-bit surfaces without receiving a kernel failure.

I only see an 8-bit surface example in the multimedia_api, so I figured I would ask.

In this example we fill a host array with data, upload it to a surface, and read it in a kernel (with a single kernel thread).

Source and 16-bit build of the TestCuda binary are here:
https://github.com/rspruyt/JetsonNanoCudaCrashTest/tree/surfaceTest

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdint.h>	// uint8_t / uint16_t
#include <stdio.h>
#include <stdlib.h>	// calloc
#include <string.h>	// memset

#define USE_16_BIT	// This works for 8 bit but not 16 (comment this out for 8-bit)

#ifdef USE_16_BIT
	typedef uint16_t T;	// 16-bit
#else
	typedef uint8_t T;	// 8-bit
#endif

#define WIDTH 3
#define HEIGHT 3

__global__ void testKernel(CUsurfObject surf) {
	for (int i=0; i < HEIGHT; ++i) { // y
		for (int j=0; j < WIDTH; ++j) { // x
			T val = surf2Dread<T>(surf, j, i, cudaBoundaryModeClamp);
			printf("x=%d y=%d, surf=%llu val=%d\n", j, i, surf, val);
		}
	}
}

int main(int argc, char** argv) {
	CUarray array = 0;
	CUsurfObject surf = 0;
	CUDA_ARRAY_DESCRIPTOR arrDesc;
	CUDA_RESOURCE_DESC resDesc;
	
	// clear the descriptors
	memset(&arrDesc, 0, sizeof(arrDesc));
	memset(&resDesc, 0, sizeof(resDesc));
	
	// init CUDA
	cudaFree(NULL);
	
	// create an 8 or 16 bit array
	arrDesc.Format = sizeof(T) * 8 == 8 ? CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16;
	arrDesc.Width = WIDTH;
	arrDesc.Height = HEIGHT;
	arrDesc.NumChannels = 1;
	CUresult result = cuArrayCreate(&array, &arrDesc);
	if (result != CUDA_SUCCESS) {
		printf("Failed to create CUDA Array\n");
		return -1;
	}

	// create a surface from the array
	resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	resDesc.res.array.hArray = array;
	result = cuSurfObjectCreate(&surf, &resDesc);
	if (result != CUDA_SUCCESS) {
		printf("Failed to create Surface\n");
		return -1;
	}
	printf("\nCreated surface %llu\n\n", surf);

	// create some host data to copy to the surface
	T* data = (T*)calloc(WIDTH * HEIGHT, sizeof(T));
	for (int i = 0; i < WIDTH * HEIGHT; ++i) {
		data[i] = i;
		printf("data[%d] = %d\n", i, data[i]);
	}

	// copy data from host to surface - even if we omit this copy, the kernel still fails for 16-bit
	CUDA_MEMCPY2D copyParam;
	memset(&copyParam, 0, sizeof(copyParam));
	int rowBytes = WIDTH * sizeof(T);
	copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	copyParam.dstArray = array;
	copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
	copyParam.srcHost = data;
	copyParam.srcPitch = rowBytes;
	copyParam.WidthInBytes = rowBytes;
	copyParam.Height = HEIGHT;
	
	printf("\nUploading Data to Surface\n");
	result = cuMemcpy2D(&copyParam);
	if (result != CUDA_SUCCESS) {
		printf("Failed to copy to surface\n");
		return -1;
	}

	// run the kernel
	testKernel<<<1,1>>>(surf);
	cudaError_t err = cudaDeviceSynchronize();
	if (err == cudaSuccess) {
		printf("\nSuccess!\n");
	} else {
		printf("Kernel failed: %d\n", err);
		return -1;
	}

	// clean up
	cuSurfObjectDestroy(surf);
	cuArrayDestroy(array);
	free(data);
	return 0;
}
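
For reference, a build command along these lines should work (assuming the default CUDA toolchain on the Nano; the cu* driver-API calls need an explicit -lcuda):

nvcc TestCuda.cu -o TestCuda -lcuda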

If we comment out:

// #define USE_16_BIT

8-bit works fine:

nvidia@nano-randy:~/JetsonNanoCudaCrashTest$ ./TestCuda

Created surface 1

data[0] = 0
data[1] = 1
data[2] = 2
data[3] = 3
data[4] = 4
data[5] = 5
data[6] = 6
data[7] = 7
data[8] = 8

Uploading Data to Surface
x=0 y=0, surf=1 val=0
x=1 y=0, surf=1 val=1
x=2 y=0, surf=1 val=2
x=0 y=1, surf=1 val=3
x=1 y=1, surf=1 val=4
x=2 y=1, surf=1 val=5
x=0 y=2, surf=1 val=6
x=1 y=2, surf=1 val=7
x=2 y=2, surf=1 val=8

Success!

Now we uncomment

#define USE_16_BIT

16-bit fails:

Created surface 1

data[0] = 0
data[1] = 1
data[2] = 2
data[3] = 3
data[4] = 4
data[5] = 5
data[6] = 6
data[7] = 7
data[8] = 8

Uploading Data to Surface
x=0 y=0, surf=1 val=0
x=1 y=0, surf=1 val=0  // <--- expected a value of 1 here, although the kernel failed, so the results are invalid anyway
x=2 y=0, surf=1 val=1
x=0 y=1, surf=1 val=3
x=1 y=1, surf=1 val=3
x=2 y=1, surf=1 val=4
x=0 y=2, surf=1 val=6
x=1 y=2, surf=1 val=6
x=2 y=2, surf=1 val=7
Kernel failed: 4

Perhaps this is a simple programming mistake, or a parameter is missing when trying to use a 16-bit surface?

Hi,

Thanks for your feedback.
We will check this and update with more information later.

Hi,

Based on the document below, the x-coordinate for surface reads is measured in bytes:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#surface-memory

Please update your sample with this change:

diff --git a/TestCuda.cu b/TestCuda.cu
index 958d3df..0198d4c 100755
--- a/TestCuda.cu
+++ b/TestCuda.cu
@@ -18,7 +18,7 @@
 __global__ void testKernel(CUsurfObject surf) {
        for (int i=0; i < HEIGHT; ++i) { // y
                for (int j=0; j < WIDTH; ++j) { // x
-                       T val = surf2Dread<T>(surf, j, i, cudaBoundaryModeClamp);
+                       T val = surf2Dread<T>(surf, j*sizeof(T), i, cudaBoundaryModeClamp);
                        printf("x=%d y=%d, surf=%llu val=%d\n", j, i, surf, val);
                }
        }

Thanks.

Thanks Aasta!

That would explain it, since we were migrating from 16-bit textures to surfaces.

I didn’t expect this, which is my mistake. Details can be found here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html

“Unlike texture memory, surface memory uses byte addressing. This means that the x-coordinate used to access a texture element via texture functions needs to be multiplied by the byte size of the element to access the same element via a surface function. For example, the element at texture coordinate x of a one-dimensional floating-point CUDA array bound to a texture reference texRef and a surface reference surfRef is read using tex1d(texRef, x) via texRef, but surf1Dread(surfRef, 4*x) via surfRef. Similarly, the element at texture coordinate x and y of a two-dimensional floating-point CUDA array bound to a texture reference texRef and a surface reference surfRef is accessed using tex2d(texRef, x, y) via texRef, but surf2Dread(surfRef, 4*x, y) via surfRef (the byte offset of the y-coordinate is internally calculated from the underlying line pitch of the CUDA array).”
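
For anyone else making the same migration, here is a minimal sketch of the corrected kernel from the test program above; the only change from my original is scaling the x-coordinate by the element size, per AastaLLL's diff:

__global__ void testKernel(CUsurfObject surf) {
	for (int i=0; i < HEIGHT; ++i) { // y
		for (int j=0; j < WIDTH; ++j) { // x, in elements
			// surfaces are byte-addressed in x, so scale by sizeof(T);
			// y stays in rows (the line pitch is handled internally)
			T val = surf2Dread<T>(surf, j * sizeof(T), i, cudaBoundaryModeClamp);
			printf("x=%d y=%d, surf=%llu val=%d\n", j, i, surf, val);
		}
	}
}

With this change the 8-bit build is unaffected (sizeof(T) is 1), and the 16-bit build should read the same texels the texture path did.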