I posted the code below previously, as a quick demonstration of to how to use FP16 data in textures. I just tried it again, in CUDA 8.0 with an sm_50 target, and it works as expected, the output is as follows on my Windows 7 system:
0.00000000e+00 5.96046448e-08 1.19209290e-07
1.00000000e+00 1.00097656e+00 1.00195313e+00
2.00000000e+00 2.00195313e+00 2.00390625e+00
inf nan nan
I have yet to try CUDA 9, so cannot tell you whether the code still works with it, but I assume it does, because otherwise backward compatibility would be destroyed. Maybe I misunderstood your question and you are asking how to assign the texture data to a ‘half’ variable rather than a ‘float’, as I have done in the code below?
#include <stdlib.h>
#include <stdio.h>
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
texture<float, 2> tex;
__global__ void kernel (int m, int n)
{
float val;
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
val = tex2D (tex, col + 0.5f, row + 0.5f);
printf ("% 15.8e ", val);
}
printf ("\n");
}
}
int main (void)
{
int m = 4; // height = #rows
int n = 3; // width = #columns
size_t pitch, tex_ofs;
unsigned short arr[4][3]= {{0x0000,0x0001,0x0002}, // zero, denormals
{0x3c00,0x3c01,0x3c02}, // 1.0 + eps
{0x4000,0x4001,0x4002}, // 2.0 + eps
{0x7c00,0x7c01,0x7c02}}; // infinity, NaNs
unsigned short *arr_d = 0;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf();
CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &channelDesc,
n, m, pitch));
if (tex_ofs !=0) {
printf ("tex_ofs = %zu\n", tex_ofs);
return EXIT_FAILURE;
}
kernel<<<1,1>>>(m, n);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL (cudaDeviceSynchronize());
CUDA_SAFE_CALL (cudaFree (arr_d));
return EXIT_SUCCESS;
}