How create channel format for 2-D texture binding to 'half-float' image ?

I would like to know how to correctly create a channel format for 1 2-D texture with data type ‘half’ (16-bit float). The Programming guide mentions a function ‘cudaCreateChannelDescHalf’, but at least in cuda toolkit 5.0 this function seems to be not existing.

Do I have to use
cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindFloat);
or
cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsigned)
?

Note the 2-D texture shall be bound to an existing 2-D pitch-linear image (datatype=half, one channel only).

reposted for Cuda Toolkit 9.1

For Cuda Toolkit < 7.5, I was representing half-floats on the GPU with a ‘uint16_t’ and was using ‘cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsigned)’ in order to create the channel descriptor for the texture object. That worked fine.

For Cuda Toolkit >= 7.5, I want to represent half-floats on the GPU with the ‘half’ datatype from the Cuda Toolkit which is available since this toolkit version (header file ‘cuda_fp16.h’).

Do I have to use ‘cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindFloat)’ in order to create the channel descriptor for the texture object ? I suppose so - but is it supported by the GPU ? As ‘half’ provides overloaded casting operators to all native datatypes, I supose I don’t have to use a manual ‘unwrapping’, or ?

Oops, looks like texture object do not work with ‘half’ as datatype (Cuda Toolkit 9.1, CentOS).

I get a compile error for the templatized ‘tex2D’ function …

/usr/local/cuda/include/texture_indirect_functions.h(155): error: no instance of overloaded function "tex2D" matches the argument list
            argument types are: (half *, cudaTextureObject_t, float, float)
          detected during:
            instantiation of "T tex2D<T>(cudaTextureObject_t, float, float) [with T=half]" 
/home/centos/project/common/libs/Cuda/CudaCVCoreNG1.0/include/Cusu/Acc/Getter.h(57): here

So using ‘half’ data type natively in texture object seems not to work with current Toolkit (9.1) - the tex2D<…> compile error seems to prove it.

So I will revert back to the strategy employed for older toolkits (< 7.5), by representing ‘half’ as ‘unsigned short’ for which a texture object can be found to.

But, this makes problems as the function ‘__float2half’ returns now ‘__half’ - instead of ‘unsigned short’ as done previously … Same for ‘__half2float’ accepts now a ‘__half’ instead of a ‘unsigned short’.

As __half seems to be implemented as a unsigned short (see ‘cuda_fp16.hpp’, declaration of struct ‘__half’), how can I can get the internal ‘unsigned short’ representation ? Some sort of type punning via an union ? Or deriving from struct ‘half’ (so that I can access the protected member ‘unsigned short __x’ ? Deriving seems to be the best option.

I posted the code below previously, as a quick demonstration of to how to use FP16 data in textures. I just tried it again, in CUDA 8.0 with an sm_50 target, and it works as expected, the output is as follows on my Windows 7 system:

0.00000000e+00    5.96046448e-08    1.19209290e-07
 1.00000000e+00    1.00097656e+00    1.00195313e+00
 2.00000000e+00    2.00195313e+00    2.00390625e+00
            inf               nan               nan

I have yet to try CUDA 9, so cannot tell you whether the code still works with it, but I assume it does, because otherwise backward compatibility would be destroyed. Maybe I misunderstood your question and you are asking how to assign the texture data to a ‘half’ variable rather than a ‘float’, as I have done in the code below?

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                            \
 do {                                                                \
    cudaError_t err = call;                                             \
    if (cudaSuccess != err) {                                           \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__, cudaGetErrorString(err) );         \
        exit(EXIT_FAILURE);                                             \
        }                                                               \
 } while (0)
 
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                            \
    do {                                                                \
        /* Check synchronous errors, i.e. pre-launch */                 \
        cudaError_t err = cudaGetLastError();                           \
        if (cudaSuccess != err) {                                       \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString(err) );     \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
        /* Check asynchronous errors, i.e. kernel failed (ULF) */       \
        err = cudaThreadSynchronize();                                  \
        if (cudaSuccess != err) {                                       \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                     __FILE__, __LINE__, cudaGetErrorString( err) );    \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)

texture<float, 2> tex;

__global__ void kernel (int m, int n) 
{
    float val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = tex2D (tex, col + 0.5f, row + 0.5f);
            printf ("% 15.8e   ", val);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    unsigned short arr[4][3]= {{0x0000,0x0001,0x0002},  // zero, denormals
                               {0x3c00,0x3c01,0x3c02},  // 1.0 + eps
                               {0x4000,0x4001,0x4002},  // 2.0 + eps
                               {0x7c00,0x7c01,0x7c02}}; // infinity, NaNs
    unsigned short *arr_d = 0;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf(); 
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    kernel<<<1,1>>>(m, n);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}

Your code should work. But I use texture objects, not teture references. The problem is that the “tex2D(…” statement gives a compiler error (as stated above) - so setting ‘half’ as T is not working. Maybe I should use “tex2D(…” instead. But AFAIK, in order to read a pixel of type T, one should use “tex2D(…”

Sorry, I must have overlooked that.

It seems the documentation doesn’t provide sufficient information? If so, consider filing an enhancement request to get the documentation improved.