Reading R8G8B8A8 texture using tex2D() causes strange result.

I fixed the problem, although this solution (IMHO) looks quite ugly.
This version works:

texture<uchar4, 2, cudaReadModeNormalizedFloat> inTex;

extern “C” global

void cuda_main(unsigned char surface, int width, int height, size_t pitch)
{
int x = blockIdx.x
blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned char *pixel;

if (x >= width || y >= height) 
	return;
pixel = (surface + y*pitch) + 4 * x;

[b]float4[/b] c = tex2D(inTex, ((float)x / (float)width), ((float)y / (float)height));

pixel[0] = c.x[b] * 255.0[/b];
pixel[1] = c.y[b] * 255.0[/b];
pixel[2] = c.z [b]* 255.0[/b];
pixel[3] = c.w[b] * 255.0[/b];

}

Perhaps exist some solution without double conversion. I still suspect that this is a bug in CUDA.

I see no bug here. The texture is a ‘uchar’ texture. However, to apply interpolation, ‘cudaReadModeNormalizedFloat’ must be used, which means that the data returned by the texture lookup will be a ‘float’ in the interval [0,1]. To get back the original values from the texture, one therefore needs to “unnormalize” by multiplying with the appropriate scale factor, which is 255 in the case of ‘uchar’ data. This is clear from my example code, although I did not specifically call attention to it in a comment, mistakenly thinking it would be obvious.

Why is there a requirement to use ‘cudaReadModeNormalizedFloat’? Presumably because the interpolator hardware built into the texture units uses simple fixed-point computation with an 1.8 format (see appendix G.2 of the CUDA Programming Guide), and therefore cannot provide dynamic scaling as in an equivalent floating-point computation.

A higher-precision alternative to texture-based interpolation (which has only a minor performance impact in many cases) is to access the texture data point-wise, then interpolate the resulting fp32 data manually, e.g. using two FMA operations for the 1D case: [url]https://devblogs.nvidia.com/lerp-faster-cuda/[/url]

Note that 255.0 is a double-precision constant, forcing double-precision computation. If pixel is not of type ‘double’, it would be better to use 255.0f here.

I’m not an expert in the internal structure of the GUI, but it seems to me that using “cudaReadModeElementType” implies that the return value will be in the original format, in my case it’s integer, 8 bits. Instead, some garbage comes back. This flag simply does not work in case of integer textures, but a float texture does not need to be converted at all, and in this case cudaReadModeNormalizedFloat does not make sense. I do not see any logic here.

If I use cudaReadModeNormalizedFloat, then everything works as you said and you are absolutely right. Thank you very much for the help, I just did not know about the existence of this flag (I work with CUDA only two weeks).

I am not sure what the issue is. Use of ‘cudaReadModeElementType’ returns the actual texture data, and therefore no scaling to “unnormalize” is needed in that case. But it requires point-wise retrieval, no interpolation is allowed (basically at the hardware level this is a path that bypasses the interpolator hardware and delivers the data straight from memory). Here is my previous example code, modified to use ‘cudaReadModeElementType’. Note the scaling at the tex2D() call is gone.

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<unsigned char, 2, cudaReadModeElementType> tex;

__global__ void kernel (int m, int n) 
{
    float val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = tex2D (tex, col+0.5f, row+0.5f);
            printf ("%12.5f  ", val);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{10,20,30},{40,50,60},{70,80,90},{100,110,120}};
    unsigned char *arr_d = 0;

    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    return EXIT_SUCCESS;
}

Expected output:

reading array straight
    10.00000      20.00000      30.00000
    40.00000      50.00000      60.00000
    70.00000      80.00000      90.00000
   100.00000     110.00000     120.00000

Okay, I do not want to argue, especially since I’m not an expert. I was just surprised by the garbage returned instead of expected values. I was not ready for this behaviour.

I want to ask one more question: - Have you ever worked with mipmaps? I mean CUmipmappedArray, cuTexRefSetMipmapFilterMode() and tex2DLod(). My input texture contains mipmaps, but I do not see it at the output. Whichever level I set, the first slice is returned. I did not find any examples using mipmaps.

P.S.
In your sample, try to change to . While I used one channel, everything was fine. The problem starts when I use RGBA.

When working with an unfamiliar API, it is natural that some confusion can occur. The CUDA documentation is sufficiently comprehensive that all the information needed to construct the examples I posted in this thread can be found there.

I have worked with mipmaps before: in OpenGL and OpenGL-ES. Not in CUDA. And I am not familiar with CUDA’s driver API as previously noted, having used the runtime API exclusively for a dozen years. So I am afraid I am unable to provide assistance for this new question. BTW, it would probably be best to post this new issue in a new thread dedicated to it. That makes it easier for future readers to find and reference.

P.S. Here is the previous example modified to use a ‘uchar4’ texture.

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<uchar4, 2, cudaReadModeElementType> tex;

__global__ void kernel (int m, int n) 
{
    uchar4 val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = tex2D (tex, col+0.5f, row+0.5f);
            printf ("(%3u %3u %3u %3u)  ", val.x, val.y, val.z, val.w);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    uchar4 arr[4][3]= {{{10,11,12,13},{20,21,22,23},{30,31,32,33}},
                       {{40,41,42,43},{50,51,52,53},{60,61,62,63}},
                       {{70,71,72,72},{80,81,82,83},{90,91,92,93}},
                       {{100,101,102,103},{110,111,112,113},{120,121,122,123}}};
    uchar4 *arr_d = 0;

    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    return EXIT_SUCCESS;
}

It works exactly as expected. The output is:

reading array straight
( 10  11  12  13)  ( 20  21  22  23)  ( 30  31  32  33)
( 40  41  42  43)  ( 50  51  52  53)  ( 60  61  62  63)
( 70  71  72  72)  ( 80  81  82  83)  ( 90  91  92  93)
(100 101 102 103)  (110 111 112 113)  (120 121 122 123)

OK, thank you!
Mipmaps is not a top priority at the moment.

Best regards,
San

The issue is solved and this thread can be closed.
For those who are interested in interop between DX11 and CUDA, I posted source code where these components work together (including mipmaps).

You can take it here: http://sanbasestudio.com/tmp/DX11_CUDA_Test_final.zip (I like open source!)

Perhaps I’m not the only person who tries to combine a graphical interface and mathematical calculations. So, I hope this can also help someone else.
Special thanks to cbuchner1, I was pleased to learn that I am known not only in the art hangouts. :)

San