CUDA Texture Memory Example for Beginners

Hello everyone!

For my school project, I need to do interpolation using texture memory. However, I know very little about texture memory and cannot use it. Could you show very very basic code that uses texture memory or direct me to it?

Best,

CUDA ships with a whole bunch of example applications, including some that demonstrate the use of textures. I would suggest having a look at those. Below is a brief example code I cooked up myself. The output should look like so:

reading array straight
    10.00000      20.00000      30.00000
    40.00000      50.00000      60.00000
    70.00000      80.00000      90.00000
   100.00000     110.00000     120.00000
reading array shifted 0.5 in x-direction
    15.00000      25.00000      30.00000
    45.00000      55.00000      60.00000
    75.00000      85.00000      90.00000
   105.00000     115.00000     120.00000
reading array shifted 0.5 in y-direction
    10.00000      20.00000      30.00000
    25.00000      35.00000      45.00000
    55.00000      65.00000      75.00000
    85.00000      95.00000     105.00000

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;

__global__ void kernel (int m, int n, float shift_x, float shift_y) 
{
    float val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = 255.0 * tex2D (tex, col+0.5f+shift_x, row+0.5f+shift_y);
            printf ("%12.5f  ", val);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{10,20,30},{40,50,60},{70,80,90},{100,110,120}};
    unsigned char *arr_d = 0;

    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted 0.5 in x-direction\n");
    kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted 0.5 in y-direction\n");
    kernel<<<1,1>>>(m, n, 0.0, -0.5f);
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}
1 Like

Can’t thank you enough for your help!

Nevertheless, I have some questions for you. If you would answer them I would appreciated.
1 - Your code does not have <cuda.h> or <cuda_runtime.h> libraries but works. How is that possible I could not understand.
2 - 3rd and 4th inputs to kernel function are written as 0.0f or 0.5f. What is the meaning of “f” used here?
3 - In the kernel, val variable is defined as 255*tex2D(…). Why 255?
4 - why tex.normalized is false?
5 - Should not we unbind the texture after we are done with it?

Thank you a lot for your incredible help, it made me understand texture memory better.

Best,

When you compile a file with a .cu extension, nvcc automatically pulls in CUDA-specific header files.
f is a suffix for floating-point literal constants that makes them have type float. This is a C/C++ thing.
Yes, it would arguably be cleaner to unbind the texture, but since the apps exits anyhow there is really no need here.
tex.normalized = false because the program does not index into the texture with normalized coordinates (normalized means: in [0,1]).
Multiply with 255 because [0,1] is mapped to [0,255]

1 Like