Unable to correctly use a 2D CUDA array with a texture object

Hello, I’m learning to program with CUDA and specifically I’m learning how to use a 2D CUDA array (cudaArray_t) and how to use it with a texture object (cudaTextureObject_t).

I create a simple problem to understand these concepts: fill an output buffer in the device with a texture object that uses a 2D array filled with ones.

I have a problem with tex2D<float>(input, x, y) in the kernel code. Sometimes it returns zero instead of one. So the output buffer doesn’t contain all ones. I do not know why tex2D<float>(input, x, y) has this behavior. I think there is something wrong with the coordinates I pass to text2D, or the configuration of the cudaTextureDesc I pass to the texture object. What am I missing?

This is the source file called test-array-texture.cu:

// This is a test file to larn CUDA Array and Texture Object.
#include <stdio.h>
#include <stdlib.h>

// HandleError, HANDLE_ERROR, HandleNull, HANDLE_NULL are utilies to handle
// errors on CUDA calls and malloc calls.

static void HandleError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}

#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))

static void HandleNull(void *ptr, const char *file, int line) {
        if (ptr == NULL) {
                printf("Host memory failed in %s at line %d\n", file, line);
                exit(EXIT_FAILURE);
        }
}

#define HANDLE_NULL(a) (HandleNull(a, __FILE__, __LINE__))

// Square dimension of the matrix.
const size_t DIM = 32;

__global__ void set_value_kernel(float *output, cudaTextureObject_t input) {
    // Map from threadIdx/BlockIdx to cell position.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float value = tex2D<float>(input, x, y);

    //printf("%d, %d, %d = %2.1f\n", x, y, offset, value);
    output[offset] = value;
}

void init_input_array(cudaArray_t dev_input) {
    // Input is a 2D array but is linear on host memory.
    float *input = (float *)malloc(DIM * DIM * sizeof(float));
    HANDLE_NULL(input);

    // Set host array values.
    for (int i = 0; i < DIM * DIM; i++)
        input[i] = 1.f;

    // Copy host array to device array.
    HANDLE_ERROR(cudaMemcpy2DToArray(dev_input,
                0, 0, // Offsets (not used).
                input,
                DIM * sizeof(float), // Width in bytes plus padding (no padding).
                DIM * sizeof(float), // Width in bytes.
                DIM, // Number of rows (not bytes).
                cudaMemcpyHostToDevice));

    free(input);
}

int main() {
    // Allocate a CUDA 2D array on the device. This array is used to create a
    // texture object that is used as input in the kernel.
    cudaChannelFormatDesc fmt_desc;
    memset(&fmt_desc, 0, sizeof(cudaChannelFormatDesc));
    fmt_desc.f = cudaChannelFormatKindFloat;
    fmt_desc.x = 32;
    fmt_desc.y = 32;
    cudaArray_t dev_input;
    HANDLE_ERROR(cudaMallocArray(&dev_input, &fmt_desc, DIM, DIM, cudaArraySurfaceLoadStore));

    init_input_array(dev_input); // Initialise the array with default values.

    // Create the resource description of the texture.
    cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(cudaResourceDesc));
    res_desc.resType = cudaResourceTypeArray;
    res_desc.res.array.array = dev_input;

    // Create the texture description (with default options).
    cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(cudaTextureDesc));

    // Finally create the texture object.
    cudaTextureObject_t input_tex;
    HANDLE_ERROR(cudaCreateTextureObject(&input_tex, &res_desc, &tex_desc, NULL));

    // Allocate a linear output buffer in the device. It is used as 2D array.
    float *dev_output;
    HANDLE_ERROR(cudaMalloc((void **)&dev_output, DIM * DIM * sizeof(float)));

    // Launch the kernel with 8x8 blocks of 8x8 thread for each block.
    dim3 blocks(DIM / 4, DIM / 4);
    dim3 threads(8, 8);
    set_value_kernel<<<blocks, threads>>>(dev_output, input_tex);

    // Allocate a linear output buffer in the host. It is used as 2D array.
    float *output = (float *)calloc(DIM * DIM, sizeof(float));
    HANDLE_NULL(output);

    // Copy output buffer from device to host.
    HANDLE_ERROR(cudaMemcpy(output, dev_output, DIM * DIM * sizeof(float),
                cudaMemcpyDeviceToHost));

    // Print output.
    for (int row = 0; row < DIM; row++) {
        for (int column = 0; column < DIM; column++) {
            printf("%5.0f ", output[column + row * DIM]);
        }
        puts("");
    }

    // Free allocated memory.
    HANDLE_ERROR(cudaFree(dev_output));
    HANDLE_ERROR(cudaDestroyTextureObject(input_tex));
    HANDLE_ERROR(cudaFreeArray(dev_input));
    free(output);

    return EXIT_SUCCESS;
}

I’m using an NVIDIA GeForce GTX 1050 Ti (compute capability 6.1) and these are the steps to build and run the code:

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Nov_22_10:17:15_PST_2023
Cuda compilation tools, release 12.3, V12.3.107
Build cuda_12.3.r12.3/compiler.33567101_0
$ nvcc -Werror all-warnings test-array-texture.cu --output-file test-array-texture
$ ./test-array-texture
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 
    0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0 

Thanks in advance for the help!

Hi, I solved the problem myself. The problem was in cudaChannelFormatDesc and in the number of blocks, not in the texture object:

  1. In cudaChannelFormatDesc I set fmt_desc.y to 32, but I’m using a single float component, so the correct value is zero.
  2. I start a thread for each cell: 8x8 threads (64) for each block, and 8x8 blocks (64), 4096 threads in total. But the input is 32x32 (1024 elements), so the coordinates in the kernel will overflow and the calculated offset is wrong. I fixed this by starting 4x4 blocks (16) and the total is correct to 64x16=1024 threads.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.