OpenCV Image loading in CUDA texture

karratz12 · March 2, 2021, 3:11am

Hello,

I have been trying to load and read a texture using CUDA. I have been trying to understand the API for a week now and there are some explanations about the API I really don’t understand. I am trying to read an image from a 3 channel RGB file using the opencv lib and load the image as a texture to perform some image processing on it. I am having trouble just reading it in and out. Please take a look at my code and let me know what I am doing wrong…

using namespace cv;
cudaTextureObject_t texObjLinear;
constexpr bool CV_SUCCESS = true;
cudaArray* d_imageArray = 0;
size_t pitch;
uchar* d_img;

#define checkCudaErrors(call) { \
    const cudaError_t error = call; \
    if (error != cudaSuccess) {\
        printf("Error : %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason : %s\n", error, cudaGetErrorName(error)); \
        exit(-10 * error);\
    } \
} \

#define CHECK_CV(call) { \
    const bool error = call; \
    if (error != CV_SUCCESS) {\
        printf("Error : %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason\n", error); \
        exit(-10 * error);\
    } \
} \

__global__ void extractGradients(uchar* output, const cudaTextureObject_t texObj,  int width, int height) {

    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    
    float u = x / (float)width;
    float v = y / (float)height;

    // Transform coordinates
    //u -= 0.5f;
    //v -= 0.5f;
  
    printf("%d \n", tex2D<uchar>(texObj, x * 3, y));

    //B     G     R       stride

    output[y * (width * 3) + (x * 3)] = tex2D<uchar>(texObj, x * 3, y);
    output[y * (width * 3) + (x * 3) + 1] = tex2D<uchar>(texObj, x * 3 + 1, y);
    output[y * (width * 3) + (x * 3) + 2] = tex2D<uchar>(texObj, x * 3 + 2, y); 
}

void initTexture(int w, int h, cv::Mat& _img) {
   
    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    
    checkCudaErrors(cudaMallocArray(&d_imageArray, &channelDesc, w, h));
    checkCudaErrors(cudaMemcpy2DToArray(d_imageArray, 0, 0, _img.data, w * sizeof(uchar) * 3,
        w * sizeof(uchar) * 3, h, cudaMemcpyHostToDevice));

    cudaResourceDesc            texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType = cudaResourceTypeArray;
    texRes.res.array.array = d_imageArray;

    cudaTextureDesc             texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = false;
    texDescr.filterMode = cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeClamp;
    texDescr.addressMode[1] = cudaAddressModeClamp;
    //texDescr.readMode = cudaReadModeNormalizedFloat; 
    texDescr.readMode = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texObjLinear, &texRes, &texDescr, NULL));
}

void initTexture2(int w, int h, cv::Mat& _img) {

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

    checkCudaErrors(cudaMallocPitch(&d_img, &pitch, _img.step1(), h));
    checkCudaErrors(cudaMemcpy2D(d_img, pitch, _img.data, _img.step1(),
        w * 3 * sizeof(uchar), h, cudaMemcpyHostToDevice));
    
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
    cudaResourceDesc            texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr = d_img;
    texRes.res.pitch2D.desc = desc;
    texRes.res.pitch2D.width = w;
    texRes.res.pitch2D.height = h;
    texRes.res.pitch2D.pitchInBytes = pitch;
    cudaTextureDesc         texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = false;
    texDescr.filterMode = cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeClamp;
    texDescr.addressMode[1] = cudaAddressModeClamp;
    texDescr.readMode = cudaReadModeElementType;
    //texDescr.readMode = cudaReadModeNormalizedFloat;

    checkCudaErrors(cudaCreateTextureObject(&texObjLinear, &texRes, &texDescr, NULL));
}

int main() {
    try {

        returnGPUCudaInfoResources(0);
        //const std::string filename = "C:\\Users\\karratdev\\Projects\\DelauneyTesselation\\standard_test_images\\standard_test_images\\lena_color_256.tif";
        const std::string filename = ".\\standard_test_images\\standard_test_images\\lena_color_256.tif";
        cv::Mat image = imread(filename, IMREAD_COLOR);

        /*
        uchar* testOutImage;
        testOutImage = (uchar*)malloc(image.cols * image.rows * sizeof(uchar) * 3);
        memcpy(testOutImage, image.data, image.cols * image.rows * sizeof(uchar) * 3);
        cv::Mat imageOutTest = cv::Mat(image.rows, image.cols, CV_8UC3, testOutImage);
        CHECK_CV(imwrite(".\\standard_test_images\\standard_test_images\\lena_color_256_test.png", imageOutTest));
        */

        if (image.empty()) {
            printf("Cannot read image file: %s\n", filename.c_str());
            return -1;
        }

        Point pt; pt.x = 10; pt.y = 8;
        cv::circle(image, pt, 2, 1);

        //initTexture(image.cols, image.rows, image);
        initTexture2(image.cols, image.rows, image);

        // Allocate result of transformation in device memory
        uchar* d_output;
        checkCudaErrors(cudaMalloc((void **) &d_output, image.cols * image.rows * sizeof(uchar) * 3));

        uchar* gpuRef;
        gpuRef = (uchar*)malloc(image.cols * image.rows * sizeof(uchar) * 3);

        // Invoke kernel
        dim3 dimBlock(16, 16);
        dim3 dimGrid((image.cols + dimBlock.x - 1) / dimBlock.x,
            (image.rows + dimBlock.y - 1) / dimBlock.y);
        printf("Kernel Dimension :\n   Block size : %i , %i \n    Grid size : %i , %i",
            dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y);
        extractGradients <<< dimGrid, dimBlock >>> (d_output, texObjLinear, image.cols, image.rows);

        checkCudaErrors(cudaMemcpy(gpuRef, d_output, image.cols * image.rows * sizeof(uchar) * 3, cudaMemcpyDeviceToHost));
        cv::Mat imageOut = cv::Mat(image.rows, image.cols, CV_8UC3, gpuRef);
        CHECK_CV(imwrite(".\\standard_test_images\\standard_test_images\\lena_color_256.jpg", imageOut));

        if (CV_SUCCESS == true) printf("\nSuccess !");

    }
    catch (Exception ex) {
        std::cerr << ex.what() << std::endl;
    }

    return 0;
}

This should be easy, I come from openCL and I didn’t have any problems but I am not understanding the texture CUDA system.

Here is a sample of my output :
lena_color_256

This is my original lena RGB :
lena_color_256.tif (192.4 KB)

karratz12 · March 3, 2021, 7:35pm

Please help, bump.

karratz12 · March 6, 2021, 6:10am

Please help, bump.

Robert_Crovella · March 8, 2021, 10:08pm

The fundamental issue here seems to be confusion about the texture type. It’s evident from your code that you intend to read individual bytes (i.e. individual uchar color channels) from the texture. In that case, you want the texture type to be uchar not uchar4. When I make the following changes, your code appears to work correctly for me:

karratz12:

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();  // modified
    cudaResourceDesc            texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr = d_img;
    texRes.res.pitch2D.desc = desc;
    texRes.res.pitch2D.width = 3*w;  // modified

I’ll make a few other suggestions:

You’re not making it easier for me to help you when you strip off the include headers. It’s about 5 additional lines of code over the 160 or so that you have posted, so I don’t think it would somehow make the posting “too large”. For me, it is not helpful when people do that.
I suggest learning to use the code formatting tools on this site. The proper approach is not to surround your code with blockquote (or whatever you did, or whatever button you pushed). A proper approach is to select your code in the edit window, then press the </> button at the top of the edit window. (I’ve already fixed this by editing your post.)

njuffa · March 8, 2021, 10:42pm

Another way to produce code blocks with generic syntax highlighting is to enclose them in a back-tick triple (```) at the start and end. E.g.

```
float foo (int bar);
```

displays as:

float foo (int bar);

On Stackoverflow, the backtick technique also supports specifying the programing language for the syntax highlighter. I haven’t tried yet whether that works in this forum.

```lang-cpp
// C++ code
```

rs277 · March 8, 2021, 10:52pm

That’s a nice alternative to the :

[code]
float foo (int bar);
[/code]

which also displays as:

float foo (int bar);

njuffa · March 8, 2021, 10:55pm

Way too much typing :-)

njuffa · March 9, 2021, 12:01am

Selecting code and pressing the button </> results in a code block without syntax highlighting, since the result is technically classified as pre-formatted text, not code:

float foo (int bar);

Note the missing highlighting of the function name, “foo”

karratz12 · March 9, 2021, 6:03am

Ok, I thank you so much. I will pay attention next time.

I had trouble understanding the concept of the cudaChannelFormatDesc, the Nvidia guide talking about this structure is not very clear.

Imagine I had the same RGB image, without an alpha channel and I did want to use uchar4 as an element type. Would my image be loaded as :

8 bits R, 8 bits G , 8 bits B, 8 bits 0 per element of the cudaArray??
So retrieving my RGB in the kernel would be :

float4 curPix = tex2D<float4>(texObj, u, v);

Or would my data be loaded as :

8 bits R, 8 bits G, 8 bits B , 8 bits of the next R per element ?

I guess I just don’t understand the inner working of loading texture memory and cudaChannelFormatDesc / cudaMemcpy2DToArray…

njuffa · March 9, 2021, 6:37am

I don’t think there is a need to understand the inner workings, it is sufficient to develop a working mental model. What usually works for me is this kind of a process:

(1) Read the documentation to develop an initial mental model. A few things (or even many things) will be unclear.
(2) Use the mental model in writing actual code. Usually things don’t work as expected the first time through, because the initial mental model is flawed
(3) Run some experiments on minimal examples (e.g. to answer 8 bits R, 8 bits G , 8 bits B, 8 bits 0 per element vs.8 bits R, 8 bits G, 8 bits B , 8 bits of the next R per element)
(4) Read the documentation again. Usually it makes a lot more sense the second time through, in the context of some hands-on-experience and real-world observations.

Repeat steps (2) through (4) as many times as needed. Repetition is a key component of learning. One important part of this learning process is getting one’s hands dirty, so to speak. Trying stuff, experimenting with stuff.

Robert_Crovella · March 9, 2021, 3:38pm

I would suggest not doing that. If you want to use uchar4, your underlying data should consist of a sequence of uchar4. It does not. I’m not going to try to explain the bizarre behavior that would come about if you try to do it anyway. It’s going to be some combination of pulling data “from the next element(s)” combined with some kind of clamp or other behavior when your indexing exceeds your underlying allocation.

There are various cuda sample codes that demonstrate usage of textures, plus there are 2 key canonical sections of the programming guide to be familiar with (here and here), plus there are numerous answered questions about texture usage on these forums as well as stackoverflow.

Topic		Replies	Views
2D Texture access How can I access pixels from 2D texture CUDA Programming and Performance	24	18473	June 5, 2010
Using Textures CUDA Programming and Performance	10	21830	March 29, 2007
Reading R8G8B8A8 texture using tex2D() causes strange result. CUDA Programming and Performance	27	2886	April 28, 2018
Texture Linear Filter doesn't work with uchar! normalizedFloat does not work with uchar, why CUDA Programming and Performance	7	3005	December 16, 2009
Reading and Writing OpenGL Textures with Cuda CUDA Programming and Performance	24	24954	June 26, 2023
CUDA and OpenGL - Beginner question CUDA Programming and Performance	15	4468	March 6, 2015
Simplest texture 2D examples CUDA Programming and Performance	11	11444	March 26, 2019
Reading OpenGL texture data from CUDA CUDA Programming and Performance	10	3182	January 17, 2020
opencv imageData copy to cuda CUDA Programming and Performance	6	8595	May 22, 2012
Resample a 16-bit indexed color image What's killing my performance? CUDA Programming and Performance	15	14257	August 23, 2007

OpenCV Image loading in CUDA texture

Related topics