Cpp code runs fine with cuda memcheck but gives zeros without cuda-memcheck

matejag7 · February 8, 2024, 10:16am

I have a simple code that runs on GPU (with cuda version 8.0.72). The code read an image from binary file (which works correctly) and then copy image to GPU, perform multiplication and copy back result to CPU. When I run code with cuda-memcheck it works as expected, but when I run it normally the result is zeros. Any idea what may cause this and how to fix it?

Code is listed below.

#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>

#define HEIGHT 480
#define WIDTH 640

__global__ void multiply(float* In1, float* In2, float* Out, int N);

int main()
{
    std::ifstream file("posle_BPR.bin", std::ios::in | std::ios::binary);
    cv::Mat I(1,HEIGHT * WIDTH, CV_64F);
    cv::Mat I_out(1,HEIGHT * WIDTH, CV_32F);

    float *I_gpu;
    float *tmp;

    file.read(reinterpret_cast<char*>(I.data), HEIGHT * WIDTH * I.elemSize());
    file.close();
    I.convertTo(I,CV_32F);

    int size = HEIGHT*WIDTH*sizeof(float);

    cudaMalloc((void**)& I_gpu, size);
    cudaMemcpy(I_gpu, I.data, size, cudaMemcpyHostToDevice);

    cudaMalloc((void**)& tmp, size);
    cudaMemcpy(tmp, I.data, size, cudaMemcpyHostToDevice);

    int bw2 = 1024;
    dim3 dimBlock2(bw2);
    dim3 dimGrid2(ceil(WIDTH*HEIGHT / float(bw2)));

    multiply <<<dimGrid2,dimBlock2,bw2*sizeof(float)>>>(reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(tmp),HEIGHT*WIDTH);

    cudaDeviceSynchronize();
    cudaMemcpy(I_out.data, tmp, size, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    std::cout<<I_out<<std::endl;

    return 0;
}

__global__ void multiply(float* In1, float* In2, float* Out, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N){
            Out[i] = In1[i] * In2[i];

    }
}

matejag7 · February 8, 2024, 11:50am

It looks like it’s the problem with size of image when cudaMemcpy is done. When I lower size of image to less than 200k pixels it runs well. How can I fix this, so that I can run this code with bigger images?

Robert_Crovella · February 8, 2024, 9:06pm

That seems like a fairly old CUDA version to be using, currently.

I don’t have OpenCV nor your input file, but when I create a simple test case around the code you have shown, I don’t see any issues:

# cat t134.cu
#include <iostream>
#include <fstream>

#define HEIGHT 480
#define WIDTH 640
#define T1 2.0f

__global__ void multiply(float* In1, float* In2, float* Out, int N);

int main()
{

    float *I_gpu;
    float *tmp;
    float *h;
    int size = HEIGHT*WIDTH*sizeof(float);
    h = new float[HEIGHT*WIDTH];
    for (int i = 0; i< HEIGHT*WIDTH; i++) h[i]  = T1;
    cudaMalloc((void**)& I_gpu, size);
    cudaMemcpy(I_gpu, h, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)& tmp, size);
    cudaMemset(tmp, 0, size);
    int bw2 = 1024;
    dim3 dimBlock2(bw2);
    dim3 dimGrid2(ceil(WIDTH*HEIGHT / float(bw2)));

    multiply <<<dimGrid2,dimBlock2,bw2*sizeof(float)>>>(reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(tmp),HEIGHT*WIDTH);
    cudaMemcpy(h, tmp, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i< HEIGHT*WIDTH; i++) if (h[i] != T1*T1) {std::cout << "Error at: " << i << " was: " << h[i] << " should be: " << T1*T1 << std::endl; return 0;}

    return 0;
}

__global__ void multiply(float* In1, float* In2, float* Out, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N){
            Out[i] = In1[i] * In2[i];

    }
}
# nvcc -o t134 t134.cu
# compute-sanitizer ./t134
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
# ./t134
#

So I suspect the problem lies in something you haven’t shown or something you haven’t told us.

Topic		Replies	Views
Very simple CUDA program bad output CUDA Programming and Performance	3	761	July 3, 2017
Initcheck fails for an npp API CUDA-MEMCHECK	1	629	July 25, 2020
Calling NPP helper with large image gives kernel execution error GPU-Accelerated Libraries npp	3	1815	November 11, 2021
compilation CUDA Programming and Performance	3	7870	March 25, 2010
HELP NEEDED! cudamemcpy CUDA Programming and Performance	3	2533	March 18, 2008
CUDA memory copy (cudaMemcpy) fails after NPP sum function (nppiSum_8u_C3R) GPU-Accelerated Libraries npp	0	695	February 16, 2023
Simple question on passing to the kernel CUDA Programming and Performance	15	3392	January 15, 2012
Downsampling an image with CUDA CUDA Programming and Performance	1	1547	August 14, 2020
cudaMemcpy CUDA Programming and Performance	3	8417	April 8, 2009
Issue with NPP and opencv does not work with all image sizes? GPU-Accelerated Libraries opencv	0	1115	August 12, 2016

Cpp code runs fine with cuda memcheck but gives zeros without cuda-memcheck

Related topics