Cpp code runs fine with cuda memcheck but gives zeros without cuda-memcheck

I have a simple code that runs on GPU (with cuda version 8.0.72). The code read an image from binary file (which works correctly) and then copy image to GPU, perform multiplication and copy back result to CPU. When I run code with cuda-memcheck it works as expected, but when I run it normally the result is zeros. Any idea what may cause this and how to fix it?

Code is listed below.

#include <iostream>
#include <fstream>
#include <opencv2/opencv.hpp>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>

#define HEIGHT 480
#define WIDTH 640

__global__ void multiply(float* In1, float* In2, float* Out, int N);

int main()
{
    std::ifstream file("posle_BPR.bin", std::ios::in | std::ios::binary);
    cv::Mat I(1,HEIGHT * WIDTH, CV_64F);
    cv::Mat I_out(1,HEIGHT * WIDTH, CV_32F);

    float *I_gpu;
    float *tmp;

    file.read(reinterpret_cast<char*>(I.data), HEIGHT * WIDTH * I.elemSize());
    file.close();
    I.convertTo(I,CV_32F);

    int size = HEIGHT*WIDTH*sizeof(float);

    cudaMalloc((void**)& I_gpu, size);
    cudaMemcpy(I_gpu, I.data, size, cudaMemcpyHostToDevice);

    cudaMalloc((void**)& tmp, size);
    cudaMemcpy(tmp, I.data, size, cudaMemcpyHostToDevice);

    int bw2 = 1024;
    dim3 dimBlock2(bw2);
    dim3 dimGrid2(ceil(WIDTH*HEIGHT / float(bw2)));

    multiply <<<dimGrid2,dimBlock2,bw2*sizeof(float)>>>(reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(tmp),HEIGHT*WIDTH);

    cudaDeviceSynchronize();
    cudaMemcpy(I_out.data, tmp, size, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    std::cout<<I_out<<std::endl;

    return 0;
}

__global__ void multiply(float* In1, float* In2, float* Out, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N){
            Out[i] = In1[i] * In2[i];

    }
}

It looks like it’s the problem with size of image when cudaMemcpy is done. When I lower size of image to less than 200k pixels it runs well. How can I fix this, so that I can run this code with bigger images?

That seems like a fairly old CUDA version to be using, currently.

I don’t have OpenCV nor your input file, but when I create a simple test case around the code you have shown, I don’t see any issues:

# cat t134.cu
#include <iostream>
#include <fstream>

#define HEIGHT 480
#define WIDTH 640
#define T1 2.0f

__global__ void multiply(float* In1, float* In2, float* Out, int N);

int main()
{

    float *I_gpu;
    float *tmp;
    float *h;
    int size = HEIGHT*WIDTH*sizeof(float);
    h = new float[HEIGHT*WIDTH];
    for (int i = 0; i< HEIGHT*WIDTH; i++) h[i]  = T1;
    cudaMalloc((void**)& I_gpu, size);
    cudaMemcpy(I_gpu, h, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)& tmp, size);
    cudaMemset(tmp, 0, size);
    int bw2 = 1024;
    dim3 dimBlock2(bw2);
    dim3 dimGrid2(ceil(WIDTH*HEIGHT / float(bw2)));

    multiply <<<dimGrid2,dimBlock2,bw2*sizeof(float)>>>(reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(I_gpu), reinterpret_cast<float*>(tmp),HEIGHT*WIDTH);
    cudaMemcpy(h, tmp, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i< HEIGHT*WIDTH; i++) if (h[i] != T1*T1) {std::cout << "Error at: " << i << " was: " << h[i] << " should be: " << T1*T1 << std::endl; return 0;}

    return 0;
}

__global__ void multiply(float* In1, float* In2, float* Out, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N){
            Out[i] = In1[i] * In2[i];

    }
}
# nvcc -o t134 t134.cu
# compute-sanitizer ./t134
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
# ./t134
#

So I suspect the problem lies in something you haven’t shown or something you haven’t told us.