I have a simple program that runs on the GPU (CUDA version 8.0.72). The code reads an image from a binary file (which works correctly), then copies the image to the GPU, performs an element-wise multiplication, and copies the result back to the CPU. When I run the code under cuda-memcheck it works as expected, but when I run it normally the result is all zeros. Any idea what may cause this and how to fix it?
Code is listed below.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <opencv2/opencv.hpp>

#include <cstdlib>
#include <fstream>
#include <iostream>
#define HEIGHT 480
#define WIDTH 640
__global__ void multiply(float* In1, float* In2, float* Out, int N);
// Aborts with a readable message when a CUDA runtime call fails.
// Without this, a failed launch leaves tmp untouched and the program
// silently prints zeros -- the exact symptom being debugged here.
static void cudaCheck(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

int main()
{
    std::ifstream file("posle_BPR.bin", std::ios::in | std::ios::binary);
    if (!file) {
        std::cerr << "failed to open posle_BPR.bin" << std::endl;
        return 1;
    }
    // The file stores one double per pixel, so read into CV_64F first,
    // then convert to CV_32F because the kernel works on float.
    cv::Mat I(1, HEIGHT * WIDTH, CV_64F);
    cv::Mat I_out(1, HEIGHT * WIDTH, CV_32F);
    file.read(reinterpret_cast<char*>(I.data), HEIGHT * WIDTH * I.elemSize());
    file.close();
    I.convertTo(I, CV_32F);

    const size_t size = HEIGHT * WIDTH * sizeof(float);
    float* I_gpu = nullptr;
    float* tmp = nullptr;
    cudaCheck(cudaMalloc((void**)&I_gpu, size), "cudaMalloc I_gpu");
    cudaCheck(cudaMemcpy(I_gpu, I.data, size, cudaMemcpyHostToDevice),
              "cudaMemcpy H2D I_gpu");
    cudaCheck(cudaMalloc((void**)&tmp, size), "cudaMalloc tmp");

    const int bw2 = 1024;
    dim3 dimBlock2(bw2);
    // Integer ceil-division: exact, no float rounding concerns.
    dim3 dimGrid2((WIDTH * HEIGHT + bw2 - 1) / bw2);
    // NOTE: the original launch requested bw2*sizeof(float) bytes of dynamic
    // shared memory that the kernel never uses; dropped here. The kernel reads
    // I_gpu twice as both inputs (squares the image) and writes into tmp.
    multiply<<<dimGrid2, dimBlock2>>>(I_gpu, I_gpu, tmp, HEIGHT * WIDTH);
    // A kernel launch returns no status directly: cudaGetLastError catches
    // bad-configuration errors, cudaDeviceSynchronize surfaces execution
    // errors. These checks are what turn "silent zeros" into a diagnosis.
    cudaCheck(cudaGetLastError(), "kernel launch");
    cudaCheck(cudaDeviceSynchronize(), "kernel execution");

    // cudaMemcpy is blocking, so no extra synchronize is needed afterwards.
    cudaCheck(cudaMemcpy(I_out.data, tmp, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy D2H");
    cudaFree(I_gpu);
    cudaFree(tmp);

    std::cout << I_out << std::endl;
    return 0;
}
// Element-wise product: Out[i] = In1[i] * In2[i] for every i in [0, N).
// Expects a 1-D launch; any grid/block shape works as long as the total
// thread count covers N -- surplus threads exit via the guard below.
// Uses no shared memory.
__global__ void multiply(float* In1, float* In2, float* Out, int N)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= N)
        return;  // tail threads past the end of the data do nothing
    Out[idx] = In1[idx] * In2[idx];
}