CudaMemcopy from Devicetohost crashes program

Hello

I m trying to find a minimum of RGB around the patch size of 15 x 15

In source.cpp file at

SAFE_CALL(cudaMemcpy(rgbmin_h, rgbmin_d, size2, cudaMemcpyDeviceToHost));

program get crashed

Here is my code snippet

darkprior.h

#ifndef DARKPRIOR_H_INCLUDED
#define DARKPRIOR_H_INCLUDED

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "opencv2/opencv.hpp"

#define SAFE_CALL(call)                                                                                                         \
    do                                                                                                                          \
		{                                                                                                                           \
		cudaError_t err = (call);                                                                                               \
		if(cudaSuccess != err)                                                                                                  \
				{                                                                                                                       \
			fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err));    \
			cudaDeviceReset();                                                                                                  \
			exit(EXIT_FAILURE);                                                                                                 \
				}                                                                                                                       \
		}                                                                                                                           \
			while (0)

void dark_channel(float *image_d, float *rgbmin_d, int height, int width, int step);

#endif

Source.cpp

#include "DarkPrior.h"
#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;

int main()
{
	//load the image
	Mat src = imread("foggy-school-morning.jpg");

	//check whether image loaded is empty or not.
	if (src.empty())
	{
		cerr << "no image"; return -1;
	}
	
	//Mat rgbMin(src.size(), CV_MAKETYPE(src.depth(), 1));
//	int step = src.step;
	float *image_h = NULL;
	float *image_d = NULL;
	float *Dark_d = NULL;
	float *Dark_h = NULL;
	//Mat rgbmin(src.size(), CV_MAKETYPE(src.depth(), 1));

	size_t size1 = src.step * src.rows * sizeof(float);
	size_t size2 = src.cols * src.rows * sizeof(float);

	image_h = (float *)malloc(size1);
	Dark_h = (float *)malloc(size2);

	SAFE_CALL(cudaMalloc((void**)&image_d, size1));
	SAFE_CALL(cudaMalloc((void**)&Dark_d, size2));

	//convert image from CV::MAT to float*.
	Mat dst;
	src.convertTo(dst, CV_32F);
	image_h = dst.ptr<float>();

	SAFE_CALL(cudaMemcpy(image_d, image_h, size1, cudaMemcpyHostToDevice));

	cout << "Calculating Minimum of RGB ..." << endl;
	dark_channel(image_d, Dark_d, src.rows, src.cols, src.step);

	SAFE_CALL(cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost));

	Mat Dark_out(src.rows, src.cols, CV_32FC1, Dark_h);
	imwrite("MinRGB.jpg", Dark_out);

	cudaFree(image_d);
	cudaFree(Dark_d);

	//free(image_h);
	//free(rgbmin_h);

	return 0;
}

MinRGB.cu

#include "DarkPrior.h"
#define min(x,y) ((x<y)?x:y)

__device__ float safe_get(float *rgbMin, int width, int height, int x, int y)
{
	
	// Clamp indices to image boundaries
	x = min( max(0, x), width - 1);
	y = min( max(0, y), height - 1);

	// Translate 2D index into 1D index
	const int idx = y * width + x ;

	
	return rgbMin[idx];
}

__device__ void estimate_minimum_patch(float *rgbMin, float *darkCh, int width, int height, int radius)
{
	int x = blockIdx.x; // Current column
	int y = blockIdx.y; // Current row
    int tid = y * width + x;

	int Minval = 255.0;

	for(int i = -radius; i <= radius; i++)
	{
		for(int j = -radius; j <= radius; j++)
		{
			float ch = safe_get(rgbMin, width, height, x+i, y+j);		

			darkCh[tid] = min(ch, Minval);
		}
	}
	
}

__device__ void estimate_minimum(float3 *image, float *tmp_dark, int height, int width)
{
	int x = blockIdx.x; // Current column
	int y = blockIdx.y; // Current row
	int i = y * width + x;

	if(x > height && y > width)
	{
		return;
	}
	
	tmp_dark[i] = min(image[i].x, min(image[i].y, image[i].z));

}

__global__ void kernel_findMinRGB (float3 *image, float *tmp_dark, int height, int width)
{
	estimate_minimum(image, tmp_dark, height, width);
}

__global__ void kernel_darkChannel (float *rgbMin, float *darkCh, int height, int width)
{
	int radius  = 7;

	estimate_minimum_patch(rgbMin, darkCh, width, height, radius);
}

void dark_channel(float *image_d, float *Dark_d, int height, int width, int step)
{
	dim3 grid(width, height);

	float *tmp_min;
	cudaMalloc((void **)(&tmp_min), sizeof(float)*height*width);

	kernel_findMinRGB <<<grid, 1>>> ((float3 *)image_d, tmp_min, height, width);
	printf("RGB min is found\n");

	kernel_darkChannel <<<grid, 1>>> (tmp_min, Dark_d, height, width);
	printf("patch of minimum is also found\n");
	
	
	return;
}

My code getting crashed with an error of unknown error @ line 45 of source.cpp

I’m totally out of thoughts what is the reason, maybe you’ll be able to help.

the error may very well be from

  1. dark_channel(image_d, rgbmin_d, src.rows, src.cols, step);

already

do error checking on the kernel launch

and/ or add a breakpoint to the kernel 1st line, to note whether the kernel actually runs

Yes Kernel 1 and Kernel 2 Both Runs,
In Kernel 1 I m Finding minimum of RGB in a image and make it from 3 channel to 1 channel.
I just Executed only 1st kernel, and i m getting a perfect output

After each Kernel launch is done and returns back i m printing a statement, these statements also perfectly getting executed… So i can conclude that my kernel launch are done

"After each Kernel launch is done and returns back i m printing a statement, these statements also perfectly getting executed… So i can conclude that my kernel launch are done "

kernel launches are asynchronous; also the reason why you can not catch an error on it directly
hence, i truly hope you are not

“So i can conclude that my kernel launch are done”

based on:

"“After each Kernel launch is done and returns back i m printing a statement, these statements also perfectly getting executed.”

if you indeed verified that your kernels are running, via afore-mentioned methods, i would suggest debugging the memory copy by creating an independent memory copy, and porting the defective memory copy parameters to it, a parameter at time, until the independent memory copy crashes too
that way, you can identify problematic parameters - if the memory copy fails, it is likely because of the parameters passed to it