cudaMemcpy returns invalid value

I have a problem with a cudaMemcpy, it returns cudaErrorInvalidValue but I don’t understand why.

I have a main program which call a DLL where cuda functions are. I feed a gpuMat class with data from a sliding windows.

gpuMat is an header on a device pointer, just used to store some informations (number of cols, rows,… of the image,

My code :

Main :

int const height = 10008;
int const width = 7092;
size_t size = sizeof(uchar) * height * width + sizeof(uchar)*height * 2;
img_windowsed *host_data1;
uchar *dev_data1;

host_data1 = (img_windowsed*)malloc(sizeof(img_windowsed));
host_data1->start = (uchar*)malloc(size);
host_data1->cursor = host_data1->start;
host_data1->end = host_data1->start + size;

for (int j = 0; j < size; j++)
	host_data1->start[j] = j % 256;

if (plockmem(host_data1->start, size) == 0 )
	dev_data1 = (uchar*)cuda_image(host_data1->cursor, size);
	//it works perfectly, returns a new pointer
	if (dev_data1 != NULL )
	{	//create a GpuMat with 1 row, and width cols from a device_pointer, OK
		GpuMat gtest1 = create_gray_gpumat_from_data(1, width, dev_data1, 2);

#define NUMBER_OF_LINES 1400

		for (int i = 0; i < height; i += NUMBER_OF_LINES)
		{	//error if gtest1.rows + NUMBER_OF_LINES goes over about 3000
			if (update_gpumat_stream(host_data1, gtest1, NUMBER_OF_LINES) != 0)

Functions :

int update_gpumat_stream(img_windowsed * data_to_add, GpuMat &img, int rows_added){
	uchar * ptr;

	if (data_to_add->cursor + rows_added*img.step > data_to_add->end)
		return -2;

	ptr = + (img.rows - 1)*img.step + (img.cols - 1)*img.step;
	if (upload_rows(ptr, data_to_add->cursor, (size_t)rows_added*img.step) == 0)
		img.rows += rows_added;

		data_to_add->cursor += rows_added*img.step;
		if (data_to_add->cursor >= data_to_add->end)
			data_to_add->cursor = data_to_add->start;
		return 0;
		return -1;

int upload_rows(uchar * dst, uchar * data, size_t size){
	return cudaMemcpy((void*)dst, (void*)data, size, cudaMemcpyHostToDevice);

void * cuda_image(uchar * data, size_t size) {
	uchar *ptr;
	cudaMalloc(&ptr, size); //Allocate GPU space

	if (upload_rows(ptr, data, size) == cudaSuccess)
		return ptr;
		return NULL;

int plockmem(void * ptr, size_t size){
	int err=cudaHostRegister(ptr, size, cudaHostRegisterDefault);

	if (err == cudaSuccess)
		return 0;
		return 1;

I don’t understand because, in my function cuda_image I use without errors cudaMemcpy and if I define NUMBER_OF_LINES to height I have cudaErrorInvalidValue at the call from line 27 in the main.


EDIT : My error seems to come from update_gpumat_stream, on line : ptr = + (img.rows - 1)*img.step + (img.cols - 1)*img.step;
If I change it to ptr = I can send 10000 rows without errors.

You would want to check whether the addresses passed to cudaMemcpy(), are within the bounds of the memory allocations on the host and he device side. The issue may be incorrect computation of the addresses, or incorrect choice of allocation size(s). You could also try running with cuda-memcheck, but I am not sure it can pinpoint the problem in this case any better than you have done by narrowing it down manually.

Thanks for your answer, computation of the address in ptr was incorrect. ptr was out of allocated device memory when img.rows was greater than about 2900.

Problem solved !

I thought that the error returned by cudaMemcpy if a pointer is out of allocated memory will be cudaErrorInvalidDevicePointer and not cudaErrorInvalidValue, so I didn’t look at all in this direction.

Thanks a lot