cudaMemcpy error, all data not being transferred. but cudaMemcpy returns cudaSuccess

ghost_03 · August 3, 2008, 8:56pm

Hi all,

I’m having a really weird problem with a program. The code worked 100% OK with CUDA 1 (and I think cuda1 beta but I can’t remember) on my 8800 GTS (the 1st release, G80 I think?).

I’ve upgraded recently to CudaBeta2 and have a GTX 280 with Windows Vista (all at once). It produces faulty output now.

Through use of the debugger I’ve found out that of the two arrays that are sent to the card, only the first copies correctly with cudaMemcpy. Part of the second array makes it but the rest doesn’t. At first I thought it might be a space/allocation problem, but all cudaMalloc and cudaMemcpy functions return cudaSuccess. Further investigation even reveals that cudaMemset works–I can fill the array with values that way but can’t copy over the data I need to.

Has anyone seen this kind of behavior before? Any help/insight would be much appreciated.

tmurray · August 3, 2008, 9:03pm

Do you have code that can be poked at?

ghost_03 · August 3, 2008, 9:30pm

Unfortunately it’s mostly under NDA. I’ve included the actual allocations and transfers but I’m not sure if that’s of any use. The debugger shows that all the values in the host array are correct, but only to a point will they arrive on the device. Basically at this point I’m not sure what else to try, to me it kind of looks like a bug somewhere in cudaMemcpy.

Also, thanks for all help :biggrin:.

float* A_device, B_device;  //pointers to A and B on device

sizeA =  lengthA * sizeof(float);

sizeB =  lengthB * sizeof(float);

cudaError_t mem_allocA, mem_allocB, mem_transferA, mem_transferB;  //error checking parameters

  

mem_allocA = cudaMalloc((void**)&A_device, sizeA);

cudaMemset(A_device, 0, sizeA);

mem_transferA = cudaMemcpy(A_device, A, sizeA, cudaMemcpyHostToDevice);

mem_allocB = cudaMalloc((void**)&B_device, sizeB);

cudaMemset(B_device, 0, sizeB);

mem_transferB = cudaMemcpy(B_device, B, sizeB, cudaMemcpyHostToDevice);

Keldor314 · August 24, 2008, 12:57pm

I’m having the exact same problem. cudaMemcpy simply fails to copy any actual data from the host to the device, even though it returns success. I get the same results as if I had not called cudaMemcpy at all.

I’ve also had some strange crashes and even bluescreens apparently related to largish (several MB) memory allocations and (?) copys to the device. Possibly related?

Another thing to note is that global memory is not being retained between different kernel calls. Perhaps this is why the memcpy is not working? The data simply may not be being retained from the cudaMemcpy call up to the kernel.

I’m also running Cuda beta 2.0 on Vista32 with a GTX 280.

Here’s my code:

#include <cuda_runtime_api.h>

#include "BuddabrotCUDA.cuh"

#include <stdlib.h>

#include <stdio.h>

#include "cutil.h"

__device__ float4* lastPoints;

__device__ float4* points;

__device__ int* countdown;

float4 rands[BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY];

int countdowns[BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY];

void resetCUDABuffers()

{

	printf("Allocated %d bytes for initial points\n",BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  rands[n].x = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].y = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].z = (float)rand()/(float)RAND_MAX;

  rands[n].w = (float)rand()/(float)RAND_MAX;

	}

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  countdowns[n] = ITERATIONS;

	}

	cudaFree(points);

	cudaMalloc((void**)&points,ITERATIONS*BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	printf("Allocated %d bytes for point list\n",ITERATIONS*BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	cudaFree(lastPoints);

	cudaMalloc((void**)&lastPoints,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	cudaMemcpy(lastPoints,rands,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4),cudaMemcpyHostToDevice);

	cudaFree(countdown);

	cudaMalloc((void**)&countdown,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(int));

	cudaMemcpy(countdown,countdowns,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(int), cudaMemcpyHostToDevice);

	cudaThreadSynchronize();

}

__global__ void runBatch(float4 *renderTarget, const int imageW, const int imageH)

{

	const int ix = blockDim.x*blockIdx.x+threadIdx.x;

	const int iy = blockDim.y*blockIdx.y+threadIdx.y;

	const int idx = iy*gridDim.x+ix;

	int count = countdown[idx];

	float xo = lastPoints[idx].x;

	float yo = lastPoints[idx].y;

	float rx = lastPoints[idx].z;

	float ry = lastPoints[idx].w;

	float x = xo;

	float y = yo;

	float itr = 0.0;//lastPoints[idx].z;

	for (int m = 0; m < 500; m++)

	{

  for (int n = 0; n < ITERATIONS; n++)

  {

  	int rdrIdx = (n*gridDim.x*gridDim.y+idx);

  	count--;

  	if (count < 0)

  	{

    unsigned int xdx = (unsigned int)(((points[rdrIdx].x+2.0f)/3.0f)*(float)imageW);

    unsigned int ydx = (unsigned int)(((points[rdrIdx].y+1.5f)/3.0f)*(float)imageH);

    if ((xdx < imageW) && (ydx < imageH))

    {

    	int rdrIdx2 = ydx*imageW+xdx;

    	renderTarget[rdrIdx2].x+=0.0001;

    	renderTarget[rdrIdx2].y+=0.0001;

    	renderTarget[rdrIdx2].z+=0.0001;

    }

  	}

  	float x2 = x*x-y*y+xo;

  	y = 2.0f*x*y+yo;

  	x = x2;

  	points[rdrIdx] = make_float4(x,y,itr,0.0f);

  	itr++;

  	if ((x*x+y*y > 64.0f)||(itr > (float)ITERATIONS))

  	{

    if (itr > (float)ITERATIONS)

    	count = ITERATIONS;

    xo = 4.0f*(rx-0.5f);

    yo = 4.0f*(ry-0.5f);

    x = xo;

    y = yo;

    rx = __int_as_float(((__float_as_int(rx)*1664525+1013904223UL)&0x007FFFFF)|0x3F800000)-1.0f;

    ry = __int_as_float(((__float_as_int(ry)*1664525+1013904223UL)&0x007FFFFF)|0x3F800000)-1.0f;

    itr = 0.0f;

  	}

  }

	}

	lastPoints[idx].x=x;

	lastPoints[idx].y=y;

	lastPoints[idx].z=rx;

	lastPoints[idx].w=ry;

	//lastPoints[idx].z=itr;

	countdown[idx]=count;

}

void runBatchWrapper(float4 *renderTarget, const int imageW, const int imageH)

{

	dim3 threads(BLOCK_WIDTHX,BLOCK_WIDTHY);

	dim3 grid(BLOCKSX,BLOCKSY);

	/*float4* rands = (float4*) malloc(BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  rands[n].x = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].y = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].z = (float)rand()/(float)RAND_MAX;

  rands[n].w = (float)rand()/(float)RAND_MAX;

	}

	cudaMemcpy(lastPoints,rand,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4),cudaMemcpyHostToDevice);*/

	runBatch<<<grid, threads>>>(renderTarget,imageW,imageH);

	cudaThreadSynchronize();

	//free(rands);

}

jorgoboy · March 3, 2009, 2:12pm

Well, my code doesn’t work either. The trouble is the same: Data is not copied to the device. Here is my code, if you have any tips, please post it! Thanks

#include <stdio.h>

#include <cutil.h>

#include <cuda.h>

#include <cutil_inline.h>

typedef unsigned char Byte;

__global__ void feketit(Byte* Gimage, int size)

{

	const unsigned int tid = threadIdx.x;

	unsigned char adat = Gimage[tid];

	if(tid<size)

	{

		Gimage[tid] = Gimage[tid]-2;

	}

}

extern "C" void feketecsinal(int argc, const char** argv, Byte* image, int width, int height, int szinszam)

{

	//szinszam = 1;

	//printf("szelesseg: %d\n", width);

	/*int deviceCount;

	cudaGetDeviceCount(&deviceCount);

	printf("device szam: %d\n", cutGetMaxGflopsDeviceId());*/

	if( cutCheckCmdLineFlag(argc, (const char**) argv, "device") )

	{

		cutilDeviceInit(argc, (char**)argv);

		printf("Eszkoz\n");

	}

	else

	{

		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	}

	int size = width * height*szinszam*sizeof(unsigned char);

	Byte* Gimage;

	cutilSafeCall(cudaMalloc((void**) &Gimage, size));

	cudaMemcpy(Gimage, image, size, cudaMemcpyHostToDevice);

	for (int i = 0; i < 280; i++)

	{

		printf("%d: %d\n", i, Gimage[i]);

	}

	dim3 grid(1, 1, 1);

		dim3 threads(4, 1, 1);

	//feketit<<< grid, threads >>>(Gimage, size);

	cutilCheckMsg("Kernel execution failed");

	cutilSafeCall(cudaMemcpy(image, Gimage, size, cudaMemcpyDeviceToHost) );

}

alireza.elecat · August 7, 2019, 9:03am

did you guys find any tip to solve this problem ?? I am having the same problem, somehow, in my case, the Memcpy only copies first elements from Host to Device!!.

Topic		Replies	Views
cudaMemcpy() returns success and copy incorrect data CUDA Programming and Performance	3	2118	March 4, 2017
cudaMemcpy does not copy data from the host to device CUDA Programming and Performance	6	6980	June 20, 2012
cudaMemcpy problem problem with using cudaMemcpy CUDA Programming and Performance	12	5892	January 5, 2010
cudaDeviceSynchronize needed between kernel launch and cudaMemcpy ? CUDA Programming and Performance	15	16126	September 29, 2017
Got out of memory from cudaMemcpy CUDA Programming and Performance	13	3910	January 28, 2022
Concurrent Kernel executions & Data Transfers CUDA Programming and Performance cuda	3	577	March 8, 2023
GPU Transfer problems GPU won't correctly read data out from Device to Host CUDA Programming and Performance	15	2633	August 2, 2010
FAO: Nvidia Engineers:- Memory Leak in cudaMemcpyAsync Only occurs on Host To Device memory transfer CUDA Programming and Performance	4	5870	August 18, 2010
Question about CUDA_SAFE_CALL(cudaMemcpy(hostPx, CUDA_SAFE_CALL(cudaMemcpy(hostPx, device CUDA Programming and Performance	6	47469	January 23, 2009
__const__ writting with cudaMemcpyToSymbol CUDA Programming and Performance	15	10812	August 30, 2007

cudaMemcpy error, all data not being transferred. but cudaMemcpy returns cudaSuccess

Related topics