cudaMemcpy error, all data not being transferred. but cudaMemcpy returns cudaSuccess

Hi all,

I’m having a really weird problem with a program. The code worked 100% OK with CUDA 1 (and I think cuda1 beta but I can’t remember) on my 8800 GTS (the 1st release, G80 I think?).

I’ve upgraded recently to CudaBeta2 and have a GTX 280 with Windows Vista (all at once). It produces faulty output now.

Through use of the debugger I’ve found out that of the two arrays that are sent to the card, only the first copies correctly with cudaMemcpy. Part of the second array makes it but the rest doesn’t. At first I thought it might be a space/allocation problem, but all cudaMalloc and cudaMemcpy functions return cudaSuccess. Further investigation even reveals that cudaMemset works–I can fill the array with values that way but can’t copy over the data I need to.

Has anyone seen this kind of behavior before? Any help/insight would be much appreciated.

Do you have code that can be poked at?

Unfortunately it’s mostly under NDA. I’ve included the actual allocations and transfers but I’m not sure if that’s of any use. The debugger shows that all the values in the host array are correct, but only to a point will they arrive on the device. Basically at this point I’m not sure what else to try, to me it kind of looks like a bug somewhere in cudaMemcpy.

Also, thanks for all help :biggrin:.

float* A_device, B_device;  //pointers to A and B on device

sizeA =  lengthA * sizeof(float);

sizeB =  lengthB * sizeof(float);

cudaError_t mem_allocA, mem_allocB, mem_transferA, mem_transferB;  //error checking parameters

  

mem_allocA = cudaMalloc((void**)&A_device, sizeA);

cudaMemset(A_device, 0, sizeA);

mem_transferA = cudaMemcpy(A_device, A, sizeA, cudaMemcpyHostToDevice);

mem_allocB = cudaMalloc((void**)&B_device, sizeB);

cudaMemset(B_device, 0, sizeB);

mem_transferB = cudaMemcpy(B_device, B, sizeB, cudaMemcpyHostToDevice);

I’m having the exact same problem. cudaMemcpy simply fails to copy any actual data from the host to the device, even though it returns success. I get the same results as if I had not called cudaMemcpy at all.

I’ve also had some strange crashes and even bluescreens apparently related to largish (several MB) memory allocations and (?) copys to the device. Possibly related?

Another thing to note is that global memory is not being retained between different kernel calls. Perhaps this is why the memcpy is not working? The data simply may not be being retained from the cudaMemcpy call up to the kernel.

I’m also running Cuda beta 2.0 on Vista32 with a GTX 280.

Here’s my code:

#include <cuda_runtime_api.h>

#include "BuddabrotCUDA.cuh"

#include <stdlib.h>

#include <stdio.h>

#include "cutil.h"

__device__ float4* lastPoints;

__device__ float4* points;

__device__ int* countdown;

float4 rands[BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY];

int countdowns[BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY];

void resetCUDABuffers()

{

	printf("Allocated %d bytes for initial points\n",BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  rands[n].x = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].y = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].z = (float)rand()/(float)RAND_MAX;

  rands[n].w = (float)rand()/(float)RAND_MAX;

	}

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  countdowns[n] = ITERATIONS;

	}

	cudaFree(points);

	cudaMalloc((void**)&points,ITERATIONS*BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	printf("Allocated %d bytes for point list\n",ITERATIONS*BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	cudaFree(lastPoints);

	cudaMalloc((void**)&lastPoints,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	cudaMemcpy(lastPoints,rands,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4),cudaMemcpyHostToDevice);

	cudaFree(countdown);

	cudaMalloc((void**)&countdown,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(int));

	cudaMemcpy(countdown,countdowns,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(int), cudaMemcpyHostToDevice);

	cudaThreadSynchronize();

}

__global__ void runBatch(float4 *renderTarget, const int imageW, const int imageH)

{

	const int ix = blockDim.x*blockIdx.x+threadIdx.x;

	const int iy = blockDim.y*blockIdx.y+threadIdx.y;

	const int idx = iy*gridDim.x+ix;

	int count = countdown[idx];

	float xo = lastPoints[idx].x;

	float yo = lastPoints[idx].y;

	float rx = lastPoints[idx].z;

	float ry = lastPoints[idx].w;

	float x = xo;

	float y = yo;

	float itr = 0.0;//lastPoints[idx].z;

	for (int m = 0; m < 500; m++)

	{

  for (int n = 0; n < ITERATIONS; n++)

  {

  	int rdrIdx = (n*gridDim.x*gridDim.y+idx);

  	count--;

  	if (count < 0)

  	{

    unsigned int xdx = (unsigned int)(((points[rdrIdx].x+2.0f)/3.0f)*(float)imageW);

    unsigned int ydx = (unsigned int)(((points[rdrIdx].y+1.5f)/3.0f)*(float)imageH);

    if ((xdx < imageW) && (ydx < imageH))

    {

    	int rdrIdx2 = ydx*imageW+xdx;

    	renderTarget[rdrIdx2].x+=0.0001;

    	renderTarget[rdrIdx2].y+=0.0001;

    	renderTarget[rdrIdx2].z+=0.0001;

    }

  	}

  	float x2 = x*x-y*y+xo;

  	y = 2.0f*x*y+yo;

  	x = x2;

  	points[rdrIdx] = make_float4(x,y,itr,0.0f);

  	itr++;

  	if ((x*x+y*y > 64.0f)||(itr > (float)ITERATIONS))

  	{

    if (itr > (float)ITERATIONS)

    	count = ITERATIONS;

    xo = 4.0f*(rx-0.5f);

    yo = 4.0f*(ry-0.5f);

    x = xo;

    y = yo;

    rx = __int_as_float(((__float_as_int(rx)*1664525+1013904223UL)&0x007FFFFF)|0x3F800000)-1.0f;

    ry = __int_as_float(((__float_as_int(ry)*1664525+1013904223UL)&0x007FFFFF)|0x3F800000)-1.0f;

    itr = 0.0f;

  	}

  }

	}

	lastPoints[idx].x=x;

	lastPoints[idx].y=y;

	lastPoints[idx].z=rx;

	lastPoints[idx].w=ry;

	//lastPoints[idx].z=itr;

	countdown[idx]=count;

}

void runBatchWrapper(float4 *renderTarget, const int imageW, const int imageH)

{

	dim3 threads(BLOCK_WIDTHX,BLOCK_WIDTHY);

	dim3 grid(BLOCKSX,BLOCKSY);

	/*float4* rands = (float4*) malloc(BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4));

	for (int n = 0; n < BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY; n++)

	{

  rands[n].x = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].y = 4.0f*((float)rand()/(float)RAND_MAX-0.5f);

  rands[n].z = (float)rand()/(float)RAND_MAX;

  rands[n].w = (float)rand()/(float)RAND_MAX;

	}

	cudaMemcpy(lastPoints,rand,BLOCK_WIDTHX*BLOCK_WIDTHY*BLOCKSX*BLOCKSY*sizeof(float4),cudaMemcpyHostToDevice);*/

	runBatch<<<grid, threads>>>(renderTarget,imageW,imageH);

	cudaThreadSynchronize();

	//free(rands);

}

Well, my code doesn’t work either. The trouble is the same: Data is not copied to the device. Here is my code, if you have any tips, please post it! Thanks

#include <stdio.h>

#include <cutil.h>

#include <cuda.h>

#include <cutil_inline.h>

typedef unsigned char Byte;

__global__ void feketit(Byte* Gimage, int size)

{

	const unsigned int tid = threadIdx.x;

	unsigned char adat = Gimage[tid];

	if(tid<size)

	{

		Gimage[tid] = Gimage[tid]-2;

	}

}

extern "C" void feketecsinal(int argc, const char** argv, Byte* image, int width, int height, int szinszam)

{

	//szinszam = 1;

	//printf("szelesseg: %d\n", width);

	/*int deviceCount;

	cudaGetDeviceCount(&deviceCount);

	printf("device szam: %d\n", cutGetMaxGflopsDeviceId());*/

	if( cutCheckCmdLineFlag(argc, (const char**) argv, "device") )

	{

		cutilDeviceInit(argc, (char**)argv);

		printf("Eszkoz\n");

	}

	else

	{

		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	}

	int size = width * height*szinszam*sizeof(unsigned char);

	Byte* Gimage;

	cutilSafeCall(cudaMalloc((void**) &Gimage, size));

	cudaMemcpy(Gimage, image, size, cudaMemcpyHostToDevice);

	for (int i = 0; i < 280; i++)

	{

		printf("%d: %d\n", i, Gimage[i]);

	}

	dim3 grid(1, 1, 1);

		dim3 threads(4, 1, 1);

	//feketit<<< grid, threads >>>(Gimage, size);

	cutilCheckMsg("Kernel execution failed");

	cutilSafeCall(cudaMemcpy(image, Gimage, size, cudaMemcpyDeviceToHost) );

}

did you guys find any tip to solve this problem ?? I am having the same problem, somehow, in my case, the Memcpy only copies first elements from Host to Device!!.