cudaMemcpy copying wrong data set back from GPU

Hey All,

So I have some code which is supposed to do the following.

  1. Take some input arrays and copy them to the GPU.
  2. For each of those input arrays, take one component, and then multiply it by a phase factor, and then add it to a target location on a destination array using atomic add. (So one input array element is added to each element of the target array)
  3. For each component of the input arrays, use a single thread, and index over the target array locations using nested for loops.
  4. Then index over the components of the input arrays using the threads and blocks of the kernel.

The code works as expected and in the excerpt below, the printf command at the bottom gives me the result I expect for the element I request from the d_screenErho array. However, when I try to cudaMemcpy the d_screenErho array back to the host, it copies the d_screenEphi array back to the host instead. Any ideas why it should be doing this? Thanks for any input you can give me.

Here is the kernel.

__global__ void adding(cuDoubleComplex* d_Ez, cuDoubleComplex* d_Ephi, cuDoubleComplex* d_Erho, cuDoubleComplex* d_screenEz,cuDoubleComplex* d_screenErho, cuDoubleComplex* d_screenEphi,
					   int width, int height, double k, int width_half, int height_half, int screenx_half, int screeny_half, double invsd, double focal_length, double tar)
{

	int mode_element = ...//calculation of input arrays indicies based on thread index and block index (1-D arrays)

	cuDoubleComplex Ez  = d_Ez[mode_element]; // getting component of input array 1
	cuDoubleComplex Ephi = d_Ephi[mode_element]; //same input array 2
	cuDoubleComplex Erho = d_Erho[mode_element]; //same input array 3

	for (int j = -screeny_half ; j < screeny_half; j++) {
		for (int i = -screenx_half; i < screenx_half; i++) { //these two loops index over the target array in a 2-d manner
			int screen_element =  ..//calculate the target array's index
			
                        cuDoubleComplex Phase = make_cuDoubleComplex(1,0); //set the phase factor to something simple

			cuDoubleComplex ScreenEz = cuCmul(Phase, Ez); //multiplying the phase by the input array and assigning it a variable
			atomicAddComplex(&d_screenEz[screen_element], ScreenEz); //combining

			cuDoubleComplex ScreenEphi = cuCmul(Phase, Ephi);
			atomicAddComplex(&d_screenEphi[screen_element], ScreenEphi);

			cuDoubleComplex ScreenErho = cuCmul(Phase, Erho);
			atomicAddComplex(&d_screenErho[screen_element], ScreenErho);

		}
	}
	printf("%.1f ", cuCreal(d_screenErho[2]));
}

Here is the code I use to copy the array back.

gpuErrchk(cudaMemcpy(h_out, d_screenErho, SCREEN_X * SCREEN_Y* sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));

And here is the helper function atomicAddComplex

__device__ void atomicAddComplex(cuDoubleComplex* a, cuDoubleComplex b) {
	//transform the addresses of real and imaginary parts of a to double pointers
	double* x = (double*)a;
	double* y = x + 1;
	//use atomicAdd for double variables
	atomicAdd(x, cuCreal(b));
	atomicAdd(y, cuCimag(b));
}

One possibility might be stack corruption in host code. Without a reproducer, that’s just a guess of course.

If all the things you’ve left out of your posting are truly unimportant to resolving the issue, it should be a fairly straightforward matter to convert what you’ve shown into a minimal reproducer of the problem.

When I attempt to do that, I have no luck, things seem to work for me:

$ cat t1021.cu
#include <cuComplex.h>
#include <stdio.h>

#define DW 128
#define DH 128
#define SW 64
#define SH 64


__device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
   // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ void atomicAddComplex(cuDoubleComplex* a, cuDoubleComplex b) {
 //transform the addresses of real and imaginary parts of a to double pointers
 double* x = (double*)a;
 double* y = x + 1;
 //use atomicAdd for double variables
 atomicAdd(x, cuCreal(b));
 atomicAdd(y, cuCimag(b));
}

__global__ void adding(cuDoubleComplex* d_Ez, cuDoubleComplex* d_Ephi, cuDoubleComplex* d_Erho, cuDoubleComplex* d_screenEz,cuDoubleComplex* d_screenErho, cuDoubleComplex* d_screenEphi,int width, int height, double k, int width_half, int height_half, int screenx_half, int screeny_half, double invsd, double focal_length, double tar)
{

 int mode_element = threadIdx.x+blockDim.x*blockIdx.x; //calculation of input arrays indicies based on thread index and block index (1-D arrays)

 cuDoubleComplex Ez  = d_Ez[mode_element]; // getting component of input array 1
 cuDoubleComplex Ephi = d_Ephi[mode_element]; //same input array 2
 cuDoubleComplex Erho = d_Erho[mode_element]; //same input array 3

 for (int j = -screeny_half ; j < screeny_half; j++) {
  for (int i = -screenx_half; i < screenx_half; i++) { //these two loops index over the target array in a 2-d manner
   int screen_element =  i+screenx_half + (j+screeny_half)*2*screenx_half; //calculate the target array's index

                        cuDoubleComplex Phase = make_cuDoubleComplex(1,0); //set the phase factor to something simple

   cuDoubleComplex ScreenEz = cuCmul(Phase, Ez); //multiplying the phase by the input array and assigning it a variable
   atomicAddComplex(&d_screenEz[screen_element], ScreenEz); //combining

   cuDoubleComplex ScreenEphi = cuCmul(Phase, Ephi);
   atomicAddComplex(&d_screenEphi[screen_element], ScreenEphi);

   cuDoubleComplex ScreenErho = cuCmul(Phase, Erho);
   atomicAddComplex(&d_screenErho[screen_element], ScreenErho);

  }
 }
        printf("%.1f ", cuCreal(d_screenErho[2]));
}

int main(){

  const int SCREEN_X = SW;
  const int SCREEN_Y = SH;
  const int dsize = DW*DH;
  const int ssize = SW*SH;
  cuDoubleComplex *d_Ez, *d_Ephi, *d_Erho, *d_screenEz, *d_screenErho, *d_screenEphi, *h_out, *h_Ez, *h_Ephi, *h_Erho;
  int width,height, width_half, height_half;
  double k = 0.0, invsd = 0.0, focal_length = 0.0, tar = 0.0;

  cudaMalloc(&d_Ez, dsize*sizeof(cuDoubleComplex));
  cudaMalloc(&d_Ephi, dsize*sizeof(cuDoubleComplex));
  cudaMalloc(&d_Erho, dsize*sizeof(cuDoubleComplex));
  cudaMalloc(&d_screenEz, ssize*sizeof(cuDoubleComplex));
  cudaMalloc(&d_screenErho, ssize*sizeof(cuDoubleComplex));
  cudaMalloc(&d_screenEphi, ssize*sizeof(cuDoubleComplex));
  h_out = (cuDoubleComplex *)malloc(ssize*sizeof(cuDoubleComplex));
  h_Ez = (cuDoubleComplex *)malloc(dsize*sizeof(cuDoubleComplex));
  h_Ephi = (cuDoubleComplex *)malloc(dsize*sizeof(cuDoubleComplex));
  h_Erho = (cuDoubleComplex *)malloc(dsize*sizeof(cuDoubleComplex));

  cudaMemset(d_screenEz, 0, ssize*sizeof(cuDoubleComplex));
  cudaMemset(d_screenEphi, 0, ssize*sizeof(cuDoubleComplex));
  cudaMemset(d_screenErho, 0, ssize*sizeof(cuDoubleComplex));

  for (int i = 0; i < dsize; i++){
    h_Ez[i]   = make_cuDoubleComplex(1,0);
    h_Ephi[i] = make_cuDoubleComplex(2,0);
    h_Erho[i] = make_cuDoubleComplex(3,0);}


  cudaMemcpy(d_Ez, h_Ez, dsize*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Ephi, h_Ephi, dsize*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
  cudaMemcpy(d_Erho, h_Erho, dsize*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);


  width = DW;
  height = DH;
  width_half = width/2;
  height_half = height/2;

  adding<<<DW,DH >>>(d_Ez, d_Ephi, d_Erho, d_screenEz, d_screenErho, d_screenEphi, width, height, k, width_half, height_half, SCREEN_X/2, SCREEN_Y/2, invsd, focal_length, tar);


  cudaMemcpy(h_out, d_screenErho, SCREEN_X * SCREEN_Y* sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
  printf("\n\n\n%.1f\n", cuCreal(h_out[2]));
  cudaMemcpy(h_out, d_screenEphi, SCREEN_X * SCREEN_Y* sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
  printf("%.1f\n", cuCreal(h_out[2]));
  cudaMemcpy(h_out, d_screenEz, SCREEN_X * SCREEN_Y* sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
  printf("%.1f\n", cuCreal(h_out[2]));

  return 0;
}
$ nvcc -o t1021 t1021.cu
$ cuda-memcheck ./t1021

=====ONLY SHOWING TAIL END OF OUTPUT==========
49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0


49152.0
32768.0
16384.0
========= ERROR SUMMARY: 0 errors
$

So my guess is the problem is in something you haven’t shown.

Thanks for your prompt response. I’ll go through your code to see if I can pick up some best practices and port them over to my code to see if that helps.

I didn’t post the whole code because I wanted to make it easier to follow, but for reference here is the entire Kernel, and most of the host code (except the atomic add helper function for cuda complex).

Kernel.

__global__ void adding(cuDoubleComplex* d_Ez, cuDoubleComplex* d_Ephi, cuDoubleComplex* d_Erho, cuDoubleComplex* d_screenEz,cuDoubleComplex* d_screenErho, cuDoubleComplex* d_screenEphi,
					   int width, int height, double sd, double k, double pitch, double pitch_squared, int width_half, int height_half, int screenx_half, int screeny_half, double invsd, double focal_length, double tar)
{
	int x = threadIdx.x + blockIdx.x*blockDim.x - width_half; //offset so that x=0 is centered
	int y = threadIdx.y + blockIdx.y*blockDim.y - height_half; //offset so that y=0 is centered
	int mode_element = (x + width_half) + (gridDim.x*blockDim.x)*(y + height_half); //the last two terms get rid of the offset introduced into x and y.
	double distance;
	double argument;

	cuDoubleComplex Ez  = d_Ez[mode_element]; //getting the input component of the Ez array
	cuDoubleComplex Ephi = d_Ephi[mode_element]; //now Ephi array
	cuDoubleComplex Erho = d_Erho[mode_element]; //now Erho array

	for (int j = -screeny_half ; j < screeny_half; j++) {
		for (int i = -screenx_half; i < screenx_half; i++) { //these two loops index over the screen in a 2-d manner
			int screen_element = (i+screenx_half) + (j+screeny_half)*SCREEN_X; //since the screen array is 1-D, this calculates which element we're on
			distance = sqrt(((sd+pitch*double(x*tan(tar)))*(sd+pitch*double(x*tan(tar)))+pitch_squared*double(((x-4*i)*(x-4*i) + (y-4*j)*(y-4*j))))) //distance from mode element to screen
				+ focal_length - sqrt(focal_length*focal_length+pitch_squared*double((x/cos(tar))*(x/cos(tar)) + y*y)); //distance "added" by effect of lens
			argument = k * distance; //The argument of the exponential
			//cuDoubleComplex Phase = make_cuDoubleComplex( cos(argument)/distance, sin(argument)/distance); //Writing the phase factor in its real and imaginary terms
			cuDoubleComplex Phase = make_cuDoubleComplex(1,0); //simple phase for debugging

			cuDoubleComplex ScreenEz = cuCmul(Phase, Ez); //multiplying the phase by the input field and assinging it a variable
			atomicAddComplex(&d_screenEz[screen_element], ScreenEz); //combining

			cuDoubleComplex ScreenEphi = cuCmul(Phase, Ephi);
			atomicAddComplex(&d_screenEphi[screen_element], ScreenEphi);

			cuDoubleComplex ScreenErho = cuCmul(Phase, Erho);
			atomicAddComplex(&d_screenErho[screen_element], ScreenErho);

		}
	}
	
}

Host code

int main(int argc, char** argv)
{
	const int width = 4, height = 4, depth = 4; //define lots of constants
	const double sd = .2;
	const double invsd = 1/sd;
	const double focal_length = .2;
	const double pi = 3.14159265358979311;
	const double k = 2*pi/.0000008;
	const double pitch = .000006;
	const double pitch_squared = pitch*pitch;
	const double waist = .00002;
	const double tilt_angle = 0;
	const double tar = (pi/180)*tilt_angle;
	int width_half = width/2;
	int height_half = height/2;
	int screenx_half = SCREEN_X/2;
	int screeny_half = SCREEN_Y/2;
	const double C = 1.0;
	const double w_0 = 250;

	//declare the host pointers and allocate their memory

	cuDoubleComplex* h_Ez;
	cuDoubleComplex* h_Ephi;
	cuDoubleComplex* h_Erho;
	cuDoubleComplex* h_screenEz;
	cuDoubleComplex* h_screenEphi;
	cuDoubleComplex* h_screenErho;
	cuDoubleComplex* h_out;

	h_Ez = (cuDoubleComplex *)malloc(width*height*sizeof(cuDoubleComplex));
	h_Ephi = (cuDoubleComplex *)malloc(width*height*sizeof(cuDoubleComplex));
	h_Erho = (cuDoubleComplex *)malloc(width*height*sizeof(cuDoubleComplex));
	h_screenEz = (cuDoubleComplex *)malloc(SCREEN_X*SCREEN_Y*sizeof(cuDoubleComplex));
	h_screenEphi = (cuDoubleComplex *)malloc(SCREEN_X*SCREEN_Y*sizeof(cuDoubleComplex));
	h_screenErho = (cuDoubleComplex *)malloc(SCREEN_X*SCREEN_Y*sizeof(cuDoubleComplex));
	h_out = (cuDoubleComplex *)malloc(SCREEN_X*SCREEN_Y*sizeof(cuDoubleComplex));

	//declare the pointers on the GPU and allocate their memory
	cuDoubleComplex* d_Ez;
	cuDoubleComplex* d_Ephi;
	cuDoubleComplex* d_Erho;
	cuDoubleComplex* d_screenEz;
	cuDoubleComplex* d_screenEphi;
	cuDoubleComplex* d_screenErho;

	gpuErrchk(cudaMalloc((void **) &d_Ez, width * height * sizeof(cuDoubleComplex)));
	gpuErrchk(cudaMalloc((void **) &d_Ephi, width * height * sizeof(cuDoubleComplex)));
	gpuErrchk(cudaMalloc((void **) &d_Erho, width * height * sizeof(cuDoubleComplex)));
	gpuErrchk(cudaMalloc((void **) &d_screenEz, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex)));
	gpuErrchk(cudaMalloc((void **) &d_screenEphi, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex)));
	gpuErrchk(cudaMalloc((void **) &d_screenErho, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex)));

	//generate and copy the blank output arrays to the GPU
	
	for (int i = 0; i < SCREEN_X * SCREEN_Y; i++) {
		cuDoubleComplex null = make_cuDoubleComplex(0,0);
		h_screenEz[i] = null;
		h_screenEphi[i] = null;
		h_screenErho[i] = null;
	}

	gpuErrchk(cudaMemcpy(d_screenEz, h_screenEz, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_screenEphi, h_screenEphi, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_screenErho, h_screenErho, SCREEN_X * SCREEN_Y * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));

	//generate the input array on the host

	for (int l=0; l<1; l++) { //NB I loop over the Kernel to generate multiple outputs for incremented input arrays.  I only run 1 loop for debugging.
		double time = .05*(l+19)*.0000008/300000000;
		std::string index = std::to_string(l);

	
	for (int j = 0; j < height; j++) { //this is for generating the input arrays.
		for (int i =0; i < width; i++) {
		double x = i - width/2;
		double y = j - height/2;
		double amplitude = exp( -waist * (x*x + y*y));
		//h_Ez[i + j*width] = make_cuDoubleComplex(amplitude*cos(-k*x*tan(tar)), amplitude*sin(-k*x*tan(tar)));
		h_Ez[i + j*width] = make_cuDoubleComplex(5.0,0); //simple debugging input arrays
		h_Ephi[i + j*width] = make_cuDoubleComplex(2.0,0);
		h_Erho[i + j*width] = make_cuDoubleComplex(3.0,0);

		}
	}

	//save input mode file to disk

	ofstream myfile;
	myfile.open ("Input File" + index +".csv");
	myfile << "This is the input file.\n";

	for (int j = 0; j < height; j++) {
		for (int i = 0; i < width; i++) {
			myfile << cuCreal(h_Ez[i + j*width]);
			if (i != width-1)
				myfile << ",\t";
		}
		myfile << endl;
	}
	myfile.close();

	//declare timer events
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	
	//copy the arrays to the GPU
	gpuErrchk(cudaMemcpy(d_Ez, h_Ez, width*height * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Ephi, h_Ephi, width*height * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
	gpuErrchk(cudaMemcpy(d_Erho, h_Erho, width*height * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));

	//launch the kernel
	cudaEventRecord(start);
	adding<<<dim3(2, 2, 1), dim3(2, 2, 1)>>>(d_Ez, d_Ephi, d_Erho, d_screenEz, d_screenEphi, d_screenErho, 
		width, height, sd, k, pitch, pitch_squared, width_half, height_half, screenx_half, screeny_half, invsd, focal_length, tar);
	gpuErrchk( cudaPeekAtLastError() );
	gpuErrchk( cudaDeviceSynchronize() );
	cudaEventRecord(stop);

	//copy the results back to the CPU
	gpuErrchk(cudaMemcpy(h_out, d_screenEphi, SCREEN_X * SCREEN_Y* sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost));

	cudaEventSynchronize(stop);
	float milliseconds = 0;
	cudaEventElapsedTime(&milliseconds, start, stop);

	//save out file to disk, first real, then complex
	ofstream real;
	real.open ("Real Output" + index + ".csv");
	real << "This is the real output file.\n";
	for (int j = 0; j < SCREEN_Y; j++) {
		for (int i = 0; i < SCREEN_X; i++) {
			real << setprecision(16) << cuCreal(h_out[i + j*SCREEN_X]);
			if (i != SCREEN_X-1)
				real << ",\t";
		}
		real << endl;
	}
	real.close();

	ofstream imag;
	imag.open ("Imag Output" + index + ".csv");
	imag << "This is the imaginary output file.\n";
	for (int j = 0; j < SCREEN_Y; j++) {
		for (int i = 0; i < SCREEN_X; i++) {
			imag << setprecision(16) << cuCimag(h_out[i + j*SCREEN_X]);
			if (i != SCREEN_X-1)
				imag << ",\t";
		}
		imag << endl;
	}
	imag.close();

	printf("\n %f \n" , milliseconds);

	for(int j = 0; j < SCREEN_Y; j++) {
		for(int i = 0; i < SCREEN_X; i++) {
			printf("(%.1f, %.1f)", cuCreal(h_out[i + j*SCREEN_X]), cuCimag(h_out[i + j*SCREEN_X]));
			printf(((i % width) !=width-1) ? "\t" : "\n");
		}
	}
	}

	// free GPU memory allocation
	cudaFree(d_Ez);
	cudaFree(d_Ephi);
	cudaFree(d_Erho);
	cudaFree(d_screenEz);
	cudaFree(d_screenEphi);
	cudaFree(d_screenErho);

	//free Host memory allocation
	free(h_Ez);
	free(h_Ephi);
	free(h_Erho);
	free(h_screenEz);
	free(h_screenEphi);
	free(h_screenErho);
	free(h_out);

}