One possibility might be stack corruption in host code. Without a reproducer, that’s just a guess of course.
If all the things you’ve left out of your posting are truly unimportant to resolving the issue, it should be a fairly straightforward matter to convert what you’ve shown into a minimal reproducer of the problem.
When I attempt to do that, I cannot reproduce the failure; things seem to work correctly for me:
$ cat t1021.cu
#include <cuComplex.h>
#include <stdio.h>
#include <stdlib.h>
#define DW 128
#define DH 128
#define SW 64
#define SH 64
// Software fallback for double-precision atomicAdd, built on atomicCAS.
//
// Devices of compute capability 6.0 and higher already provide a native
// atomicAdd(double*, double); defining the same signature again for those
// targets is a redefinition error. The guard below therefore compiles this
// fallback only for device passes targeting older architectures. The host
// pass (__CUDA_ARCH__ undefined) also skips it, because the toolkit headers
// already declare the overload there.
//
// address: location of the double to update.
// val:     value to add to *address.
// returns: the value stored at *address before this thread's addition.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
// Native atomicAdd(double*, double) is available; nothing to define.
#else
__device__ double atomicAdd(double* address, double val) {
    // atomicCAS has no double overload, so operate on the 64-bit pattern.
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // Try to publish (assumed + val); atomicCAS returns what was actually
        // in memory, which differs from `assumed` if another thread got in first.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif
// Atomically accumulates the complex value `b` into `*a`.
//
// The real and imaginary components are updated by two separate
// double-precision atomicAdd calls, so each component is updated atomically
// but the pair is not updated as a single indivisible unit.
//
// a: destination complex value to accumulate into.
// b: complex value to add.
__device__ void atomicAddComplex(cuDoubleComplex* a, cuDoubleComplex b) {
    // cuDoubleComplex stores the real part in .x and the imaginary part in .y,
    // laid out as two consecutive doubles.
    double* const realPart = &a->x;
    double* const imagPart = &a->y;

    const double addReal = cuCreal(b);
    atomicAdd(realPart, addReal);

    const double addImag = cuCimag(b);
    atomicAdd(imagPart, addImag);
}
// Accumulates every input ("mode") element into every element of three
// target ("screen") arrays using atomic adds.
//
// Launch layout: 1-D grid of 1-D blocks; each thread handles the single input
// element at index threadIdx.x + blockDim.x*blockIdx.x. Threads whose index is
// at or beyond width*height exit immediately, so the grid may be rounded up.
//
// d_Ez, d_Ephi, d_Erho:                   input arrays, read at the thread's index;
//                                         must hold at least width*height elements.
// d_screenEz, d_screenErho, d_screenEphi: accumulation targets; each must hold at
//                                         least (2*screenx_half)*(2*screeny_half)
//                                         elements.
// width, height:                          input extents; used only for the bounds guard.
// screenx_half, screeny_half:             half-extents of the target in x and y.
// k, width_half, height_half, invsd, focal_length, tar:
//                                         not used by this reduced kernel; kept so the
//                                         signature stays unchanged.
__global__ void adding(cuDoubleComplex* d_Ez, cuDoubleComplex* d_Ephi, cuDoubleComplex* d_Erho, cuDoubleComplex* d_screenEz,cuDoubleComplex* d_screenErho, cuDoubleComplex* d_screenEphi,int width, int height, double k, int width_half, int height_half, int screenx_half, int screeny_half, double invsd, double focal_length, double tar)
{
    int mode_element = threadIdx.x+blockDim.x*blockIdx.x; // index into the 1-D input arrays, from thread and block index

    // Bounds guard: do not read past the inputs when the grid is larger than
    // the data. 64-bit product avoids int overflow for large extents.
    if ((long long)mode_element >= (long long)width * (long long)height) {
        return;
    }

    cuDoubleComplex Ez = d_Ez[mode_element];     // component of input array 1
    cuDoubleComplex Ephi = d_Ephi[mode_element]; // same, input array 2
    cuDoubleComplex Erho = d_Erho[mode_element]; // same, input array 3

    // The two loops walk the target array in a 2-D manner.
    for (int j = -screeny_half ; j < screeny_half; j++) {
        for (int i = -screenx_half; i < screenx_half; i++) {
            // Flattened row-major target index; row length is 2*screenx_half.
            int screen_element = i+screenx_half + (j+screeny_half)*2*screenx_half;

            // The phase factor is set to something simple (1 + 0i), so each
            // product below equals the input value.
            cuDoubleComplex Phase = make_cuDoubleComplex(1,0);

            cuDoubleComplex ScreenEz = cuCmul(Phase, Ez);
            atomicAddComplex(&d_screenEz[screen_element], ScreenEz);

            cuDoubleComplex ScreenEphi = cuCmul(Phase, Ephi);
            atomicAddComplex(&d_screenEphi[screen_element], ScreenEphi);

            cuDoubleComplex ScreenErho = cuCmul(Phase, Erho);
            atomicAddComplex(&d_screenErho[screen_element], ScreenErho);
        }
    }

    // Debug output only: this is a plain (non-atomic) read with no grid-wide
    // synchronization, so other threads may still be accumulating and the
    // printed value is not guaranteed to be the final sum.
    printf("%.1f ", cuCreal(d_screenErho[2]));
}
// Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Test driver: fills three DW*DH input arrays with the constants 1, 2 and 3,
// runs the `adding` kernel to accumulate them into three zeroed SW*SH target
// arrays, then prints the real part of element 2 of each target
// (Erho, Ephi, Ez, in that order).
//
// Every CUDA runtime call and the kernel launch are checked; on any failure a
// message is written to stderr and the process exits with EXIT_FAILURE.
// Returns 0 on success.
int main(){
    const int SCREEN_X = SW;
    const int SCREEN_Y = SH;
    const int dsize = DW*DH;
    const int ssize = SW*SH;
    const size_t dbytes = dsize*sizeof(cuDoubleComplex);
    const size_t sbytes = ssize*sizeof(cuDoubleComplex);
    cuDoubleComplex *d_Ez, *d_Ephi, *d_Erho, *d_screenEz, *d_screenErho, *d_screenEphi, *h_out, *h_Ez, *h_Ephi, *h_Erho;
    int width,height, width_half, height_half;
    double k = 0.0, invsd = 0.0, focal_length = 0.0, tar = 0.0;

    // Device buffers.
    checkCuda(cudaMalloc(&d_Ez, dbytes), "cudaMalloc(d_Ez)");
    checkCuda(cudaMalloc(&d_Ephi, dbytes), "cudaMalloc(d_Ephi)");
    checkCuda(cudaMalloc(&d_Erho, dbytes), "cudaMalloc(d_Erho)");
    checkCuda(cudaMalloc(&d_screenEz, sbytes), "cudaMalloc(d_screenEz)");
    checkCuda(cudaMalloc(&d_screenErho, sbytes), "cudaMalloc(d_screenErho)");
    checkCuda(cudaMalloc(&d_screenEphi, sbytes), "cudaMalloc(d_screenEphi)");

    // Host buffers.
    h_out = (cuDoubleComplex *)malloc(sbytes);
    h_Ez = (cuDoubleComplex *)malloc(dbytes);
    h_Ephi = (cuDoubleComplex *)malloc(dbytes);
    h_Erho = (cuDoubleComplex *)malloc(dbytes);
    if (h_out == NULL || h_Ez == NULL || h_Ephi == NULL || h_Erho == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        return EXIT_FAILURE;
    }

    // The accumulation targets must start at zero; an all-zero byte pattern
    // is 0.0 for both double components.
    checkCuda(cudaMemset(d_screenEz, 0, sbytes), "cudaMemset(d_screenEz)");
    checkCuda(cudaMemset(d_screenEphi, 0, sbytes), "cudaMemset(d_screenEphi)");
    checkCuda(cudaMemset(d_screenErho, 0, sbytes), "cudaMemset(d_screenErho)");

    for (int i = 0; i < dsize; i++){
        h_Ez[i] = make_cuDoubleComplex(1,0);
        h_Ephi[i] = make_cuDoubleComplex(2,0);
        h_Erho[i] = make_cuDoubleComplex(3,0);
    }
    checkCuda(cudaMemcpy(d_Ez, h_Ez, dbytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_Ez)");
    checkCuda(cudaMemcpy(d_Ephi, h_Ephi, dbytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_Ephi)");
    checkCuda(cudaMemcpy(d_Erho, h_Erho, dbytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_Erho)");

    width = DW;
    height = DH;
    width_half = width/2;
    height_half = height/2;

    // DW blocks of DH threads: one thread per input element.
    adding<<<DW,DH >>>(d_Ez, d_Ephi, d_Erho, d_screenEz, d_screenErho, d_screenEphi, width, height, k, width_half, height_half, SCREEN_X/2, SCREEN_Y/2, invsd, focal_length, tar);
    // A launch does not return an error itself: query launch-configuration
    // errors explicitly, then synchronize to surface execution errors.
    checkCuda(cudaGetLastError(), "adding kernel launch");
    checkCuda(cudaDeviceSynchronize(), "adding kernel execution");

    checkCuda(cudaMemcpy(h_out, d_screenErho, sbytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_screenErho)");
    printf("\n\n\n%.1f\n", cuCreal(h_out[2]));
    checkCuda(cudaMemcpy(h_out, d_screenEphi, sbytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_screenEphi)");
    printf("%.1f\n", cuCreal(h_out[2]));
    checkCuda(cudaMemcpy(h_out, d_screenEz, sbytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_screenEz)");
    printf("%.1f\n", cuCreal(h_out[2]));

    // Release host and device memory.
    free(h_out);
    free(h_Ez);
    free(h_Ephi);
    free(h_Erho);
    checkCuda(cudaFree(d_Ez), "cudaFree(d_Ez)");
    checkCuda(cudaFree(d_Ephi), "cudaFree(d_Ephi)");
    checkCuda(cudaFree(d_Erho), "cudaFree(d_Erho)");
    checkCuda(cudaFree(d_screenEz), "cudaFree(d_screenEz)");
    checkCuda(cudaFree(d_screenErho), "cudaFree(d_screenErho)");
    checkCuda(cudaFree(d_screenEphi), "cudaFree(d_screenEphi)");
    return 0;
}
$ nvcc -o t1021 t1021.cu
$ cuda-memcheck ./t1021
=====ONLY SHOWING TAIL END OF OUTPUT==========
49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0 49152.0
49152.0
32768.0
16384.0
========= ERROR SUMMARY: 0 errors
$
So my guess is the problem is in something you haven’t shown.