Hello everyone,
I’m trying to write a kernel that modifies the elements of a cuDoubleComplex array based on their index, but I always get wrong output. If I compile with device emulation, everything works as expected.
I wrote a sample program that modifies an array vec whose elements are all initialized to (1+1i). In the kernel, I want to replace each element of vec with:
- (0.0+0.0i) - vec[j] if j is even
- (1.0+1.0i) - vec[j] if j is odd
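So on the host, the result I expect is what this plain C reference loop computes (just a sketch; "expected" is a name I’m introducing here):

/* Host-side reference for what the kernel should produce. */
cuDoubleComplex expected[NELEMS];
for (int j = 0; j < NELEMS; j++) {
    double c = (j % 2 == 0) ? 0.0 : 1.0;         /* constant picked by parity */
    expected[j] = make_cuDoubleComplex(c - 1.0,  /* real: c - Re(vec[j]) = c - 1 */
                                       c - 1.0); /* imag: c - Im(vec[j]) = c - 1 */
}
/* i.e. (-1 - 1i) for even j and (0 + 0i) for odd j. */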
Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <cuComplex.h>  /* cuDoubleComplex, make_cuDoubleComplex, cuCreal, cuCimag */
#include "cublas.h"

#define NELEMS 10

static cuDoubleComplex *devVect;

/* Forward declarations; marked static to match the definitions below. */
static __device__ inline cuDoubleComplex doubleComplexSub(cuDoubleComplex, cuDoubleComplex);
static __global__ void compute(cuDoubleComplex *, int);
int main(int argc, char **argv) {
    int i;
    dim3 grid(64, 1, 1);   /* 64 blocks of 128 threads: far more threads than     */
    dim3 block(128, 1, 1); /* elements, but the grid-stride loop below handles it */
    cuDoubleComplex vec[NELEMS];

    /* Initialize every element to (1 + 1i) on the host. */
    for (i = 0; i < NELEMS; i++) {
        vec[i] = make_cuDoubleComplex(1.0, 1.0);
    }

    CUDA_SAFE_CALL( cudaMalloc((void **)&devVect, sizeof(cuDoubleComplex) * NELEMS) );
    CUDA_SAFE_CALL( cudaMemcpy(devVect, vec,
                               sizeof(cuDoubleComplex) * NELEMS,
                               cudaMemcpyHostToDevice) );

    compute<<<grid, block>>>(devVect, NELEMS);
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    CUDA_SAFE_CALL( cudaMemcpy(vec, devVect,
                               sizeof(cuDoubleComplex) * NELEMS,
                               cudaMemcpyDeviceToHost) );

    for (i = 0; i < NELEMS; i++) {
        fprintf(stderr, "vec[%d] = %f + %fi\n", i, cuCreal(vec[i]), cuCimag(vec[i]));
    }
    return 0; /* was "return 1", which reports failure to the shell */
}
/* Component-wise complex subtraction: returns a - b. */
static __device__ inline cuDoubleComplex doubleComplexSub(cuDoubleComplex a, cuDoubleComplex b) {
    cuDoubleComplex ret;
    ret.x = a.x - b.x; /* real part */
    ret.y = a.y - b.y; /* imaginary part */
    return ret;
}
static __global__ void compute(cuDoubleComplex *vect, int dim) {
    unsigned int tid         = (blockDim.x * blockIdx.x) + threadIdx.x;
    unsigned int num_threads = gridDim.x * blockDim.x;
    unsigned int tot_elems   = dim;
    unsigned int istart;     /* unsigned, to match tot_elems in the comparison */
    cuDoubleComplex zero = {0.0, 0.0};
    cuDoubleComplex one  = {1.0, 1.0};

    /* Grid-stride loop: each thread handles every num_threads-th element. */
    for (istart = tid; istart < tot_elems; istart += num_threads) {
        vect[istart] = doubleComplexSub((istart % 2) == 0 ? zero : one, vect[istart]);
    }
}
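(One thing the listing above does not do is check for launch errors. For completeness, this is the kind of check I understand one can add right after the kernel call, using the standard runtime calls; a sketch only:)

compute<<<grid, block>>>(devVect, NELEMS);
cudaError_t err = cudaGetLastError(); /* catches an invalid launch configuration */
if (err != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
}
CUDA_SAFE_CALL( cudaThreadSynchronize() ); /* surfaces errors raised during execution */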
The wrong output I get is:
vec[0] = -1.000000 + 1.000000i
vec[1] = -0.001953 + 1.000000i
vec[2] = -1.000000 + 1.000000i
vec[3] = -0.001953 + 1.000000i
vec[4] = -1.000000 + 1.000000i
vec[5] = -0.001953 + 1.000000i
vec[6] = -1.000000 + 1.000000i
vec[7] = -0.001953 + 1.000000i
vec[8] = -1.000000 + 1.000000i
vec[9] = -0.001953 + 1.000000i
Instead of the correct output, which I get when I compile with --device-emulation:
vec[0] = -1.000000 + -1.000000i
vec[1] = 0.000000 + 0.000000i
vec[2] = -1.000000 + -1.000000i
vec[3] = 0.000000 + 0.000000i
vec[4] = -1.000000 + -1.000000i
vec[5] = 0.000000 + 0.000000i
vec[6] = -1.000000 + -1.000000i
vec[7] = 0.000000 + 0.000000i
vec[8] = -1.000000 + -1.000000i
vec[9] = 0.000000 + 0.000000i
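To narrow down whether the indexing or the double-precision arithmetic is at fault, I suppose one could run a stripped-down kernel that skips the subtraction and just stores the parity-selected constant (a debugging sketch, not part of the program above; storeConstants is a name I made up):

/* Debug kernel: store the selected constant directly, no arithmetic at all. */
static __global__ void storeConstants(cuDoubleComplex *vect, int dim) {
    unsigned int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    unsigned int num_threads = gridDim.x * blockDim.x;
    unsigned int j;
    for (j = tid; j < (unsigned int)dim; j += num_threads) {
        double c = (j % 2 == 0) ? 0.0 : 1.0;
        vect[j] = make_cuDoubleComplex(c, c);
    }
}

If even this prints wrong values, the arithmetic is not the problem.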
What am I doing wrong?
I have tried different ways of computing vect[tid] inside the kernel, but I always get the same behaviour. I couldn’t find any documentation on the right way to use double-precision complex numbers in kernels (the CUBLAS manual only deals with CUBLAS calls).
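For what it’s worth, my reading of cuComplex.h is that it already ships device-callable helpers (cuCsub, make_cuDoubleComplex, ...), so the kernel body could also be written as below, assuming your toolkit’s header has the double-precision variants. The variations I tried were along these lines, and the behaviour was the same:

/* Same kernel, but with cuComplex.h's own subtraction helper instead of mine. */
static __global__ void computeWithCuCsub(cuDoubleComplex *vect, int dim) {
    unsigned int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    unsigned int num_threads = gridDim.x * blockDim.x;
    cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);
    cuDoubleComplex one  = make_cuDoubleComplex(1.0, 1.0);
    unsigned int j;
    for (j = tid; j < (unsigned int)dim; j += num_threads) {
        vect[j] = cuCsub((j % 2 == 0) ? zero : one, vect[j]);
    }
}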
I know I must be doing something wrong, but I can’t figure out what!
Thanks,
Mauro