access array in CUDA kernel

Hello,

I am trying to convert an array of cuComplex into CUDA kernel, but when I compare to output from HOST to that generated by the DEVICE kernel the values don’t match. can anyone help - it is probably something simple that I am overlooking but would appreciate any help.

I have copied snippets of code being used for CUDA kernel as well as HOST C/C++ code.

The original C code follows:

...
float tol = 0.01f;
const unsigned int N = 5;
cuComplex *h_a, *d_a, *chk;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
chk = (cuComplex*)malloc(N*sizeof(cuComplex));

h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

// copy to DEVICE
cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern<<<nblocks,tpb>>>(N, d_a);

// copy from DEVICE
cudaMemcpy(chk, d_a, N*sizeof(cuComplex), cudaMemcpyDeviceToHost);

// PROBLEM: chk array and h_a don't match
for(unsigned int i = 0; i < N; ++i){
  if(fabs(chk[i].x - h_a[i].x) > tol){
    printf("Something went wrong!\n");
  }
}
...
// execute HOST 
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
}

free(h_a); free(chk);
cudaFree(d_a);
...

The CUDA kernel code follows:

__global__ void myKern(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }
}

One problem is here:

unsigned int k = i;

it should be:

unsigned int k = 2*i;

also, its not clear what order your code is in, but the order you have shown, where the results validation is occurring before you calculate the host results, doesn’t make sense.

this seems to give matching results for me:

$ cat t1500.cu
#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }
}


int main(){

float tol = 0.01f;
const unsigned int N = 5;
cuComplex *h_a, *d_a, *chk;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
chk = (cuComplex*)malloc(N*sizeof(cuComplex));

//h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

// copy to DEVICE
//cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern<<<nblocks,tpb>>>(N, d_a);

// copy from DEVICE
cudaMemcpy(chk, d_a, N*sizeof(cuComplex), cudaMemcpyDeviceToHost);

// PROBLEM: chk array and h_a don't match
// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
}

for(unsigned int i = 0; i < N; ++i){
  if(fabs(chk[i].x - h_a[i].x) > tol){
    printf("index %d, host: %f, gpu: %f\n", i, h_a[i].x, chk[i].x);
  }
}

free(h_a); free(chk);
cudaFree(d_a);
}
$ nvcc -o t1500 t1500.cu
$ ./t1500
$

Thanks for the reply. You are the CUDA master :)

Yes, my posted order for validation doesn’t make sense. I meant to put results validation AFTER the calculation of HOST results. I messed up on where I placed the code on my post via copy and paste.

So the only problem (ignoring my misplacement of the results validation) was that

unsigned int k = i;

should have been

unsigned int k = 2 * i;

Robert_Corvella,

Can you briefly explain why setting ‘k’ to ‘2*i’ works?

Thank you again for all your help.

You are computing two consecutive output elements per thread. that is why you need a stride of 2.

with k = i, thread 0 computes a[0] and a[1] , and thread 1 computes a[1] and a[2] which conflicts with the calculations of thread 0.

with k = 2i, thread 0 computes a[0] and a[1], and thread 1 computes a[2] and a[3]

striker159,

Thank you for the information. Now it makes perfect sense :)

One last question, if I may (from anyone out there). What if I added another parameter to DEVICE kernel (d_b) and HOST (h_b) such as in the following code where new additions are listed as ‘ADDED’:

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern(const unsigned int N, cuComplex *d_a, cuComplex *d_b){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }

 // ADDED: PROBLEM - doesn't seem to give same value as HOST code version
 d_b[0].x = r ? 1.0f : 1.0f / sqrtf(1.0f + a);
 d_b[0].x *= d_a[i].x;
 d_b[0].y *= d_a[i].y;
}

int main(){

float tol = 0.01f;
const unsigned int N = 5;
cuComplex *h_a, *d_a, *chk;

// ADDED:
cuComplex *d_b;
cuComplex h_b;
cuComplex chkB;
h_b.x = 0.0f; h_b.y = 0.0f;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
chk = (cuComplex*)malloc(N*sizeof(cuComplex));

//h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

// ADDED:
cudaMalloc((void**)&d_b, sizeof(cuComplex));

// copy to DEVICE
//cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

// ADDED:
cudaMemcpy(d_b, &h_b, sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern<<<nblocks,tpb>>>(N, d_a, d_b);

// copy from DEVICE
cudaMemcpy(chk, d_a, N*sizeof(cuComplex), cudaMemcpyDeviceToHost);

// ADDED: 
cudaMemcpy(&chkB, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);
 
// PROBLEM: chk array and h_a don't match
// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
}

// ADDED:
h_b.x = r ? 1.0f : 1.0f / sqrtf(1.0f + a);
for(unsigned int i = 0; i < N; ++i){
  h_b.x *= h_a[i].x;
  h_b.y *= h_a[i].y;
}

// ADDED: Problem HOST and DEVICE don't match 
printf("B - HOST: %0.6f, DEVICE: %0.6f\n", h_b.x, chkB.x);

for(unsigned int i = 0; i < N; ++i){
  if(fabs(chk[i].x - h_a[i].x) > tol){
    printf("index %d, host: %f, gpu: %f\n", i, h_a[i].x, chk[i].x);
  }
}

free(h_a); free(chk);
cudaFree(d_a);

// ADDED:
cudaFree(d_b);

}

The above code doesn’t give a value for h_b that is the same as value computed on DEVICE. Again, thank you all for the help.