Hello,

I am trying to mirror an array of length N to an array of length 4*N such that every even index is skipped using a CUDA kernel and get weird results. It is probably something simple but a fresh pair of eyes helps.

Any help would be greatly appreciated.

The code follows:

```
#include <cuda.h>
#include <stdio.h>
#define TPB 1024
// HOST version
void hostMirror(unsigned int N, float *in, float *out){
int rinCounter = 0;
for(int i =1; i < (2*N) ; i+=2){
out[i] = in[rinCounter];
rinCounter++;
}
rinCounter =0;
for(int i=(4*N-1) ; i >= (2*N) ; i-=2){
out[i] = in[rinCounter];
rinCounter++;
}
}
// DEVICE version
__global__
void kernelMirror(unsigned int N, float *in, float *out){
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = 2*i + 1;
if(j < 2*N){
out[j] = in[i];
}
if(j >= 2*N && j < 4*N){
out[j] = in[4*N-j];
}
}
int main(){
unsigned int N = 2; // example is small
float *h_in, *h_out;
float *d_in, *d_out, *chk;
h_in = (float*)malloc(N*sizeof(float));
h_out = (float*)malloc(4*N*sizeof(float));
chk = (float*)malloc(4*N*sizeof(float));
cudaMalloc((void**)&d_in, N*sizeof(float));
cudaMalloc((void**)&d_out, 4*N*sizeof(float));
memset(chk, 0, 4*N*sizeof(float));
memset(h_out, 0, 4*N*sizeof(float));
// load some dummy data
for(unsigned int i = 0; i < N; ++i){
h_in[i] = i + 1;
}
cudaMemcpy(d_in, h_in, N*sizeof(float), cudaMemcpyHostToDevice);
// call HOST version
hostMirror(N, h_in, h_out);
unsigned int nb = (4*N + TPB -1) / TPB; // define DEVICE dimension(s)
// call DEVICE version
kernelMirror<<<nb,TPB>>>(N, d_in, d_out);
// copy results back from DEVICE
cudaMemcpy(chk, d_out, 4*N*sizeof(float), cudaMemcpyDeviceToHost);
// compare results
for(unsigned int i = 0; i < 4*N; ++i){
printf("Idx: %d -- HOST: %0.4f, DEVICE: %0.4f\n", i, h_out[i], chk[i]);
}
// free memory
free(h_in);
free(h_out);
free(chk);
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
```