Hi there,

I am porting a code form thrust to CUDA. I follow some examples online but I am not sure if this is correct or now since I am just starting. I have to following code compiled without any problem and I have the compiled code run in NVIDIA Tesla C2075, there is no error comes out in run time but I think there is something wrong with the code since the return data is not as what I expected.

```
#include <cuda.h>
#include <iostream>
#include "matio.h"
using namespace std;
#define N 600
int InitGPUSet()
{
char GPU[100] = "GPU: ";
cudaDeviceProp tCard;
int num = 0;
if (cudaSuccess == cudaGetDeviceCount(&num))
{
for (int i = 0; i < num; ++ i)
{
cudaSetDevice(i);
cudaGetDeviceProperties(&tCard, i);
puts(strcat(GPU , tCard.name));
}
}
else return 0;
return 1;
}
__global__ void findStd(double A0, double B0, double *data)
{
unsigned int x = blockIdx.x;
unsigned int y = threadIdx.x;
double A[100000], B[100000];
double phi=0, sum=0.0, sum2=0.0, mean=0.0, slope=0.0, stddev=0.0;
A[0] = A0;
B[0] = B0;
for (int n=0; n<100000-1; n++)
{
A[n+1] = A[n] + B[n];
B[n+1] = B[n] + A[n+1];
sum += A[n];
sum2 += A[n]*A[n];
}
mean = sum/100000.0;
stddev = (sum2 - 2*sum*mean + 100000.0*mean*mean)/(double)100000.0;
if (stddev>=5) data[y+x*N] = 255.0;
else data[y+x*N] = -12.0;
}
int main(void)
{
if(!InitGPUSet()) puts("device is not ready!");
else
{
int ret=0;
double data[N*N];
cudaEvent_t start, end;
cout << "staring ..." << endl;
cudaEventCreate(&start);
cudaEventCreate(&end);
cudaEventRecord(start,0);
for (double ii=0; ii<=4; ii+=0.01)
{
for (double jj=-1.5; jj<=1.5; jj+=0.01)
{
findStd<<<N, N>>>(ii, jj, &data[0]);
cudaDeviceSynchronize();
}
}
cudaEventRecord(end, 0);
cudaEventSynchronize(end);
float elapsed_time;
cudaEventElapsedTime(&elapsed_time, start, end);
cout << "Time: " << elapsed_time/1000 << " sec!" << endl;
}
}
```

Here is a brief introduction of what the code did. I have a recurrence relation have A and B involved. For a given initial condition (A[0] and B[0]), I have it iterate 100000 time in a given way so I will return a 10000 array for A and B. I then calculate the standard deviation (stddev) on A and depends on the stddev, I will set the value at the corresponding cell in an output array. I have totally 600x600 initial conditions need to be tested so I have to run above function 600x600 times. To make it parallel, I setup above algorithm as a kernel and set the number of grid and number of threads to 600. But after I run the code, I find the returned array (data) have some random number instead. There must be something wrong but I cannot tell what’s going on there due to my limit experience in CUDA programming.