I’m just starting out with cuda. Today I worked off of one of the the examples on the Dr. Dobbs site, http://www.ddj.com/hpc-high-performance-computing/207200659.

The problem I’m working can be approached as a 1-d diffusion (or “smoothing”), algorithm. Basically, there’s a large array, a[i], and after each successive time step the new value of a[i] is given by,

da[i] = (a[i+1] -2a[i] + a[i-1])/4

a[i] += da[i]

The computation of da for all elements of the array is parallel, but updating a[i] has to wait until all of the da[i] terms have been computed. I think I have a working implementation, which I’ll post below. I have a few questions about the code I wrote though - I’d appreciate your review of the code if you have the time.

(1) As I wrote the code, I made da a global array, but its only used in the kernel. Is there a good way to define it only in the kernel?

(2) Is my use of __syncthreads() in the kernel appropriate? To be accurate, all da elements need to be computed before a is updated. Is there an easier way?

(3) I also wrote a regular version of the program, also posted below, which runs about SIX times slower on a 2.3GHz Shanghai Opteron than the cuda code runs on a cheap 9500GT card. Is this kind of speedup really possible?

(4) I put no thought at all into the blockSize or the number of blocks used by the kernel - I just stuck with what the example code came with. Any suggestions about how to think about this would be appreciated.

**CUDA version of the code**

```
// working off of example at:
// http://www.ddj.com/hpc-high-performance-computing/207402986
//
// smoothing_device.cu
// idea is to initialize set of vectors and then "smooth" them,
// by making each vector the average of its neighbors, eg
// du = u(i-1) - 2u(i) + u(i+1)
//
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <time.h>
__global__ void smoothDataOnDevice(float *a, float *da, int N, float one_over_kappa, int repeat)
{
int idx, j;
idx = blockIdx.x*blockDim.x + threadIdx.x;
for(j=0;j<repeat;j++) {
if(idx==0) {
da[idx] = (-2.0*a[idx] + a[idx+1])*one_over_kappa;
}
else if (idx<(N-1)) {
da[idx] = (a[idx-1] - 2.0*a[idx] + a[idx+1])*one_over_kappa;
}
else if (idx==(N-1)) {
da[idx] = (a[idx-1] - 2.0*a[idx])*one_over_kappa;
}
__syncthreads();
if(idx<N){
a[idx] = a[idx] + da[idx];
}
__syncthreads();
}
}
int main(void)
{
float *a_h; // pointers to host memory
float *a_d, *da_d; // pointer to device memory
int i, N = 1000000;
size_t size = N*sizeof(float);
//
// this is a smoothing constant. For stability, needs to be ~< 0.25
float one_over_kappa = 1.0/(4.0);
// initialize the random number generator with the system time
time_t t1;
time(&t1);
long random_seed;
random_seed=(long) t1;
random_seed=100;
srand(random_seed);
// allocate arrays on host
a_h = (float *)malloc(size);
// allocate array on device
cudaMalloc((void **) &a_d, size);
cudaMalloc((void **) &da_d, size);
// initialization of host data
for (i=0; i<N; i++) {
a_h[i] = (float)(2.0*drand48()-1.0);
}
a_h[0]=0.0;
a_h[N-1]=0.0;
// copy data from host to device
cudaMemcpy(a_d, a_h, sizeof(float)*N, cudaMemcpyHostToDevice);
// do calculation on device:
// Part 1 of 2. Compute execution configuration
int blockSize = 512;
int nBlocks = N/blockSize;
if(N%blockSize!=0) {
nBlocks +=1;
}
printf("N = %d\n",N);
printf("blockSize = %d\n",blockSize);
printf("nBlocks = %d\n",nBlocks);
int num_steps_to_simulate=5000;
smoothDataOnDevice<<< nBlocks, blockSize >>> (a_d, da_d, N, one_over_kappa,num_steps_to_simulate);
char out_filename[128];
FILE *OUT;
sprintf(out_filename,"smoothing.seed.%d.laststep.tsv",random_seed);
OUT = fopen(out_filename,"w");
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
for(i=0;i<N;i++){
fprintf(OUT,"%d\t%f\n",i,a_h[i]);
}
fclose(OUT);
// cleanup
free(a_h); cudaFree(da_d); cudaFree(a_d);
return 1;
}
```

[b]traditional code, runs about 6 times slower than Shanghai Opteron

[/b]

```
// smoothing_host.c
//
// idea is to initialize set of vectors and then "smooth" them,
// by making each vector the average of its neighbors, eg
// du = u(i-1) - 2u(i) + u(i+1)
//
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(void)
{
float *a, *da; // pointer to device memory
int step, i, N = 1000000;
size_t size = N*sizeof(float);
// this is a smoothing constant. For stability, needs to be ~< 0.25
float one_over_kappa = 1.0/(4.0);
// allocate arrays
a = (float *)malloc(size);
da = (float *)malloc(size);
// initialize the random number generator with the system time
time_t t1;
time(&t1);
long random_seed;
random_seed=(long) t1;
random_seed=100;
srand(random_seed);
// initialization of host data
for (i=0; i<N; i++) {
a[i] = (float)(2.0*drand48()-1.0);
}
a[0]=0.0;
a[N-1]=0.0;
// do calculation on device:
printf("N = %d\n",N);
int num_steps_to_simulate=5000;
printf("smoothing iterations: %d\n",num_steps_to_simulate);
for(step=0;step<num_steps_to_simulate;step++) {
for(i=0;i<N;i++){
if(i==0) {
da[i] = (-2.0*a[i] + a[i+1])*one_over_kappa;
}
else if (i<(N-1)) {
da[i] = (a[i-1] - 2.0*a[i] + a[i+1])*one_over_kappa;
}
else if (i==(N-1)) {
da[i] = (a[i-1] - 2.0*a[i])*one_over_kappa;
}
}
for(i=0;i<N;i++) {
a[i] = a[i] + da[i];
}
}
// final output visualization, had to repeat this to include the last smoothing
char out_filename[128];
FILE * OUT;
sprintf(out_filename,"cversion_smoothing.seed.%d.step.%d.tsv",random_seed,step);
OUT = fopen(out_filename,"w");
for (i=0; i<N; i++) {
fprintf(OUT,"%d\t%f\n",i,a[i]);
}
fclose(OUT);
// cleanup
free(a); free(da);
return 1;
}
```