Many thanks to a certain “Chris” for his/her help on my last post (“Simple Iterating Code–HELP!”)–it put me on the right track, and I eventually got it. Now I have another problem I’m trying (and, so far, failing) to crack.

The code below is the working solution for that 1-D problem. Taking it to 2-D complicates things significantly, however. My foremost question is: how do I index the “y” dimension? “int idx = blockIdx.x*blockDim.x + threadIdx.x” gives me my x index, but if I repeat that expression for “int idy,” will I merely get a copy of “idx,” or will the two together address an “idx”-by-“idy” array?

Thanks in advance for any answers.

# include <stdio.h>

# include <stdlib.h>

# include <cuda.h>

/*
 * Performs one smoothing step of a 1-D three-point stencil:
 *
 *     to[i] = 0.25*from[i-1] + 0.5*from[i] + 0.25*from[i+1]   for 0 < i < N-1
 *
 * Boundary handling:
 *   - to[0] is pinned to 1 (fixed left boundary value), written by thread 0 only.
 *   - to[N-1] is copied from from[N-1], so the right boundary value survives
 *     when the caller ping-pongs the two buffers between steps.
 *
 * Parameters:
 *   to    device buffer of N floats that receives the new values
 *   from  device buffer of N floats holding the current values (read-only);
 *         must not alias `to`, since neighbours are read while others write
 *   N     number of elements in each buffer
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads
 * whose global index is >= N do nothing, so the grid may overshoot N.
 */
__global__ void incrementArrayOnDevice(float *to, const float *from, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* Tail guard: the grid rarely divides N exactly. */
    if (idx >= N)
    {
        return;
    }

    if (idx == 0)
    {
        /* Fixed left boundary; only one thread writes it. */
        to[0] = 1.0f;
    }
    else if (idx == N - 1)
    {
        /* Carry the right boundary forward so the other buffer never
           exposes an unwritten element to the idx == N-2 stencil read. */
        to[idx] = from[idx];
    }
    else
    {
        /* Float literals keep the arithmetic in single precision. */
        to[idx] = 0.25f * from[idx - 1] + 0.5f * from[idx] + 0.25f * from[idx + 1];
    }
}

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Host driver: builds the initial state (1 at index 0, 0 elsewhere), runs
 * 200000 smoothing steps on the device while ping-ponging two buffers, then
 * copies the final state back and prints one "index --> value" line per
 * element.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * error.
 */
int main(void)
{
    printf("\n");

    float *a_h, *b_h, *a_d, *q;
    int i;
    int N = 256;
    size_t size = N * sizeof(float);

    a_h = (float *)malloc(size);
    b_h = (float *)malloc(size);
    if (a_h == NULL || b_h == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    CUDA_CHECK(cudaMalloc((void **) &a_d, size));
    CUDA_CHECK(cudaMalloc((void **) &q, size));

    /* Initial condition: a single 1 at the left boundary, 0 everywhere else. */
    a_h[0] = 1;
    for (i = 1; i < N; i++)
    {
        a_h[i] = 0;
    }

    /* Seed BOTH device buffers. The kernel's interior stencil never writes
       the last element, so the second buffer must not start out holding
       uninitialized memory there. */
    CUDA_CHECK(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(q, a_h, size, cudaMemcpyHostToDevice));

    /* 128 threads per block (a multiple of the 32-thread warp); the block
       count is rounded up so every element gets a thread. The previous
       N/128 computation produced 2-thread blocks and 0 for N < 128. */
    int blockSize = 128;
    int nBlocks = (N + blockSize - 1) / blockSize;

    float *from = a_d;
    float *to = q;

    for (int iter = 0; iter < 200000; iter++)
    {
        incrementArrayOnDevice <<< nBlocks, blockSize >>> (to, from, N);
        /* Launches report configuration errors only through the last-error slot. */
        CUDA_CHECK(cudaGetLastError());

        /* Ping-pong: the buffer just written is the next step's input. */
        float *temp = to; to = from; from = temp;
    }

    /* After the final swap, `from` points at the most recent result. The
       blocking copy also waits for all queued kernels and surfaces any
       asynchronous execution error. */
    CUDA_CHECK(cudaMemcpy(b_h, from, size, cudaMemcpyDeviceToHost));

    for (i = 0; i < N; i++)
    {
        printf("%d --> %f\n", i, b_h[i]);
    }

    free(a_h);
    free(b_h);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(q));

    printf("\n");

    return 0;
}