Hi ladies and gents!

I have problem with sum calculation in CUDA. I have an struct which contains an array of 784 elements and i need to calculate it’s sum.

I tried to modify the code from nvidia reduction example, but i still get wrong sum. Maybe I used bad block and thread dimensions.

Please, can anybody post a small kernel function which will compute the sum of this array? - please include blockidx and threadidx dimensions.

Or any help how to do this will help me.

Thanks in advance.

PS: sorry for my English, it’s not my native language.

I tried to figure it out on a simple example which will calculate sum of 16 elements, but i again get wrong result

here is the code:

[codebox]

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#include <cuda.h>

**global** void

reduce0(float* g_idata,float* g_odata, unsigned int n)

{

```
extern __shared__ float temp[];
int thid = threadIdx.x;
int pout = 0, pin = 1;
temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for(int offset = 1;offset < n; offset *= 2)
{
pout = 1 - pout;
pin = 1 - pout;
if(thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid - offset];
else
temp[pout*n+thid] = temp[pin*n+thid];
__syncthreads();
}
g_odata[thid] = temp[pout*n + thid];
```

}

int main(int argc, char **argv)

{

float *data;

float *odata;

data=(float *)malloc((16)*sizeof(float));

odata=(float *)malloc((16)*sizeof(float));

for(int i=0;i<16;i++){data[i] = (float)i/100; printf(“data[%d]=%f\n”,i,data[i]);}

float *g_idata;

float *g_odata;

cutilSafeCall( cudaMalloc( (void**) &g_idata, (16)*sizeof(float)));

cutilSafeCall( cudaMalloc( (void**) &g_odata, (16)*sizeof(float)));

cutilSafeCall( cudaMemcpy( g_idata, data, 16*sizeof(float), cudaMemcpyHostToDevice) );

reduce0<<<2, 8>>>(g_idata,g_odata, 16);

cudaMemcpy(odata, g_odata, 16*sizeof(float), cudaMemcpyDeviceToHost);

for(int i=0;i<16;i++){printf(“data[%d]=%f\n”,i,odata[i]);}

system(“PAUSE”);

}

[/codebox]