Hi ladies and gents!
I have problem with sum calculation in CUDA. I have an struct which contains an array of 784 elements and i need to calculate it’s sum.
I tried to modify the code from nvidia reduction example, but i still get wrong sum. Maybe I used bad block and thread dimensions.
Please, can anybody post a small kernel function which will compute the sum of this array? - please include blockidx and threadidx dimensions.
Or any help how to do this will help me.
Thanks in advance.
PS: sorry for my English, it’s not my native language.
I tried to figure it out on a simple example which will calculate sum of 16 elements, but i again get wrong result
here is the code:
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include <cuda.h>
global void
reduce0(float* g_idata,float* g_odata, unsigned int n)
{
extern __shared__ float temp[];
int thid = threadIdx.x;
int pout = 0, pin = 1;
temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for(int offset = 1;offset < n; offset *= 2)
{
pout = 1 - pout;
pin = 1 - pout;
if(thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid - offset];
else
temp[pout*n+thid] = temp[pin*n+thid];
__syncthreads();
}
g_odata[thid] = temp[pout*n + thid];
}
int main(int argc, char **argv)
{
float *data;
float *odata;
data=(float *)malloc((16)*sizeof(float));
odata=(float *)malloc((16)*sizeof(float));
for(int i=0;i<16;i++){data[i] = (float)i/100; printf(“data[%d]=%f\n”,i,data[i]);}
float *g_idata;
float *g_odata;
cutilSafeCall( cudaMalloc( (void**) &g_idata, (16)*sizeof(float)));
cutilSafeCall( cudaMalloc( (void**) &g_odata, (16)*sizeof(float)));
cutilSafeCall( cudaMemcpy( g_idata, data, 16*sizeof(float), cudaMemcpyHostToDevice) );
reduce0<<<2, 8>>>(g_idata,g_odata, 16);
cudaMemcpy(odata, g_odata, 16*sizeof(float), cudaMemcpyDeviceToHost);
for(int i=0;i<16;i++){printf(“data[%d]=%f\n”,i,odata[i]);}
system(“PAUSE”);
}
[/codebox]