Hi, anyone,
I’m a freshman in CUDA and parallel programm.
Now I’m performing correlation of two signals with CUDA. And the problem is that when I accumulate the partial sum of each block in the kernel, the gpu result is zero. The kernel code is as follows,
global void
reduce0_kernel( float* g_i1, float* g_i2, float* g_odata, unsigned int n)
{
// shared memory // the size is determined by the host application
extern shared float sdata ;
// access thread id
unsigned int tid = threadIdx.x;
// access number of threads in this block
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
//multiplication
sdata[tid] = (i < n) ? (g_i1[i]*g_i2[i]) : 0;
__syncthreads();
for(unsigned int s=1;s<blockDim.x;s*=2)
{
if(tid % (2*s) == 0)
sdata[tid] += sdata[tid+s];
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
for(unsigned int k=1;k<blockIdx.x;k++)
g_odata[0] += g_odata[i]; // accumulate the partial sum of each block
}
can anyone tell me why? thank you very much
Best regards,
Hi, anyone,
I’m a freshman in CUDA and parallel programm.
Now I’m performing correlation of two signals with CUDA. And the problem is that when I accumulate the partial sum of each block in the kernel, the gpu result is zero. The kernel code is as follows,
global void
reduce0_kernel( float* g_i1, float* g_i2, float* g_odata, unsigned int n)
{
// shared memory // the size is determined by the host application
extern shared float sdata ;
// access thread id
unsigned int tid = threadIdx.x;
// access number of threads in this block
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
//multiplication
sdata[tid] = (i < n) ? (g_i1[i]*g_i2[i]) : 0;
__syncthreads();
for(unsigned int s=1;s<blockDim.x;s*=2)
{
if(tid % (2*s) == 0)
sdata[tid] += sdata[tid+s];
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
for(unsigned int k=1;k<blockIdx.x;k++)
g_odata[0] += g_odata[i]; // accumulate the partial sum of each block
}
can anyone tell me why? thank you very much
Best regards,
Does anyone help me? thanks a lot
Does anyone help me? thanks a lot
gshi
October 22, 2010, 2:49pm
5
The accumulation of the partial sum of each block has to happen in another kernel or in cpu.
gshi
October 22, 2010, 2:49pm
6
The accumulation of the partial sum of each block has to happen in another kernel or in cpu.
yyfn
October 23, 2010, 1:24am
7
Hi, anyone,
I’m a freshman in CUDA and parallel programm.
Now I’m performing correlation of two signals with CUDA. And the problem is that when I accumulate the partial sum of each block in the kernel, the gpu result is zero. The kernel code is as follows,
global void
reduce0_kernel( float* g_i1, float* g_i2, float* g_odata, unsigned int n)
{
// shared memory // the size is determined by the host application
extern shared float sdata ;
// access thread id
unsigned int tid = threadIdx.x;
// access number of threads in this block
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
//multiplication
sdata[tid] = (i < n) ? (g_i1[i]*g_i2[i]) : 0;
__syncthreads();
for(unsigned int s=1;s<blockDim.x;s*=2)
{
if(tid % (2*s) == 0)
sdata[tid] += sdata[tid+s];
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
for(unsigned int k=1;k<blockIdx.x;k++)
g_odata[0] += g_odata[i]; // accumulate the partial sum of each block
}
can anyone tell me why? thank you very much
Best regards,
maybe you need to have a look at the reduction algorthm!
yyfn
October 23, 2010, 1:24am
8
Hi, anyone,
I’m a freshman in CUDA and parallel programm.
Now I’m performing correlation of two signals with CUDA. And the problem is that when I accumulate the partial sum of each block in the kernel, the gpu result is zero. The kernel code is as follows,
global void
reduce0_kernel( float* g_i1, float* g_i2, float* g_odata, unsigned int n)
{
// shared memory // the size is determined by the host application
extern shared float sdata ;
// access thread id
unsigned int tid = threadIdx.x;
// access number of threads in this block
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
//multiplication
sdata[tid] = (i < n) ? (g_i1[i]*g_i2[i]) : 0;
__syncthreads();
for(unsigned int s=1;s<blockDim.x;s*=2)
{
if(tid % (2*s) == 0)
sdata[tid] += sdata[tid+s];
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
for(unsigned int k=1;k<blockIdx.x;k++)
g_odata[0] += g_odata[i]; // accumulate the partial sum of each block
}
can anyone tell me why? thank you very much
Best regards,
maybe you need to have a look at the reduction algorthm!