Hello everyone,
I am studying the parallel reduction examples in Mark Harris’s document.
I feel there is a problem of using the “extern __ shared__ int sdata”
I made this simple code based on “reduce0” example to show the problem:
#include <iostream>
__global__ void reduce0(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main(int argc, char** argv)
{
int len = 20000;
int size = len*sizeof(int);
int* in = (int*)malloc(size);
for(int i = 0; i < len; i++)
{
in[i] = rand();
// std::cout << in[i] << std::endl;
}
int *g_idata, *g_odata;
cudaMalloc((void**)&g_idata, size);
cudaMemcpy(g_idata, in, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;
cudaMalloc((void**)&g_odata, sizeof(int)*blocksPerGrid);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
reduce0<<<blocksPerGrid, threadsPerBlock>>>(g_idata, g_odata);
std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
int *d_odata;
d_odata = new int[blocksPerGrid];
memset(d_odata, 0, blocksPerGrid*4);
cudaMemcpy(d_odata, g_odata, blocksPerGrid*4, cudaMemcpyDeviceToHost);
std::cout << d_odata[0] << std::endl;
}
I can compile and run this code without any error. But the print out value of “d_odata[0]” is 0, which should be a large random integer number.
Can anyone help me to figure out where the problem is? Thanks