Hi,
I'm trying to do a parallel reduction, but I can't get __syncthreads() to work.
I’m running:
CUDA4.0
Nsight 2.1
Visual Studio 2010
Here is the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <stdio.h>
#include <stdlib.h>
/**
 * Per-block integer sum reduction (interleaved addressing).
 *
 * Each block copies blockDim.x consecutive ints from g_idata into shared
 * memory, sums them in place, and thread 0 stores the block's total in
 * g_odata[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block; any block size is accepted (it does not have to
 *     be a power of two).
 *   - Dynamic shared memory: blockDim.x * sizeof(int) bytes, passed as the
 *     third launch-configuration argument.
 *   - g_idata must hold at least gridDim.x * blockDim.x ints, because every
 *     thread loads one element without a bounds check.
 *   - g_odata must hold at least gridDim.x ints.
 *
 * @param g_idata  device pointer to the input values
 * @param g_odata  device pointer receiving one partial sum per block
 */
__global__ void
reduce0(int *g_idata, int *g_odata)
{
    // Dynamically sized shared scratch buffer, one int per thread.
    extern __shared__ int sdata[];

    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    // Every element must be in shared memory before any thread reads a
    // neighbour's slot.
    __syncthreads();

    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        // modulo arithmetic is slow!
        // The second condition keeps the partner index inside the block:
        // without it a non-power-of-two block size (e.g. 10 threads,
        // tid = 8, s = 2) reads sdata[10], which is past the end of the
        // buffer and corrupts the sum.
        if ((tid % (2 * s)) == 0 && tid + s < blockDim.x) {
            sdata[tid] += sdata[tid + s];
        }
        // Barrier is outside the branch so all threads of the block reach it.
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Evaluates a CUDA runtime call and terminates the program with a readable
// message if it did not return cudaSuccess. Ignored return codes otherwise
// surface later as unrelated-looking failures.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/**
 * Host driver: fills an array with 0..N-1, reduces it on the device with
 * reduce0 (one partial sum per block), copies the partial sums back and
 * prints them.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main()
{
    // set parameters on the host
    const int threads = 10;
    const int blocks = 1;
    const int total = threads * blocks;   // one input element per thread
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    // reduce0 needs one int of dynamic shared memory per thread in a block.
    const size_t smemSize = threads * sizeof(int);

    int in_list[total];
    int out_list[blocks];                 // the kernel writes one int per block
    for (int i = 0; i < total; i++)
        in_list[i] = i;

    // set parameters on the device
    int *d_idata = NULL;
    int *d_odata = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_idata, sizeof(int) * total));
    CUDA_CHECK(cudaMalloc((void**)&d_odata, sizeof(int) * blocks));
    CUDA_CHECK(cudaMemcpy(d_idata, in_list, sizeof(int) * total,
                          cudaMemcpyHostToDevice));

    // run kernel
    reduce0<<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata);
    // A launch does not return a status; launch-configuration errors are
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // copy back to host (blocking copy, so it also waits for the kernel and
    // reports any error raised while it ran). Only `blocks` ints are valid;
    // copying more would read uninitialised device memory.
    CUDA_CHECK(cudaMemcpy(out_list, d_odata, sizeof(int) * blocks,
                          cudaMemcpyDeviceToHost));

    for (int b = 0; b < blocks; b++)
        printf("block %d sum = %d\n", b, out_list[b]);

    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_odata));
    return 0;
}
I get the following error when I compile the program: "IntelliSense: identifier '__syncthreads' is undefined". Because I can't sync my threads, the reduction constantly fails due to a race condition :( Please help! I really need to get this to work.