Hi All,
Hopefully this I am missing something obvious here, as I imagine this is a common task.
In the following code I would expect that after the call to testSampleKernel, I would have d_dist filled with 32 values of 32, as each thread in in each block just accumulates one to a volatile shared variable (32 blocks of 32 threads). Instead I get 32 values of 1.
Can someone shed some light on why I’m not getting what I think I should be getting?
Cheers,
Simple
#include <stdio.h>
#define NLU 32
__global__ void testSampleKernel(int* dist) {
volatile __shared__ int count;
if(threadIdx.x == 0) count = 0;
__syncthreads();
++count;
__syncthreads();
if(threadIdx.x == 0) dist[blockIdx.x] = count;
}
void testSample() {
int
*h_dist = new int[NLU],
*d_dist,
i;
cudaMalloc((void**)&d_dist, NLU*sizeof(int));
testSampleKernel<<<NLU, NLU>>>(d_dist);
cudaThreadSynchronize();
cudaMemcpy(h_dist, d_dist, NLU*sizeof(int), cudaMemcpyDeviceToHost);
unsigned int tot(0u);
for(i=0; i<NLU; ++i) {
printf("%d, ", h_dist[i]);
tot += h_dist[i];
}
printf("tot = %d (should equal %d)\n", tot, NLU*NLU);
cudaFree(d_dist);
delete [] h_dist;
}
int main(int argc, char* argv[]) {
testSample();
system("PAUSE");
return 1; // ...it isn't really working.
}