I am a newbie to CUDA, and I am not able to understand how to use shared variables.
Could somebody help me in understanding these shared variables?
I wrote a simple program that counts the number of odd numbers in a given vector of 100000 elements.
I tried to use a shared variable to count the number of odd numbers in each block. However, I didn’t succeed in this. The code works fine in emulation mode but not in regular mode.
I want to return the total count instead of the count in each block.
The process of returning the sum of something (like an odd number count) over all the threads is very common in CUDA programming. It’s called reduction.
There’s an excellent example of reduction in the CUDA SDK demonstration projects.
Thank you so much, Worely. I don’t understand how to create a counter that can be incremented across all blocks. Could you kindly help me? I appreciate your help.
Here is my code.
/*
 * check_gpu: per-element parity flag.
 *
 * For every index i in [0, no_ele) this kernel writes
 *     out[i] = 1  if ele[i] is even
 *     out[i] = 0  if ele[i] is odd
 *
 * Parameters:
 *   ele    - input array, at least no_ele ints, readable from device code
 *   out    - output array, at least no_ele ints, writable from device code
 *   no_ele - number of valid elements in ele/out
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Any block
 * size works; threads whose index falls beyond no_ele do nothing, so the grid
 * may be rounded up with (no_ele + threads - 1) / threads.
 *
 * Shared memory: none. The earlier version staged each element in a fixed
 * __shared__ int[250] array, but every thread only ever read back its own
 * slot, so the staging (and the barrier) bought nothing and overflowed the
 * array for blocks larger than 250 threads.
 *
 * NOTE(review): the flag is 1 for EVEN values. If the goal is a count of odd
 * numbers, either invert the test or count the zeros on the host; the
 * original output convention is kept here so existing callers see the same
 * values.
 */
__global__ void check_gpu( int *ele, int *out, int no_ele)
{
    // Widen before multiplying so blockIdx.x * blockDim.x cannot wrap in 32 bits.
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the grid rarely divides the data evenly.
    if (idx >= no_ele)
        return;

    const int value = ele[idx];
    out[idx] = (value % 2 == 0) ? 1 : 0;
}
int main(int argc, char* argv)
{
time_t time1;
time1=time(NULL);
// Allocate memory for host
int no_ele=100000;
int *ele_host;
ele_host=(int*)malloc(sizeof(int)*no_ele);
int *out_host;
out_host=(int*)malloc(sizeof(int)*no_ele);
for(int k=0; k<no_ele; k++){
ele_host[k]=k+1;
}
// Memory allocation DEVICE
int *ele_dev; // Elements in device
cudaMalloc((void **) &ele_dev, sizeof(int)*no_ele);
int *out_dev;
cudaMalloc((void **) &out_dev, sizeof(int)*no_ele);
// Copy data from host memory to device memory
cudaMemcpy(ele_dev, ele_host, sizeof(int)*(no_ele), cudaMemcpyHostToDevice);
// Configure Device threads and blocks
int no_blocks=400;
int no_threads=250;
check_gpu<<<no_blocks, no_threads>>>(ele_dev, out_dev, no_ele);
CUT_CHECK_ERROR("Kernel execution failed");
cudaMemcpy(out_host, out_dev, sizeof(int)*no_ele, cudaMemcpyDeviceToHost);
for(int k=0; k<no_ele; k++){
printf("%d\n", out_host[k]);
}
time_t time3;
time3=time(NULL);
printf("%f\n", difftime(time3, time1));
cudaFree(ele_dev);
cudaFree(out_dev);
free(ele_host);
free(out_host);