shared memory

Hi,

I am a newbie to CUDA. I am not able to understand how to use the shared variable.

Could somebody help me in understanding these shared variables?

I wrote a simple program that counts the number of odd numbers in a given vector of 100000 elements.

I tried to use a shared variable to count the number of odd numbers in each block. However, I didn’t succeed. The code works fine in emulation mode but not in regular mode.

I want to return the total count instead of the count in each block.

I greatly appreciate your help.

Thanks

The process of returning the sum of something (like an odd number count) over all the threads is very common in CUDA programming. It’s called reduction.

There’s an excellent example of reduction in the CUDA SDK demonstration projects.

Thank you so much, Worely. I don’t understand how to create a counter that can be incremented across all blocks. Could you kindly help me? I appreciate your help.

Here is my code.

/*
 * check_gpu: writes a per-element parity flag.
 *
 * For every index i in [0, no_ele), out[i] is set to 1 when ele[i] is even
 * and to 0 when it is odd.
 *
 * Parameters:
 *   ele    - device pointer to the input values (at least no_ele ints).
 *   out    - device pointer to the output flags (at least no_ele ints).
 *   no_ele - number of valid elements in ele/out.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Any block
 * size is accepted; threads whose index falls past no_ele do nothing, so the
 * grid may be rounded up with a ceil-division.
 *
 * NOTE(review): the flag marks EVEN values. If the goal is to count odd
 * numbers, either invert the test or sum (1 - out[i]) -- confirm the
 * intended polarity with the caller.
 */
__global__ void check_gpu( int *ele, int *out, int no_ele)
{
    // Widen before multiplying so the product cannot wrap in 32-bit
    // unsigned arithmetic on very large grids.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the grid rarely divides the data exactly.
    if (idx >= no_ele)
        return;

    // Each thread touches only its own element, so the value is read
    // straight from global memory; no shared-memory staging (and therefore
    // no __syncthreads() barrier) is needed for this computation.
    int value = ele[idx];

    out[idx] = (value % 2 == 0) ? 1 : 0;
}

/* Prints a diagnostic and terminates when a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills a vector with 1..no_ele, runs check_gpu over it on the
 * device, prints every resulting flag (one per line) and finally the elapsed
 * wall-clock time in seconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // time() has one-second resolution; good enough for a coarse total.
    time_t time1 = time(NULL);

    // Allocate memory for host
    const int no_ele = 100000;
    const size_t bytes = sizeof(int) * (size_t)no_ele;

    int *ele_host = (int*)malloc(bytes);
    int *out_host = (int*)malloc(bytes);
    if (ele_host == NULL || out_host == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(ele_host);
        free(out_host);
        return EXIT_FAILURE;
    }

    for (int k = 0; k < no_ele; k++) {
        ele_host[k] = k + 1;
    }

    // Memory allocation DEVICE
    int *ele_dev = NULL; // Elements in device
    int *out_dev = NULL; // Per-element flags written by the kernel
    check_cuda(cudaMalloc((void **) &ele_dev, bytes), "cudaMalloc(ele_dev) failed");
    check_cuda(cudaMalloc((void **) &out_dev, bytes), "cudaMalloc(out_dev) failed");

    // Copy data from host memory to device memory
    check_cuda(cudaMemcpy(ele_dev, ele_host, bytes, cudaMemcpyHostToDevice),
               "Host-to-device copy failed");

    // Configure Device threads and blocks.
    // The block size is kept at 250 because the kernel may stage data in a
    // 250-entry shared buffer; the block count is derived from the element
    // count (ceil-division) instead of being hard-coded.
    const int no_threads = 250;
    const int no_blocks = (no_ele + no_threads - 1) / no_threads;

    check_gpu<<<no_blocks, no_threads>>>(ele_dev, out_dev, no_ele);

    // A launch does not return a status; bad configurations surface here.
    check_cuda(cudaGetLastError(), "Kernel execution failed");

    // Blocking copy: also waits for the kernel and reports any fault it hit.
    check_cuda(cudaMemcpy(out_host, out_dev, bytes, cudaMemcpyDeviceToHost),
               "Device-to-host copy failed");

    for (int k = 0; k < no_ele; k++) {
        printf("%d\n", out_host[k]);
    }

    time_t time3 = time(NULL);
    printf("%f\n", difftime(time3, time1));

    check_cuda(cudaFree(ele_dev), "cudaFree(ele_dev) failed");
    check_cuda(cudaFree(out_dev), "cudaFree(out_dev) failed");

    free(ele_host);
    free(out_host);

    return 0;
}