# CUDA programming - Help

Hi,

I am new to CUDA. I want to know whether the following program makes sense under the CUDA parallel computing model. Also, I have some problems using shared memory.

The program I am trying to write counts the number of values in a given vector that are less than 1300 or greater than 999990.

I create a vector of 1,000,000 numbers (0 to 999999).

I divide the vector into 1000 slices of 1000 elements each.

To process these 1000 slices in parallel, I used the following launch parameters:
No. of blocks = 4
No. of threads / block = 250

Also, I created a shared variable to count the number of elements in each slice that are less than 1300 or greater than 999990. Then I want to sum these per-block counts to get the total for the whole vector.

Expected Output:
Number of elements less than 1300 or greater than 999990.

Code:

```c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

__global__ void counter(int *in1, int no, int nseg, int *hits)
{
    __shared__ int sp;

    // Each thread handles one slice of nseg elements;
    // start and end delimit that slice
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int start = idx * nseg;
    int end = start + nseg;
    for (int k = start; k < end; k++) {
        if ((in1[k] < 1300) || (in1[k] > 999990)) {
            sp = sp + 1;
        }
    }
    hits[blockIdx.x] = sp;
}

int main(int argc, char **argv)
{
    time_t time1;
    time1 = time(NULL);
    int no_ele = 1000000;
    int slice = 1000;

    // Allocate memory on the CPU
    int *input;
    input = (int *) malloc(no_ele * sizeof(int));

    int *out;
    out = (int *) malloc(4 * sizeof(int));

    for (int s = 0; s < no_ele; s++) {
        input[s] = s;
    }

    // Allocate memory on the GPU
    int *in_gpu;
    cudaMalloc((void **) &in_gpu, sizeof(int) * no_ele);
    int *out_gpu;
    cudaMalloc((void **) &out_gpu, sizeof(int) * no_ele);
    int *hit_gpu;
    cudaMalloc((void **) &hit_gpu, 4 * sizeof(int));

    // Copy the input from CPU to GPU
    cudaMemcpy(in_gpu, input, sizeof(int) * no_ele, cudaMemcpyHostToDevice);

    counter<<<4, 250>>>(in_gpu, no_ele, slice, hit_gpu);

    cudaMemcpy(out, hit_gpu, 4 * sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\n", out[0]);

    cudaFree(in_gpu);
    cudaFree(out_gpu);
    cudaFree(hit_gpu);
    free(input);
    free(out);
}
```
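For comparison, the usual pattern for a per-block shared counter is to initialize it once per block, accumulate with `atomicAdd`, and synchronize before writing the block result, since plain `sp = sp + 1` from 250 threads at once is a race. A minimal sketch of that pattern, assuming the same 4x250 launch configuration (the kernel name `counter_atomic` and the private `local` accumulator are my additions):

```cuda
__global__ void counter_atomic(int *in1, int nseg, int *hits)
{
    __shared__ int sp;

    // One thread per block zeroes the shared counter
    if (threadIdx.x == 0) sp = 0;
    __syncthreads();

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int start = idx * nseg;
    int end = start + nseg;

    int local = 0;                 // accumulate privately first
    for (int k = start; k < end; k++) {
        if (in1[k] < 1300 || in1[k] > 999990) {
            local++;
        }
    }
    atomicAdd(&sp, local);         // one atomic per thread, not per hit
    __syncthreads();

    // One thread writes the block's total
    if (threadIdx.x == 0) hits[blockIdx.x] = sp;
}
```

On the host, summing the four entries of `hits` then gives the total for the whole vector.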

I greatly appreciate any input.

Thank you so much