First, I would suggest you take a look at the cuda sdk because there is project in there called “reduction” that does exactly what you want but very efficiently for CUDA. You could learn a few things from it.
Secondly, to answer your question, the reason your code is only giving you zeroes is because you didn’t actually set any of the values for arrays on the device. For example, you define d_values and malloc it correctly but your cudaMemcpy is wrong. So, d_values is filled with arbitrary values (which are most likely zero). The problem is cudaMemcpy requires that you pass in a pointer to the first address of your array of values (for both the target AND the source arrays) but you typed &d_value which is a pointer to the pointer. As we should all remember from learning C/C++, array parameters decay to pointers (float *a and float a[] are equivalent as parameters; in both cases a is a pointer to the first element of the array). All of your memory setting and copying commands are incorrect. I think you got confused by cudaMalloc (which you did actually use correctly). Where cudaMemset and cudaMemcpy are reading/writing the values ONLY within the arrays, cudaMalloc is finding a spot in memory that can be allocated and then storing that address within the pointer. So, one command is modifying the array while the other is modifying the pointer to the array. That is why you need to pass in the array pointer by reference for cudaMalloc but you only need to pass it by value for cudaMemset and cudaMemcpy.
So, change all of your cudaMemset and cudaMemcpy commands that are like this:
#include <stdio.h>
// NOTE(review): __shared__ at file scope is unusual; shared variables are
// normally declared inside the kernel body -- confirm the target nvcc
// version accepts this declaration as intended.
__shared__ int global_offset;
// Groups the 8 work items by cluster_index, gives each item a rank within its
// cluster (via atomicAdd on per-cluster counters kept in index[0..3]), then
// converts cluster sizes into base offsets so items can be compacted into a
// new ordering recorded in index[]. Launched <<<1,8>>>; threadIdx.x is the
// work-item id. The printf calls are debug tracing only.
__global__ void reduction(int * values,int * index,int * cluster_index,int * offset,int * id_within_cluster,int * new_work_item_id,int * reduce_array_size)
{
// Debug dumps of the inputs as seen on the device.
printf("global_offset is %d \n",global_offset);
printf("values of %d is %d \n",threadIdx.x,values[threadIdx.x]);
printf("cluster_index of %d is %d \n",threadIdx.x,cluster_index[threadIdx.x]);
printf("index of of %d is %d \n",threadIdx.x,index[cluster_index[threadIdx.x]]);
printf("reduce_array_size of %d is %d \n",threadIdx.x,reduce_array_size[threadIdx.x]);
printf("id within cluster of %d is %d \n",threadIdx.x,id_within_cluster[threadIdx.x]);
// atomicAdd returns the OLD counter value, which becomes this thread's rank
// within its cluster; once every thread has passed, index[c] holds the size
// of cluster c.
id_within_cluster[threadIdx.x]=atomicAdd(&index[cluster_index[threadIdx.x]],1);
printf("id_within_cluster of %d is %d \n",threadIdx.x,id_within_cluster[threadIdx.x]);
printf("index of of %d is %d \n",threadIdx.x,index[cluster_index[threadIdx.x]]);
//__syncthreads();
// BUG(review): with the barrier above commented out, index[c] may not yet
// hold the final cluster size when it is read here.
reduce_array_size[threadIdx.x]=index[cluster_index[threadIdx.x]];
printf("reduce_array_size of %d is %d \n",threadIdx.x,reduce_array_size[threadIdx.x]);
if(threadIdx.x==0)global_offset=0;
// BUG(review): missing __syncthreads() after the zeroing above -- other
// threads may reach the atomicAdd on global_offset before it is cleared.
// BUG(review): id_within_cluster is a pointer, so this condition compares the
// pointer itself against 0 (always false for a valid device pointer); it
// should be id_within_cluster[threadIdx.x]==0.
if(id_within_cluster==0)
{
index[cluster_index[threadIdx.x]]=atomicAdd(&global_offset,reduce_array_size[threadIdx.x]);
}
offset[threadIdx.x]=index[cluster_index[threadIdx.x]];
new_work_item_id[threadIdx.x]=offset[threadIdx.x]+id_within_cluster[threadIdx.x];
// index[] is reused here as the compacted permutation; no barrier separates
// this write from the reads of index[] above.
index[new_work_item_id[threadIdx.x]]=threadIdx.x;
/*
for(int i=8/2;i>0;i/2)
{
if(id_within_cluster<(reduce_array_size/2))
{
reduce_array_size=ceil(reduce_array_size/2);
values[index[newTid]]+=values[index[newTid+reduce_array_size]];
}
}
*/
printf("global_offset is %d \n",global_offset);
__syncthreads();
}
// Host driver for the first (buggy) version of the clustering kernel:
// allocates the device arrays, uploads the test data, and launches <<<1,8>>>.
int main()
{
// 8 test values and the cluster (0..3) each one belongs to.
int values[8]={5,2,9,4,8,11,3,1};
int cluster_index[8]={0,2,3,2,1,3,1,3};
// NOTE(review): these host arrays are never initialized or read back, so they
// serve no purpose here (offset is even copied to the device uninitialized).
int offset[8];
int index[8];
int new_work_item_id[8];
int * d_values;
int * d_cluster_index;
int * d_offset;
int * d_id_within_cluster;
int * d_index;
int * d_reduce_array_size;
int * d_new_work_item_id;
cudaMalloc(&d_values,8*sizeof(int));
cudaMalloc(&d_offset,8*sizeof(int));
cudaMalloc(&d_cluster_index,8*sizeof(int));
cudaMalloc(&d_id_within_cluster,8*sizeof(int));
cudaMalloc(&d_index,8*sizeof(int));
cudaMalloc(&d_reduce_array_size,8*sizeof(int));
cudaMalloc(&d_new_work_item_id,8*sizeof(int));
// BUG(review): d_reduce_array_size is cudaMalloc'd a second time here; the
// first allocation is leaked.
cudaMalloc(&d_reduce_array_size,8*sizeof(int));
// index[] must be zeroed: the kernel uses it as per-cluster counters.
cudaMemset(d_id_within_cluster,0,8*sizeof(int));
cudaMemset(d_index,0,8*sizeof(int));
cudaMemset(d_reduce_array_size,0,8*sizeof(int));
cudaMemset(d_new_work_item_id,0,8*sizeof(int));
// NOTE(review): duplicate memset of d_reduce_array_size (harmless, redundant).
cudaMemset(d_reduce_array_size,0,8*sizeof(int));
cudaMemcpy(d_values,values,8*sizeof(int),cudaMemcpyHostToDevice);
// BUG(review): offset is still uninitialized when it is copied here.
cudaMemcpy(d_offset,offset,8*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_cluster_index,cluster_index,8*sizeof(int),cudaMemcpyHostToDevice);
// BUG(review): the HOST array `index` is passed instead of d_index. The GPU
// cannot dereference a host pointer, which most likely aborts the kernel
// mid-run -- consistent with the truncated printf output reported below.
reduction<<<1,8>>>(d_values,index,d_cluster_index,d_offset,d_id_within_cluster,d_new_work_item_id,d_reduce_array_size);
// NOTE(review): no cudaDeviceSynchronize() / cudaGetLastError() before main
// returns, so device printf output may be cut off and errors go unnoticed;
// the device allocations are also never freed.
/*
for(int i=0;i<8;i++)
{
printf("values of %d is %d \n",i,values[i]);
printf("offset of %d is %d \n",i,offset[i]);
printf("cluster_index of %d is %d \n",i,cluster_index[i]);
printf("index of %d is %d \n",i,index[i]);
}
*/
}
i get this output:
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
values of 0 is 5
values of 1 is 2
values of 2 is 9
values of 3 is 4
values of 4 is 8
values of 5 is 11
values of 6 is 3
values of 7 is 1
cluster_index of 0 is 0
cluster_index of 1 is 2
cluster_index of 2 is 3
cluster_index of 3 is 2
cluster_index of 4 is 1
cluster_index of 5 is 3
cluster_index of 6 is 1
cluster_index of 7 is 3
index of of 0 is 0
index of of 1 is 0
index of of 2 is 0
index of of 3 is 0
index of of 4 is 0
index of of 5 is 0
index of of 6 is 0
index of of 7 is 0
reduce_array_size of 0 is 0
reduce_array_size of 1 is 0
reduce_array_size of 2 is 0
reduce_array_size of 3 is 0
reduce_array_size of 4 is 0
reduce_array_size of 5 is 0
reduce_array_size of 6 is 0
reduce_array_size of 7 is 0
id within cluster of 0 is 0
id within cluster of 1 is 0
id within cluster of 2 is 0
id within cluster of 3 is 0
id within cluster of 4 is 0
id within cluster of 5 is 0
id within cluster of 6 is 0
id within cluster of 7 is 0
For some reason the execution seems to halt after the first few commands of the reduction function.
As you can see, the memcopies are done correctly.
I really can’t tell why this is happening — can anybody please help me?
I don’t know how you weren’t having a problem with your reduction part as you weren’t passing in anything but zeroes into the kernel but alright… I am not about to figure out what your code is supposed to do and try programming it in CUDA for you. But, I will point out a few issues that jump out at me. Hopefully, addressing these will fix your problem.
[list=1]
[*]You pass index instead of d_index into the kernel. Since index is only allocated in the system memory, which the GPU cannot reach, I have no idea how the program will respond to something like that. I am actually surprised it doesn’t crash for you.
[*]You copy the values from offset to d_offset but you never actually set any values for offset. Is this on purpose?
[*]You allocate new_work_item_id on the host but then you don’t actually use it for anything. It is my opinion that new_work_item_id, offset, index have no reason to be defined and allocated on the host (at least with how your code is defined now).
[*]I don’t think this matters with so few threads, but I think there should be a __syncthreads() after initializing global_offset to zero because some threads might race ahead and execute the following code before it is set to zero.
[*]I have never used atomic operations before so I may be mistaken about this. But from what I just now read when looking it up, the value returned from an atomicAdd (or any other operation) is not the result of the addition but rather the old value that was in the address that is being atomically modified. I don’t know if you are aware of this and counting on it or not. I figured I would just point it out.
[*]You have an if statement that says compares id_within_cluster but id_within_cluster is an array, not a single value. I think that instead you want id_within_cluster[threadIdx.x].
[*]You may want another __syncthreads() after that last if statement as well because some threads may race ahead and read the value that is being updated within the conditional. With so few threads in this code, that will probably not happen but it can when more threads are added. Or, perhaps there is something about the logic of the code the precludes that from being an issue. I don’t know. I am just pointing out whatever I notice.
Here is the same code where I have made the changes mentioned above:
// reduction: give each of the 8 work items a rank within its cluster, turn
// per-cluster sizes into base offsets, and record the compacted ordering in
// index[].
//
// Launch configuration: <<<1, 8>>> (single block; threadIdx.x = work-item id).
// Precondition: index[] zeroed on the host -- it doubles as the per-cluster
// counters during the first phase and as the output permutation afterwards.
__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    // Block-wide running total used to hand out cluster base offsets.
    // (Declared inside the kernel rather than at file scope.)
    __shared__ int global_offset;

    // atomicAdd returns the OLD counter value, which is this thread's rank
    // within its cluster; after the barrier, index[c] holds cluster c's size.
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    __syncthreads();
    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    if (threadIdx.x == 0)
        global_offset = 0;
    __syncthreads();
    // One representative per cluster (rank 0) exchanges the cluster size for a
    // base offset; atomicAdd again returns the running total BEFORE the add.
    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    __syncthreads();
    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    // FIX: barrier before reusing index[] as the permutation. Without it a
    // fast thread could overwrite index[k] while another thread is still
    // reading index[cluster_index[...]] above (only lockstep execution of a
    // single 8-thread warp masked this race).
    __syncthreads();
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;
}
// Host driver: sets up the 8-element test case, runs the clustering kernel,
// and cleans up. Returns 0 on success, 1 on any CUDA error.
int main()
{
    // 8 test values and the cluster (0..3) each one belongs to.
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;
    const size_t bytes = 8 * sizeof(int);

    cudaMalloc(&d_values, bytes);
    cudaMalloc(&d_offset, bytes);
    cudaMalloc(&d_cluster_index, bytes);
    cudaMalloc(&d_id_within_cluster, bytes);
    cudaMalloc(&d_index, bytes);
    cudaMalloc(&d_reduce_array_size, bytes);   // was allocated twice before: leak
    cudaMalloc(&d_new_work_item_id, bytes);

    // index[] must start at zero: the kernel uses it as per-cluster counters.
    cudaMemset(d_id_within_cluster, 0, bytes);
    cudaMemset(d_index, 0, bytes);
    cudaMemset(d_reduce_array_size, 0, bytes);
    cudaMemset(d_new_work_item_id, 0, bytes);

    cudaMemcpy(d_values, values, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, bytes, cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);

    // Surface launch-configuration errors, then wait for the kernel so
    // asynchronous execution errors are reported before we tear down.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    cudaFree(d_values);
    cudaFree(d_offset);
    cudaFree(d_cluster_index);
    cudaFree(d_id_within_cluster);
    cudaFree(d_index);
    cudaFree(d_reduce_array_size);
    cudaFree(d_new_work_item_id);
    return err == cudaSuccess ? 0 : 1;
}
I almost finished porting the pseudocode in the paper to CUDA, but it now fails in the reduction part (the part that I hadn’t tried to implement thus far).
The code is the following
#include <stdio.h>
#include <math.h>
// reduction: rank each of the 8 work items within its cluster, convert the
// per-cluster sizes into base offsets, record the compacted ordering in
// index[], then sum each cluster's values tree-style into its first slot.
//
// Launch configuration: <<<1, 8>>> (single block; threadIdx.x = work-item id).
// Precondition: index[] zeroed on the host (it doubles as the per-cluster
// counters and then as the permutation).
__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    // Block-wide running total used to hand out cluster base offsets.
    __shared__ int global_offset;

    // atomicAdd returns the OLD counter value = this thread's in-cluster rank;
    // after the barrier, index[c] holds cluster c's size.
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    __syncthreads();
    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    if (threadIdx.x == 0)
        global_offset = 0;
    __syncthreads();
    // Rank-0 thread of each cluster exchanges its size for a base offset.
    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    __syncthreads();
    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    // Barrier before index[] is reused as the permutation, and again after it
    // is fully written, because the loop below reads slots written by OTHER
    // threads (a single 8-thread warp masked these races).
    __syncthreads();
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;
    __syncthreads();
    // Tree reduction per cluster: each active thread folds its partner from
    // the upper half into its own slot.
    for (int i = 8 / 2; i > 0; i /= 2)
    {
        if (id_within_cluster[threadIdx.x] < ((reduce_array_size[threadIdx.x]) / 2))
        {
            // FIX: ceiling division in integer arithmetic. The original
            // ceilf(size/2) truncated BEFORE the ceil (integer division), so
            // an odd cluster size of 3 halved to 1 instead of 2 and the last
            // element was dropped from the sum.
            reduce_array_size[threadIdx.x] = (reduce_array_size[threadIdx.x] + 1) / 2;
            values[index[new_work_item_id[threadIdx.x]]] += values[index[new_work_item_id[threadIdx.x] + reduce_array_size[threadIdx.x]]];
        }
        __syncthreads();
    }
    printf("values of %d after reduction is %d \n",threadIdx.x,values[threadIdx.x]);
}
// Host driver: sets up the 8-element test case, runs the cluster-reduction
// kernel (which prints its results), and cleans up. Returns 0 on success,
// 1 on any CUDA error.
int main()
{
    // 8 test values and the cluster (0..3) each one belongs to.
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;
    const size_t bytes = 8 * sizeof(int);

    cudaMalloc(&d_values, bytes);
    cudaMalloc(&d_offset, bytes);
    cudaMalloc(&d_cluster_index, bytes);
    cudaMalloc(&d_id_within_cluster, bytes);
    cudaMalloc(&d_index, bytes);
    cudaMalloc(&d_reduce_array_size, bytes);   // was allocated twice before: leak
    cudaMalloc(&d_new_work_item_id, bytes);

    // index[] must start at zero: the kernel uses it as per-cluster counters.
    cudaMemset(d_id_within_cluster, 0, bytes);
    cudaMemset(d_index, 0, bytes);
    cudaMemset(d_reduce_array_size, 0, bytes);
    cudaMemset(d_new_work_item_id, 0, bytes);

    cudaMemcpy(d_values, values, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, bytes, cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);

    // Catch launch errors, then wait for the kernel -- this also flushes the
    // device-side printf output before the program exits.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();

    cudaFree(d_values);
    cudaFree(d_offset);
    cudaFree(d_cluster_index);
    cudaFree(d_id_within_cluster);
    cudaFree(d_index);
    cudaFree(d_reduce_array_size);
    cudaFree(d_new_work_item_id);
    return err == cudaSuccess ? 0 : 1;
}
and the result is
values of 0 after reduction is 5
values of 1 after reduction is 6
values of 2 after reduction is 20
values of 3 after reduction is 4
values of 4 after reduction is 11
values of 5 after reduction is 11
values of 6 after reduction is 3
values of 7 after reduction is 1
while according to the paper it should be:
values of 0 after reduction is 5
values of 1 after reduction is 6
values of 2 after reduction is 21
values of 3 after reduction is 4
values of 4 after reduction is 11
values of 5 after reduction is 11
values of 6 after reduction is 3
values of 7 after reduction is 1
From what I can tell, the final addition between 20 and 1 isn’t happening.
My guess is it might have to do with the ceilf function operating on integers — could that be the case?
Any ideas are welcome.
I will also have a look in the reduction example of the CUDA SDK
I really can’t tell what is going wrong with my code.
I traced the code with pen and paper, and the values of reduce_array_size are the ones they should be, but my code still computes the wrong values.
I really don’t get it.
Can anybody take a look?
I’m attaching my code
Edit: scratch that, I found the error — in the first iteration, reduce_array_size[2] should be 2 and not 1. That means my suspicion that the ceilf function was not working correctly was right,
but when I try to use ceil instead of ceilf I get the following error:
"irregular2.cu(37): error: calling a host function("std::ceil<int> ") from a __device__/__global__ function("reduction") is not allowed
Edit 2: I finally got it to work correctly — I needed to cast reduce_array_size[threadIdx.x] to float in order for ceilf to work correctly.
The attached file now works correctly.
I’m also posting the code for the sake of completeness.
Thank you for your help everyone
Apostolis
#include <stdio.h>
#include <math.h>
// reduction: rank each of the 8 work items within its cluster, convert the
// per-cluster sizes into base offsets, record the compacted ordering in
// index[], then sum each cluster's values tree-style into its first slot.
//
// Launch configuration: <<<1, 8>>> (single block; threadIdx.x = work-item id).
// Precondition: index[] zeroed on the host (it doubles as the per-cluster
// counters and then as the permutation).
__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    // Block-wide running total used to hand out cluster base offsets.
    __shared__ int global_offset;

    // atomicAdd returns the OLD counter value = this thread's in-cluster rank;
    // after the barrier, index[c] holds cluster c's size.
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    __syncthreads();
    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    if (threadIdx.x == 0)
        global_offset = 0;
    __syncthreads();
    // Rank-0 thread of each cluster exchanges its size for a base offset.
    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    __syncthreads();
    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    // Barrier before index[] is reused as the permutation, and again after it
    // is fully written, because the loop below reads slots written by OTHER
    // threads (a single 8-thread warp masked these races).
    __syncthreads();
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;
    __syncthreads();
    // Tree reduction per cluster. FIX: the unused local `count` is removed,
    // and the float round-trip ceilf((float)size/2) is replaced by the exact
    // integer ceiling division (size+1)/2, which is correct for all
    // non-negative sizes (e.g. 3 -> 2, 2 -> 1).
    for (int i = 8 / 2; i > 0; i /= 2)
    {
        if (id_within_cluster[threadIdx.x] < ((reduce_array_size[threadIdx.x]) / 2))
        {
            reduce_array_size[threadIdx.x] = (reduce_array_size[threadIdx.x] + 1) / 2;
            values[index[new_work_item_id[threadIdx.x]]] += values[index[new_work_item_id[threadIdx.x] + reduce_array_size[threadIdx.x]]];
        }
        __syncthreads();
    }
}
// Host driver: sets up the 8-element test case, runs the cluster-reduction
// kernel, copies the reduced values back, prints them, and cleans up.
// Returns 0 on success, 1 on any CUDA error.
int main()
{
    // 8 test values and the cluster (0..3) each one belongs to.
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;
    const size_t bytes = 8 * sizeof(int);

    cudaMalloc(&d_values, bytes);
    cudaMalloc(&d_offset, bytes);
    cudaMalloc(&d_cluster_index, bytes);
    cudaMalloc(&d_id_within_cluster, bytes);
    cudaMalloc(&d_index, bytes);
    cudaMalloc(&d_reduce_array_size, bytes);   // was allocated twice before: leak
    cudaMalloc(&d_new_work_item_id, bytes);

    // index[] must start at zero: the kernel uses it as per-cluster counters.
    // (cudaMemset takes a plain int fill byte -- the old `(float)0` cast was
    // meaningless.)
    cudaMemset(d_id_within_cluster, 0, bytes);
    cudaMemset(d_index, 0, bytes);
    cudaMemset(d_reduce_array_size, 0, bytes);
    cudaMemset(d_new_work_item_id, 0, bytes);

    cudaMemcpy(d_values, values, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, bytes, cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);

    // Catch launch errors, wait for completion, then fetch the result so it
    // can be inspected on the host.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err == cudaSuccess)
        err = cudaMemcpy(values, d_values, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess)
        for (int i = 0; i < 8; i++)
            printf("values of %d after reduction is %d \n", i, values[i]);

    cudaFree(d_values);
    cudaFree(d_offset);
    cudaFree(d_cluster_index);
    cudaFree(d_id_within_cluster);
    cudaFree(d_index);
    cudaFree(d_reduce_array_size);
    cudaFree(d_new_work_item_id);
    return err == cudaSuccess ? 0 : 1;
}
I should point out a warning about this code. It only works because we are working with integers. This code would fail for floating point types and so the ceil function would be appropriate in that case.