Porting OpenCL pseudocode to CUDA

I’m trying to implement the reduction code described here:

So far I’ve done the following, and I’m wondering whether I’m on the right track:

__global__ void reduction(int *values, int *index, int *cluster_index)
{
    int tid = threadIdx.x;
    int c = cluster_index[tid];
    int id_within_cluster = index[c];

    atomicAdd(1, index[c]);

    __syncthreads();

    reduce_array_size = index[c];

    if (tid == 0) global_offset = 0;

    if (id_within_cluster == 0)
    {
        index[c] = atomicAdd(reduce_array_size, global_offset);
    }

    offset = index[c];
    int newTid = offset + id_within_cluster;
    index[newTid] = tid;

    for (int i = thread_count/2; i > 0; i/2)
    {
        if (id_within_cluster < (reduce_array_size/2))
        {
            reduce_array_size = ceil(reduce_array_size/2);
            values[index[newTid]] += values[index[newTid + reduce_array_size]];
        }
        __syncthreads();
    }
}

Thank you very much for your time

Apostolis

No views in half a day? Is nobody interested?

I’m still trying to get it right, but I’m not managing it.

The version I’m attaching compiles just fine, but it doesn’t seem to do anything (the values in the arrays are all zeroes).

I’m confused.

Thanks for your help

Apostolis

#include <stdio.h>

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    printf("global_offset is %d \n", global_offset);
    printf("tid is %d \n", threadIdx.x);

    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    printf("id_within_cluster of %d is %d \n", threadIdx.x, id_within_cluster[threadIdx.x]);

    __syncthreads();

    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    printf("reduce_array_size of %d is %d \n", threadIdx.x, reduce_array_size[threadIdx.x]);

    if (threadIdx.x == 0) global_offset = 0;

    if (id_within_cluster == 0)
    {
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    }

    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;

    /*
    for (int i = 8/2; i > 0; i/2)
    {
        if (id_within_cluster < (reduce_array_size/2))
        {
            reduce_array_size = ceil(reduce_array_size/2);
            values[index[newTid]] += values[index[newTid + reduce_array_size]];
        }
    }
    */

    printf("global_offset is %d \n", global_offset);

    __syncthreads();
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int offset[8];
    int index[8];
    int new_work_item_id[8];

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_global_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));

    cudaMemset(&d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(&d_index, 0, 8*sizeof(int));
    cudaMemset(&d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(&d_new_work_item_id, 0, 8*sizeof(int));
    cudaMemset(&d_reduce_array_size, 0, 8*sizeof(int));

    cudaMemcpy(&d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(&d_offset, offset, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(&d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(&d_global_offset, &global_offset, sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);
}

First, I would suggest you take a look at the CUDA SDK, because there is a project in there called “reduction” that does exactly what you want, but very efficiently for CUDA. You could learn a few things from it.

Secondly, to answer your question: the reason your code is only giving you zeroes is that you never actually set any of the values in the arrays on the device. For example, you define d_values and cudaMalloc it correctly, but your cudaMemcpy is wrong, so d_values is left with arbitrary values (which happen to be zero). cudaMemcpy requires that you pass in a pointer to the first element of your array (for both the destination AND the source), but you typed &d_values, which is a pointer to the pointer.

As we should all remember from learning C/C++, an array decays to a pointer to its first element (as a function parameter, float *a and float a[] are equivalent; in both cases a points to the first element of the array). All of your memory-setting and copying commands are incorrect. I think you got confused by cudaMalloc (which you did actually use correctly). Where cudaMemset and cudaMemcpy read and write the values within the arrays, cudaMalloc finds a spot in device memory that can be allocated and then stores that address in the pointer. So one command modifies the array, while the other modifies the pointer to the array. That is why you need to pass the array pointer by reference to cudaMalloc, but only by value to cudaMemset and cudaMemcpy.

So, change all of your cudaMemset and cudaMemcpy commands from this:

cudaMemset(&d_id_within_cluster, 0, 8*sizeof(int));

to this:

cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
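
To make the distinction concrete, here is a minimal sketch (the names are made up for illustration):

int h_data[8] = {1, 2, 3, 4, 5, 6, 7, 8};   // host array
int *d_data;                                 // device pointer, not yet pointing anywhere

cudaMalloc(&d_data, 8*sizeof(int));          // modifies the POINTER itself, so it needs &d_data
cudaMemset(d_data, 0, 8*sizeof(int));        // modifies the ARRAY d_data points to, so it takes d_data
cudaMemcpy(d_data, h_data, 8*sizeof(int),
           cudaMemcpyHostToDevice);          // likewise: destination and source are the arrays themselves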

I’m not having a problem with the reduction part; I commented it out in my code so I could check the first part of the algorithm.

I also changed my code according to your suggestion, but nothing changes.

I’m attaching the corrected code.

Thank you for your help anyway.

Apostolis

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    printf("global_offset is %d \n", global_offset);
    printf("tid is %d \n", threadIdx.x);
    printf("index of  of %d is %d \n", threadIdx.x, index[cluster_index[threadIdx.x]]);

    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    printf("id_within_cluster of %d is %d \n", threadIdx.x, id_within_cluster[threadIdx.x]);

    __syncthreads();

    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    printf("reduce_array_size of %d is %d \n", threadIdx.x, reduce_array_size[threadIdx.x]);

    if (threadIdx.x == 0) global_offset = 0;

    if (id_within_cluster == 0)
    {
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    }

    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;

    /*
    for (int i = 8/2; i > 0; i/2)
    {
        if (id_within_cluster < (reduce_array_size/2))
        {
            reduce_array_size = ceil(reduce_array_size/2);
            values[index[newTid]] += values[index[newTid + reduce_array_size]];
        }
    }
    */

    printf("global_offset is %d \n", global_offset);

    __syncthreads();
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int offset[8];
    int index[8];
    int new_work_item_id[8];

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));

    cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(d_index, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(d_new_work_item_id, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));

    cudaMemcpy(d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_offset, offset, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);
}

When I execute the following code:

#include <stdio.h>

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    printf("global_offset is %d \n", global_offset);
    printf("values of %d is %d \n", threadIdx.x, values[threadIdx.x]);
    printf("cluster_index of %d is %d \n", threadIdx.x, cluster_index[threadIdx.x]);
    printf("index of  of %d is %d \n", threadIdx.x, index[cluster_index[threadIdx.x]]);
    printf("reduce_array_size of %d is %d \n", threadIdx.x, reduce_array_size[threadIdx.x]);
    printf("id within cluster of %d is %d \n", threadIdx.x, id_within_cluster[threadIdx.x]);

    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);
    printf("id_within_cluster of %d is %d \n", threadIdx.x, id_within_cluster[threadIdx.x]);
    printf("index of  of %d is %d \n", threadIdx.x, index[cluster_index[threadIdx.x]]);

    //__syncthreads();

    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];
    printf("reduce_array_size of %d is %d \n", threadIdx.x, reduce_array_size[threadIdx.x]);

    if (threadIdx.x == 0) global_offset = 0;

    if (id_within_cluster == 0)
    {
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);
    }

    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;

    /*
    for (int i = 8/2; i > 0; i/2)
    {
        if (id_within_cluster < (reduce_array_size/2))
        {
            reduce_array_size = ceil(reduce_array_size/2);
            values[index[newTid]] += values[index[newTid + reduce_array_size]];
        }
    }
    */

    printf("global_offset is %d \n", global_offset);

    __syncthreads();
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};
    int offset[8];
    int index[8];
    int new_work_item_id[8];

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));

    cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(d_index, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(d_new_work_item_id, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));

    cudaMemcpy(d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_offset, offset, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);

    /*
    for (int i = 0; i < 8; i++)
    {
        printf("values of %d is %d \n", i, values[i]);
        printf("offset of %d is %d \n", i, offset[i]);
        printf("cluster_index of %d is %d \n", i, cluster_index[i]);
        printf("index of %d is %d \n", i, index[i]);
    }
    */
}

I get this output:

global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
global_offset is 0
values of 0 is 5
values of 1 is 2
values of 2 is 9
values of 3 is 4
values of 4 is 8
values of 5 is 11
values of 6 is 3
values of 7 is 1
cluster_index of 0 is 0
cluster_index of 1 is 2
cluster_index of 2 is 3
cluster_index of 3 is 2
cluster_index of 4 is 1
cluster_index of 5 is 3
cluster_index of 6 is 1
cluster_index of 7 is 3
index of  of 0 is 0
index of  of 1 is 0
index of  of 2 is 0
index of  of 3 is 0
index of  of 4 is 0
index of  of 5 is 0
index of  of 6 is 0
index of  of 7 is 0
reduce_array_size of 0 is 0
reduce_array_size of 1 is 0
reduce_array_size of 2 is 0
reduce_array_size of 3 is 0
reduce_array_size of 4 is 0
reduce_array_size of 5 is 0
reduce_array_size of 6 is 0
reduce_array_size of 7 is 0
id within cluster of 0 is 0
id within cluster of 1 is 0
id within cluster of 2 is 0
id within cluster of 3 is 0
id within cluster of 4 is 0
id within cluster of 5 is 0
id within cluster of 6 is 0
id within cluster of 7 is 0

For some reason the execution seems to halt after the first few commands of the reduction function.

As you can see, the memcopies are done correctly.

I really can’t tell why this is happening. Can anybody please help me?
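
For what it’s worth, I launch the kernel without any error checking or synchronization, so maybe part of the problem is hidden from me. A minimal sketch of what I could add after the launch (just standard CUDA error checking):

reduction<<<1, 8>>>(/* ...same arguments as above... */);

cudaError_t err = cudaGetLastError();       // reports launch failures
if (err != cudaSuccess) printf("launch error: %s \n", cudaGetErrorString(err));

err = cudaDeviceSynchronize();              // waits for the kernel; also flushes device-side printf
if (err != cudaSuccess) printf("kernel error: %s \n", cudaGetErrorString(err));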

Thanks a lot

Apostolis

I don’t know how you weren’t having a problem with your reduction part, since you weren’t passing anything but zeroes into the kernel, but alright… I am not about to figure out what your code is supposed to do and program it in CUDA for you, but I will point out a few issues that jump out at me. Hopefully, addressing these will fix your problem.

1. You pass index instead of d_index into the kernel. Since index is only allocated in system memory, which the GPU cannot reach, I have no idea how the program will respond to something like that. I am actually surprised it doesn’t crash for you.

2. You copy the values from offset to d_offset, but you never actually set any values in offset. Is this on purpose?

3. You allocate new_work_item_id on the host but then never use it for anything. In my opinion, new_work_item_id, offset, and index have no reason to be defined and allocated on the host (at least with how your code is written now).

4. I don’t think this matters with so few threads, but there should be a __syncthreads() after initializing global_offset to zero, because some threads might race ahead and execute the following code before it is set to zero.

5. I have never used atomic operations before, so I may be mistaken about this. But from what I just read when looking it up, the value returned from atomicAdd (or any other atomic operation) is not the result of the addition but rather the old value that was at the address being atomically modified (see the sketch after this list). I don’t know whether you are aware of this and counting on it; I figured I would just point it out.

6. You have an if statement that compares id_within_cluster, but id_within_cluster is an array, not a single value. I think you want id_within_cluster[threadIdx.x] instead.

7. You may want another __syncthreads() after that last if statement as well, because some threads may race ahead and read the value being updated inside the conditional. With so few threads in this code that will probably not happen, but it can when more threads are added. Or perhaps something about the logic of the code precludes that from being an issue; I don’t know. I am just pointing out whatever I notice.
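
To illustrate point 5, here is a minimal sketch of what I understand the semantics to be (not from your code):

__global__ void atomic_demo(int *counter, int *old_vals)
{
    // atomicAdd returns the value that was at the address BEFORE this thread's addition.
    old_vals[threadIdx.x] = atomicAdd(counter, 1);
    // If *counter started at 0, old_vals ends up holding some permutation of 0..blockDim.x-1,
    // and *counter ends up equal to blockDim.x.
}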

Here is the same code where I have made the changes mentioned above:

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);

    __syncthreads();

    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];

    if (threadIdx.x == 0)
        global_offset = 0;

    __syncthreads();

    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);

    __syncthreads();

    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));

    cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(d_index, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(d_new_work_item_id, 0, 8*sizeof(int));

    cudaMemcpy(d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);
}
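
Also note that nothing is ever copied back to the host, so there is no way to inspect the result in main. If you want to check it, something along these lines after the kernel launch should do it (a sketch; it assumes #include <stdio.h>):

int result[8];
cudaMemcpy(result, d_values, 8*sizeof(int), cudaMemcpyDeviceToHost);  // implicitly waits for the kernel
for (int i = 0; i < 8; i++)
    printf("values of %d is %d\n", i, result[i]);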

Your suggestions were correct.

I almost finished porting the pseudocode in the paper to CUDA, but now it fails in the reduction part (the part that I hadn’t tried to implement thus far).

The code is the following:

#include <stdio.h>
#include <math.h>

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);

    __syncthreads();

    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];

    if (threadIdx.x == 0)
        global_offset = 0;

    __syncthreads();

    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);

    __syncthreads();

    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;

    for (int i = 8/2; i > 0; i /= 2)
    {
        if (id_within_cluster[threadIdx.x] < (reduce_array_size[threadIdx.x] / 2))
        {
            reduce_array_size[threadIdx.x] = ceilf(reduce_array_size[threadIdx.x] / 2);
            values[index[new_work_item_id[threadIdx.x]]] += values[index[new_work_item_id[threadIdx.x] + reduce_array_size[threadIdx.x]]];
        }
        __syncthreads();
    }

    printf("values of %d after reduction is %d \n", threadIdx.x, values[threadIdx.x]);
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));

    cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(d_index, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(d_new_work_item_id, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));

    cudaMemcpy(d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);
}

and the result is:

values of 0 after reduction is 5
values of 1 after reduction is 6
values of 2 after reduction is 20
values of 3 after reduction is 4
values of 4 after reduction is 11
values of 5 after reduction is 11
values of 6 after reduction is 3
values of 7 after reduction is 1

while according to the paper it should be:

values of 0 after reduction is 5
values of 1 after reduction is 6
values of 2 after reduction is 21
values of 3 after reduction is 4
values of 4 after reduction is 11
values of 5 after reduction is 11
values of 6 after reduction is 3
values of 7 after reduction is 1

From what I can tell, the final addition between 20 and 1 isn’t happening.

My guess is that it might have to do with the ceilf function operating on integers; could that be the case?
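
If that is the case, the problem would be that reduce_array_size[threadIdx.x]/2 is integer division, so the value is truncated before ceilf ever sees it. A tiny host-side sketch of that assumption:

#include <stdio.h>
#include <math.h>

int main()
{
    int n = 3;
    printf("%f \n", ceilf(n / 2));          // prints 1.000000: 3/2 truncates to 1 before ceilf runs
    printf("%f \n", ceilf((float)n / 2));   // prints 2.000000: cast first, then round up
    return 0;
}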

Any ideas are welcome.

I will also have a look at the reduction example in the CUDA SDK.

Thanks for your help

Apostolis

I really can’t tell what is going wrong with my code.

I ran the code with pen and paper, and the values of reduce_array_size are the ones they should be, but my code still computes the wrong values.

I really don’t get it. Can anybody take a look? I’m attaching my code.

Edit: scratch that, I found the error. In the first iteration, reduce_array_size[2] should be 2 and not 1. That means my suspicion that the ceilf function wasn’t working correctly was right, but when I try to use ceil instead of ceilf I get the following error:

irregular2.cu(37): error: calling a host function("std::ceil<int> ") from a __device__/__global__ function("reduction") is not allowed

Edit 2: I finally got it to work correctly; I needed to cast reduce_array_size[threadIdx.x] to float in order for it to work correctly.

The attached file now works correctly.

I’m also posting the code for the sake of completeness.

Thank you for your help, everyone

Apostolis

#include <stdio.h>
#include <math.h>

__shared__ int global_offset;

__global__ void reduction(int *values, int *index, int *cluster_index, int *offset, int *id_within_cluster, int *new_work_item_id, int *reduce_array_size)
{
    // Rank of this thread within its cluster; afterwards index[c] holds the size of cluster c.
    id_within_cluster[threadIdx.x] = atomicAdd(&index[cluster_index[threadIdx.x]], 1);

    __syncthreads();

    // Every thread reads the size of its own cluster.
    reduce_array_size[threadIdx.x] = index[cluster_index[threadIdx.x]];

    if (threadIdx.x == 0)
        global_offset = 0;

    __syncthreads();

    // The first thread of each cluster claims the cluster's starting offset in the reordered index array.
    if (id_within_cluster[threadIdx.x] == 0)
        index[cluster_index[threadIdx.x]] = atomicAdd(&global_offset, reduce_array_size[threadIdx.x]);

    __syncthreads();

    // Each thread's new position after grouping the work items by cluster.
    offset[threadIdx.x] = index[cluster_index[threadIdx.x]];
    new_work_item_id[threadIdx.x] = offset[threadIdx.x] + id_within_cluster[threadIdx.x];
    index[new_work_item_id[threadIdx.x]] = threadIdx.x;

    // Tree reduction within each cluster; the cast to float makes ceilf round up as intended.
    for (int i = 8/2; i > 0; i /= 2)
    {
        if (id_within_cluster[threadIdx.x] < (reduce_array_size[threadIdx.x] / 2))
        {
            reduce_array_size[threadIdx.x] = ceilf((float)reduce_array_size[threadIdx.x] / 2);
            values[index[new_work_item_id[threadIdx.x]]] += values[index[new_work_item_id[threadIdx.x] + reduce_array_size[threadIdx.x]]];
        }
        __syncthreads();
    }
}

int main()
{
    int values[8] = {5, 2, 9, 4, 8, 11, 3, 1};
    int cluster_index[8] = {0, 2, 3, 2, 1, 3, 1, 3};

    int *d_values;
    int *d_cluster_index;
    int *d_offset;
    int *d_id_within_cluster;
    int *d_index;
    int *d_reduce_array_size;
    int *d_new_work_item_id;

    cudaMalloc(&d_values, 8*sizeof(int));
    cudaMalloc(&d_offset, 8*sizeof(int));
    cudaMalloc(&d_cluster_index, 8*sizeof(int));
    cudaMalloc(&d_id_within_cluster, 8*sizeof(int));
    cudaMalloc(&d_index, 8*sizeof(int));
    cudaMalloc(&d_reduce_array_size, 8*sizeof(int));
    cudaMalloc(&d_new_work_item_id, 8*sizeof(int));

    cudaMemset(d_id_within_cluster, 0, 8*sizeof(int));
    cudaMemset(d_index, 0, 8*sizeof(int));
    cudaMemset(d_reduce_array_size, 0, 8*sizeof(int));
    cudaMemset(d_new_work_item_id, 0, 8*sizeof(int));

    cudaMemcpy(d_values, values, 8*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_cluster_index, cluster_index, 8*sizeof(int), cudaMemcpyHostToDevice);

    reduction<<<1, 8>>>(d_values, d_index, d_cluster_index, d_offset, d_id_within_cluster, d_new_work_item_id, d_reduce_array_size);
}

irregular2.cu (3.53 KB)

You would need to replace that with

reduce_array_size=ceil(reduce_array_size/2.0);

to make it work as intended. But why detour via floating point at all? Just make it

reduce_array_size=(reduce_array_size+1)/2;

I am very happy to hear you got it working. Good luck with everything.

Better yet, why bother with division at all? Just make it

reduce_array_size[threadIdx.x] = (reduce_array_size[threadIdx.x]+1) >> 1;

I should point out a warning about this code: it only works because we are working with integers. It would fail for floating-point types, in which case the ceil function would be appropriate.
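
A quick host-side check that the three variants agree for the integer sizes in question (a sketch):

#include <stdio.h>
#include <math.h>

int main()
{
    for (int n = 1; n <= 8; n++) {
        int a = (int)ceil(n / 2.0);   // detour through floating point
        int b = (n + 1) / 2;          // pure integer arithmetic
        int c = (n + 1) >> 1;         // shift instead of division
        printf("n=%d: %d %d %d \n", n, a, b, c);
    }
    return 0;
}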

Cool trick, I hadn’t thought of that.
reduce_array_size is an integer according to the algorithm, so no problem there.

Apostolis