Hello everyone,
I have a question about parallel reduction.
reduce0~reduce2 work correctly, but reduce3 and reduce4 do not.
As far as I can tell, the only difference is how the global index i is computed:
reduce0~2: unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
reduce3~4: unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
I do not know exactly what the problem is.
I would appreciate any help. Thank you.
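For reference, this is a minimal sketch of how I understand the two loading schemes differ (the helper names loadOne and loadTwo are just mine for illustration, not from the SDK sample). With the reduce3~4 indexing, each thread reads two elements, i and i + blockDim.x, so one block covers 2 * blockDim.x inputs:

__global__ void loadOne(const int *g_idata, unsigned int n)
{
    extern __shared__ int sdata[];
    // reduce0~2 style: one element per thread, guarded against i >= n
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = (i < n) ? g_idata[i] : 0;
}

__global__ void loadTwo(const int *g_idata, unsigned int n)
{
    extern __shared__ int sdata[];
    // reduce3~4 style: two elements per thread, so the block covers
    // 2 * blockDim.x inputs, and i + blockDim.x can reach past n
    // when the grid is not sized for that (there is no bounds check here)
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i] + g_idata[i + blockDim.x];
}

My guess is that the two-element load needs a different grid size or a bounds check, but I am not sure. My full code is below.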
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        // modulo arithmetic is slow!
        if ((tid % (2 * s)) == 0)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0)
    {
        g_odata[blockIdx.x] = sdata[0];
    }
}
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * tid;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__device__ void warpReduce(volatile int* sdata, int tid)
{
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // perform first level of reduction while loading from global memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x* (blockDim.x *2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem down to the last warp
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    // unrolled warp-level reduction, then write this block's result
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
    const int arraySize = 2000;
    int a[arraySize];
    int* g_idata_d;
    int* g_odata_d;
    int g_odata[1] = { 0 };
    int i = 0;
    for (i = 0; i < arraySize; i++)
    {
        a[i] = i + i + i * i;
    }
    // allocate device memory
    cudaMalloc((void**)&g_idata_d, sizeof(int)*arraySize);
    cudaMalloc((void**)&g_odata_d, sizeof(int) * 1);
    cudaMemcpy(g_idata_d, a, sizeof(int)*arraySize, cudaMemcpyHostToDevice);
    reduce0<<<(arraySize + 1) / 1024, 1024, arraySize * sizeof(int)>>>(g_idata_d, g_odata_d, arraySize);
    cudaMemcpy(g_odata, g_odata_d, sizeof(int), cudaMemcpyDeviceToHost);
    printf(" sum = %d\n", g_odata[0]);
    // free device memory
    cudaFree(g_idata_d);
    cudaFree(g_odata_d);
    getchar();
    return 0;
}
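In case it helps, this is roughly how I am guessing reduce3 and reduce4 are meant to be launched, since each block appears to consume 2 * blockDim.x elements (the names threads and blocks are just mine, and I am not sure this is right):

    const int threads = 1024;
    // each reduce3/reduce4 block reads 2 * threads inputs (my assumption)
    const int blocks = (arraySize + threads * 2 - 1) / (threads * 2);
    reduce3<<<blocks, threads, threads * sizeof(int)>>>(g_idata_d, g_odata_d, arraySize);

Even with that grid, arraySize = 2000 is not a multiple of 2 * 1024, so the last block would still read past the end of g_idata_d at i + blockDim.x. Is that why reduce3~4 fail, and do they need a bounds check on the load?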