Hello everyone,
I have a question about parallel reduction.
reduce0~reduce2 work correctly, but reduce3 and reduce4 do not.
As far as I can tell, the only difference is how the global index i is computed:
reduce0~2: unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
reduce3~4: unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
I do not know exactly what the problem is.
I would appreciate any help. Thank you.
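For reference, this is a minimal sketch of how I understand the two loading schemes differ (the helper names loadOne and loadTwo are just mine for illustration, not from the SDK sample). With the reduce3~4 indexing, each thread reads two elements, i and i + blockDim.x, so one block covers 2 * blockDim.x inputs:

__global__ void loadOne(const int *g_idata, unsigned int n)
{
    extern __shared__ int sdata[];
    // reduce0~2 style: one element per thread, guarded against i >= n
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = (i < n) ? g_idata[i] : 0;
}

__global__ void loadTwo(const int *g_idata, unsigned int n)
{
    extern __shared__ int sdata[];
    // reduce3~4 style: two elements per thread, so the block covers
    // 2 * blockDim.x inputs, and i + blockDim.x can reach past n
    // when the grid is not sized for that (there is no bounds check here)
    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i] + g_idata[i + blockDim.x];
}

My guess is that the two-element load needs a different grid size or a bounds check, but I am not sure. My full code is below.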
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>
#include <cooperative_groups.h>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        // modulo arithmetic is slow!
        if ((tid % (2 * s)) == 0)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0)
    {
        g_odata[blockIdx.x] = sdata[0];
    }
}
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * tid;
        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // load shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x * 2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__device__ void warpReduce(volatile int* sdata, int tid)
{
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n)
{
    extern __shared__ int sdata[];
    // perform first level of reduction while loading from global memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x* (blockDim.x *2) + threadIdx.x;
    sdata[tid] = g_idata[i] + g_idata[i + blockDim.x];
    __syncthreads();
    // do reduction in shared mem down to the last warp
    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    // unrolled warp-level reduction, then write this block's result
    if (tid < 32) warpReduce(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
int main()
{
    const int arraySize = 2000;
    int a[arraySize];
    int* g_idata_d;
    int* g_odata_d;
    int g_odata[1] = { 0 };
    int i = 0;
    for (i = 0; i < arraySize; i++)
    {
        a[i] = i + i + i * i;
    }
    // allocate device memory
    cudaMalloc((void**)&g_idata_d, sizeof(int)*arraySize);
    cudaMalloc((void**)&g_odata_d, sizeof(int) * 1);
    cudaMemcpy(g_idata_d, a, sizeof(int)*arraySize, cudaMemcpyHostToDevice);
    reduce0<<<(arraySize + 1) / 1024, 1024, arraySize * sizeof(int)>>>(g_idata_d, g_odata_d, arraySize);
    cudaMemcpy(g_odata, g_odata_d, sizeof(int), cudaMemcpyDeviceToHost);
    printf(" sum = %d\n", g_odata[0]);
    // free device memory
    cudaFree(g_idata_d);
    cudaFree(g_odata_d);
    getchar();
    return 0;
}
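In case it helps, this is roughly how I am guessing reduce3 and reduce4 are meant to be launched, since each block appears to consume 2 * blockDim.x elements (the names threads and blocks are just mine, and I am not sure this is right):

    const int threads = 1024;
    // each reduce3/reduce4 block reads 2 * threads inputs (my assumption)
    const int blocks = (arraySize + threads * 2 - 1) / (threads * 2);
    reduce3<<<blocks, threads, threads * sizeof(int)>>>(g_idata_d, g_odata_d, arraySize);

Even with that grid, arraySize = 2000 is not a multiple of 2 * 1024, so the last block would still read past the end of g_idata_d at i + blockDim.x. Is that why reduce3~4 fail, and do they need a bounds check on the load?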