Hello to all,
I am trying to develop a reduction kernel ad hoc for my applications, copying the function reduce6 from the CUDA SDK reduction example (reduction_kernel.cu).
It works only if the number of threads per block is less or equal than 8.
In emulation mode it works also with more than 8 threads per block.
Could me explain why?
Thank you in advance
Francesco
[codebox]/*
Parallel reduction kernels
*/
#ifndef REDUCE_KERNEL_H
#define REDUCE_KERNEL_H
#include <stdio.h>
#include “sharedmem.cuh”
#ifdef DEVICE_EMULATION
#define EMUSYNC __syncthreads()
#define EMUSYNC
template <class T, unsigned int blockSize>
global void
reduce(T *g_idata, T *g_odata, unsigned int n)
{
SharedMemory<T> smem;
T *sdata = smem.getPointer();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridSize). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
#ifndef DEVICE_EMULATION
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; EMUSYNC; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; EMUSYNC; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; EMUSYNC; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; EMUSYNC; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; EMUSYNC; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; EMUSYNC; }
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
#endif // #ifndef REDUCE_KERNEL_H[/codebox]
[codebox]#include “reduction_kernel.cu”
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
#define TYPE float
const unsigned int N=1024;
const unsigned DATA_SIZE=N*sizeof(int);
int main(int argc, char** argv)
{
int* h_vIn;
int* h_vOut;
int* d_vIn;
int* d_vOut;
int cpures;
CUT_DEVICE_INIT(argc, argv);
h_vIn = (int*) malloc(DATA_SIZE);
h_vOut = (int*) malloc(DATA_SIZE);
for ( int i=0; i!=N; i++ )
{
h_vIn[i]=i+1;
h_vOut[i]=0;
}
CUDA_SAFE_CALL(cudaMalloc((void**)&d_vIn, DATA_SIZE));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_vOut, DATA_SIZE));
CUDA_SAFE_CALL(cudaMemcpy(d_vIn, h_vIn, DATA_SIZE, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_vOut, h_vOut, DATA_SIZE, cudaMemcpyHostToDevice));
const unsigned int threadsPerBlock=8;
dim3 dimBlock(threadsPerBlock,1);
dim3 dimGrid(N/2/threadsPerBlock,1);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
reduce<int, threadsPerBlock><<<N/2/threadsPerBlock,threadsPerBlock>>>(d_vIn, d_vOut, N);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUT_CHECK_ERROR(“kernel execution failed.\n”);
CUDA_SAFE_CALL( cudaMemcpy(h_vOut, d_vOut, DATA_SIZE, cudaMemcpyDeviceToHost));
//CUDA_SAFE_CALL( cudaThreadSynchronize() );
cpures=0;
for (int i=0; i<N; i++)
cpures+=h_vIn[i];
printf(“computed cpures: %d\n”);
int gpures=0;
for (int i=0; i!=dimGrid.x; i++)
{
//printf("vOut[%d]: %d\n", i, h_vOut[i]);
gpures+=h_vOut[i];
}
printf(“gpures: %d\n”, gpures);
}[/codebox]