Hi all,
I can’t seem to get NPP to compute a reduction sum with nppsSum_32f. Can you help? Below is a minimal example that reproduces the problem.
/*
* reductiontest.cu
*
* Created on: 2 Aug 2011
* Author: tim
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <nppcore.h>
/*
 * Aborts the program when a CUDA runtime call did not succeed.
 *
 * err  - status code returned by the CUDA call
 * file - name of the source file containing the call
 * line - line number of the call
 *
 * For any status other than cudaSuccess this prints the CUDA error string
 * together with the call site and exits with EXIT_FAILURE; for cudaSuccess
 * it returns without doing anything.
 */
static void HandleError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess) {
        return;
    }

    const char *message = cudaGetErrorString(err);
    printf("%s in %s at line %d\n", message, file, line);
    exit(EXIT_FAILURE);
}

/* Wraps a CUDA runtime call so a failure is reported with its call site. */
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))
// Number of threads per block used when launching init_array.
const int THREAD_DIM = 2;
// Number of blocks used when launching init_array.
const int BLOCK_DIM = 2;
// Total number of threads launched (threads per block * blocks). Despite the
// name, main uses this as the element count of the data array.
const int GRID_DIM = THREAD_DIM * BLOCK_DIM;
/*
 * Sets one element of data_d to 1.0f per thread.
 *
 * data_d - device array to initialise
 *
 * Each thread writes the element at its flat global index
 * (blockIdx.x * blockDim.x + threadIdx.x) of a 1D launch. There is no bounds
 * check, so the launch must not supply more threads than data_d has
 * elements; elements beyond the total thread count are left untouched.
 */
__global__ void init_array(Npp32f *data_d)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    // Float literal: a bare 1.0 is a double constant in device code.
    data_d[tid] = 1.0f;
}
/*
 * Reports an NPP status code that is not NPP_SUCCESS.
 *
 * status - value returned by an NPP call
 * file   - name of the source file containing the call
 * line   - line number of the call
 *
 * Prints the numeric status with the call site. Negative codes are treated
 * as errors and terminate the program; positive codes are treated as
 * warnings and execution continues (NPP's documented sign convention).
 */
static void HandleNppStatus(NppStatus status, const char *file, int line)
{
    if (status == NPP_SUCCESS) {
        return;
    }
    printf("NPP status %d in %s at line %d\n", (int) status, file, line);
    if (status < 0) {
        exit(EXIT_FAILURE);
    }
}
#define HANDLE_NPP( status ) (HandleNppStatus( status, __FILE__, __LINE__ ))

/*
 * Test driver: fills a GRID_DIM-element device array with ones, prints it,
 * sums it with nppsSum_32f and prints the sum.
 *
 * Every CUDA and NPP call is checked so that a failing step is reported at
 * its call site instead of silently producing a wrong result.
 *
 * Returns EXIT_SUCCESS when all steps succeed; exits with EXIT_FAILURE on
 * the first CUDA or NPP error.
 *
 * NOTE(review): this uses the NPP signatures the file was written against
 * (nppsReductionGetBufferSize_32f, and nppsSum_32f taking an algorithm
 * hint). Later NPP releases appear to rename the buffer-size query and drop
 * the hint argument - confirm against the installed toolkit's npps header.
 */
int main()
{
    printf("Start test\n");

    Npp32f *data_h = NULL;   // pinned host copy of the input array
    Npp32f *data_d = NULL;   // device input array
    Npp32f *res_h = NULL;    // pinned host copy of the sum
    Npp32f *res_d = NULL;    // device location receiving the sum
    int bufferSize_h = 0;    // scratch size in bytes reported by NPP
    Npp8u *buffer_d = NULL;  // device scratch buffer for the reduction

    HANDLE_ERROR( cudaHostAlloc((void**) &data_h, sizeof(Npp32f)*GRID_DIM,
                                cudaHostAllocDefault) );
    HANDLE_ERROR( cudaMalloc((void**) &data_d, sizeof(Npp32f)*GRID_DIM) );
    HANDLE_ERROR( cudaHostAlloc((void**) &res_h, sizeof(Npp32f),
                                cudaHostAllocDefault) );
    HANDLE_ERROR( cudaMalloc((void**) &res_d, sizeof(Npp32f)) );

    // Size the scratch buffer for the length that is actually summed below.
    HANDLE_NPP( nppsReductionGetBufferSize_32f(GRID_DIM, &bufferSize_h) );
    buffer_d = nppsMalloc_8u( bufferSize_h );
    if (bufferSize_h > 0 && buffer_d == NULL) {
        printf("nppsMalloc_8u failed for %d bytes in %s at line %d\n",
               bufferSize_h, __FILE__, __LINE__);
        exit( EXIT_FAILURE );
    }

    // Call init kernel: BLOCK_DIM blocks of THREAD_DIM threads, one per element.
    init_array<<<BLOCK_DIM,THREAD_DIM>>>(data_d);
    // A launch does not return a status; fetch launch errors explicitly.
    HANDLE_ERROR( cudaGetLastError() );

    // Copy the initialised array to the host and print it.
    HANDLE_ERROR( cudaMemcpy(data_h, data_d,
                             sizeof(Npp32f)*GRID_DIM, cudaMemcpyDeviceToHost) );
    for (int i = 0; i < GRID_DIM; i++) {
        printf("%f\n", data_h[i]);
    }

    // Sum all values together; the result is written to device memory.
    HANDLE_NPP( nppsSum_32f(data_d, GRID_DIM, res_d, nppAlgHintNone, buffer_d) );

    // Copy the sum to the host and print it.
    HANDLE_ERROR( cudaMemcpy(res_h, res_d,
                             sizeof(Npp32f), cudaMemcpyDeviceToHost) );
    printf("%.7f\n", (float) *res_h);

    // Release every allocation made above.
    nppsFree( buffer_d );
    HANDLE_ERROR( cudaFree(res_d) );
    HANDLE_ERROR( cudaFree(data_d) );
    HANDLE_ERROR( cudaFreeHost(res_h) );
    HANDLE_ERROR( cudaFreeHost(data_h) );

    return EXIT_SUCCESS;
}
Thanks in advance for any help.
Kisty