Hey guyz,
I’m currently working on parallel reduction and I tried to test the kernels that are
presented on the NVIDIA Paper, by Mark Harris, “Optimizing Parallel Reduction in CUDA”.
Here is the code:
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdio.h>
#define _ARRAY_SIZE 4
global void reduction1(int *g_idata, int *g_odata)
{
extern shared int sdata;
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s = 1; s < blockDim.x; s *= 2) {
if(tid % (2*s) == 0) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
}
global void reduction2(int *g_idata, int *g_odata)
{
extern shared int sdata;
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for(unsigned int s = 1; s < blockDim.x; s *= 2) {
int index = 2*s*tid;
if(index < blockDim.x) {
sdata[index] += sdata[index + s];
}
__syncthreads();
}
if(tid==0)
g_odata[blockIdx.x] = sdata[0];
}
int main(int argc, char** argv)
{
const int arraySize = _ARRAY_SIZE;
int a[arraySize];
int temp = 0;
int* d_idata=0, *d_odata=0, *h_odata=0;
printf("Array created: ");
for(unsigned int i = 0; i < _ARRAY_SIZE; i++) {
a[i] = i;
temp += i;
printf("%d ", a[i]);
}
printf("\nWith sum: %d.\n", temp);
cudaMalloc((void**)&d_odata, _ARRAY_SIZE*sizeof(int));
if(d_odata == 0) {
printf("Couldn't allocate d_odata array.\n");
scanf("%d", &temp);
exit(1);
}
cudaMalloc((void**)&d_idata, _ARRAY_SIZE*sizeof(int));
if(d_idata == 0) {
printf("Couldn't allocate d_idata array.\n");
scanf("%d", &temp);
exit(1);
}
h_odata = (int*) malloc(_ARRAY_SIZE*sizeof(int));
if(h_odata == 0) {
printf("Couldn't allocate h_odata array.\n");
scanf("%d", &temp);
exit(1);
}
memset(h_odata, 0, _ARRAY_SIZE*sizeof(int));
cudaMemcpy(d_idata, a, _ARRAY_SIZE*sizeof(int), cudaMemcpyHostToDevice);
//reduction1<<<1, _ARRAY_SIZE>>>(d_idata, d_odata);
reduction2<<<1, _ARRAY_SIZE>>>(d_idata, d_odata);
cudaMemcpy(h_odata, d_odata, _ARRAY_SIZE*sizeof(int), cudaMemcpyDeviceToHost);
printf("Output array: ");
for(int i = 0; i < _ARRAY_SIZE; i++) {
printf("%d ", h_odata[i]);
}
printf("\nReduction result: %d.\n", h_odata[0]);
scanf("%d", &temp);
cudaFree(d_odata);
cudaFree(d_idata);
free(h_odata);
return 0;
}
Can anyone tell me why it isn’t working?I’m testing it as a CUDA Runtime v.4.0 project on Visual
Studio 2008, with cuda toolkit v.4.0.
Finally, can anyone tell me if I need 2 gpus in order to use cuda debubbing of Nsight.
Thank you.