Parallel Reduction

Hey guyz,
I’m currently working on parallel reduction and I tried to test the kernels that are
presented on the NVIDIA Paper, by Mark Harris, “Optimizing Parallel Reduction in CUDA”.
Here is the code:

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>

#define _ARRAY_SIZE 4

global void reduction1(int *g_idata, int *g_odata)
{
extern shared int sdata;

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();

for(unsigned int s = 1; s < blockDim.x; s *= 2) {
	if(tid % (2*s) == 0) {
		sdata[tid] += sdata[tid + s];
	}
	__syncthreads();
}

if(tid==0)
	g_odata[blockIdx.x] = sdata[0];

}

global void reduction2(int *g_idata, int *g_odata)
{
extern shared int sdata;

unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();

for(unsigned int s = 1; s < blockDim.x; s *= 2) {
	int index = 2*s*tid;

	if(index < blockDim.x) {
		sdata[index] += sdata[index + s];
	}
	__syncthreads();
}

if(tid==0)
	g_odata[blockIdx.x] = sdata[0];

}

int main(int argc, char** argv)
{
const int arraySize = _ARRAY_SIZE;
int a[arraySize];
int temp = 0;
int* d_idata=0, *d_odata=0, *h_odata=0;

printf("Array created: ");
for(unsigned int i = 0; i < _ARRAY_SIZE; i++) {
	a[i] = i;
	temp += i;
	printf("%d ", a[i]);
}
printf("\nWith sum: %d.\n", temp);

cudaMalloc((void**)&d_odata, _ARRAY_SIZE*sizeof(int));
if(d_odata == 0) {
	printf("Couldn't allocate d_odata array.\n");
	scanf("%d", &temp);
	exit(1);
}
cudaMalloc((void**)&d_idata, _ARRAY_SIZE*sizeof(int));
if(d_idata == 0) {
	printf("Couldn't allocate d_idata array.\n");
	scanf("%d", &temp);
	exit(1);
}
h_odata = (int*) malloc(_ARRAY_SIZE*sizeof(int));
if(h_odata == 0) {
	printf("Couldn't allocate h_odata array.\n");
	scanf("%d", &temp);
	exit(1);
}
memset(h_odata, 0, _ARRAY_SIZE*sizeof(int));

cudaMemcpy(d_idata, a, _ARRAY_SIZE*sizeof(int), cudaMemcpyHostToDevice);
//reduction1<<<1, _ARRAY_SIZE>>>(d_idata, d_odata);
reduction2<<<1, _ARRAY_SIZE>>>(d_idata, d_odata);

cudaMemcpy(h_odata, d_odata, _ARRAY_SIZE*sizeof(int), cudaMemcpyDeviceToHost);

printf("Output array: ");
for(int i = 0; i < _ARRAY_SIZE; i++) {
	printf("%d ", h_odata[i]);
}
printf("\nReduction result: %d.\n", h_odata[0]);
scanf("%d", &temp);

cudaFree(d_odata);
cudaFree(d_idata);
free(h_odata);

return 0;

}
Can anyone tell me why it isn’t working?I’m testing it as a CUDA Runtime v.4.0 project on Visual
Studio 2008, with cuda toolkit v.4.0.

Finally, can anyone tell me if I need 2 gpus in order to use cuda debubbing of Nsight.

Thank you.

You declare “extern” shared memory, then you must give size of dynamic shared memory.

For example

//reduction1<<<1, _ARRAY_SIZE, (size of dynamic shared memory) >>>(d_idata, d_odata);

	reduction2<<<1, _ARRAY_SIZE, (size of dynamic shared memory) >>>(d_idata, d_odata);

Thank you LSChien. This actually worked. But, it doesn’t work for
all the array sizes, which seems odd to me.
I mean, when I use _ARRAY_SIZE 1024, it works. But when I use _ARRAY_SIZE 4098,
it doesn’t work.

Hi,

You can’t launch 4098 threads per block.

Maximum limit is 512/1024 depending on the compute capability.

Hi,

You should test the return values of your calls to the CUDA runtime library (e.g., cudaMemcpy) and add calls to cudaDeviceSynchronize() and cudaGetLastError() after your kernel launch. The cudaError_t value will help indicate what is wrong with your code.

Ken

Yeah brano, I just figured it out, when I searched for the Shared Memory a bit more. Thank you anyway for your
immediate response :biggrin:
kaberdude I will have this in mind, in order to locate errors. I appreciate your help :thanks:

Hi all, Can somebody help me with this, pleasee… The following is the code written for vector vector multiplication which results in a scalar. This uses the logic of sum reduction using shared memory.

Each block calcuates the partial sum of a set of elements using shared memory. The partial sum of each block is returned to the CPU and the CPU performs the addtion to get the final result. I dont understand it gives garbage values. I would be very grateful to know the reason. Please help!!

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#define threadsperblock 1024
#include<math.h>
#include<stdlib.h>

global void vectvectshared(float *A,float *B,float *r,int N)
{
shared float temp[threadsperblock];
int tx = threadIdx.x;
int ty = threadIdx.y;
int tid = (ty * blockDim.x) + tx; // relative to block
int id = (tx + blockDim.x * blockIdx.x) + (blockDim.x * gridDim.x * (ty + blockDim.y * blockIdx.y)); // relative to grid
if(id < N)
temp[tid] = A[id] * B[id];

    __syncthreads();
    int i = blockDim.x * blockDim.y/2;
    while(i!=0)
    {
            if( tid < i)
                    temp[tid] += temp[tid+i];
            __syncthreads();
    i = i/2;
    }
    if(tid == 0)
            r[blockIdx.x] = temp[0];

}

int main(int argc, char *argv)
{
float *hostA, *hostB, *res_partial_host, *res;
float *devA, *devB, *res_partial_dev;
int vlen;
vlen = atoi(argv[1]);
int i=0;
int blockspergrid = vlen/1024;
if(vlen%1024!=0)
blockspergrid = blockspergrid + 1;
hostA = (float *)malloc(vlen * sizeof(float));
hostB = (float *)malloc(vlen * sizeof(float));
res_partial_host = (float *)malloc(blockspergrid * sizeof(float));
res = (float *)malloc(sizeof(float));
res[0] = 0;
for(i=0; i<vlen; i++)
{
// hostA[i] = rand()+(RAND_MAX - rand())/RAND_MAX;
// hostB[i] = rand()+(RAND_MAX - rand())/RAND_MAX;
hostA[i] = hostB[i] = 2.00f;
}
cudaMalloc((void )&devA, vlen * sizeof(float));
cudaMalloc((void )&devB, vlen * sizeof(float));
cudaMalloc((void )&res_partial_dev, blockspergrid * sizeof(float));
cudaMemcpy((void
)devA,(void
)hostA,vlen * sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy((void
)devB,(void
)hostB,vlen * sizeof(float),cudaMemcpyHostToDevice);
printf(“CPU result is\n”);
for(i=0; i<vlen; i++)
{
res[0] += hostA[i] * hostB[i];
}
printf("%f",res[0]);
dim3 blocks(32,32);
vectvectshared<<<blockspergrid, blocks>>>(devA, devB ,res_partial_dev,vlen);
cudaMemcpy((void
)res_partial_host,(void
)res_partial_dev,blockspergrid * sizeof(float),cudaMemcpyDeviceToHost);
printf(“result on GPU\n”);
for(i=0; i<blockspergrid; i++)
res_partial_host[0] += res_partial_host[i];
printf("%f",res_partial_host[0]);
cudaFree(devA);
cudaFree(devB);
cudaFree(res_partial_dev);

    free(hostA);
    free(hostB);
    free(res);
    free(res_partial_host);

}

Hi abcd,

I have several recommendations:

    Please use the “Insert code snippet” when you post your code. It is hard to read your code because all formatting (leading tabs and spaces) have been removed.

    You need to add printfs to your kernel to debug it. Set the nvcc compiler target to compute_20,sm_20 and run your program with these printf’s. If you do, you can then find out why your code does not work.

The main problem with the code is that you don’t initialize the entire shared array temp with zeros. Add code “temp[tid] = 0;” after the definition/initialization of “id”.

The code now works, at least for N=16…

#include<stdio.h>

#include<cuda.h>

#include<cuda_runtime.h>

#define threadsperblock 1024

#include<math.h>

#include<stdlib.h>

__global__ void vectvectshared(float *A,float *B,float *r,int N)

{

	__shared__ float temp[threadsperblock];

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	int tid = (ty * blockDim.x) + tx; // relative to block

	int id = (tx + blockDim.x * blockIdx.x) + (blockDim.x * gridDim.x * (ty + blockDim.y * blockIdx.y)); // relative to grid

	

	temp[tid] = 0;

	printf("id %d N %d\n", id, N);

	if (id < N)

	{

		temp[tid] = A[id] * B[id];

		printf("t[%d] = A[%d] * B[%d] = %f * %f = %f\n", tid, id, id, A[id], B[id], temp[tid]);

	}

	__syncthreads();

	int i = blockDim.x * blockDim.y / 2;

	printf("tid %d i %d\n", tid, i);

	while(i!=0)

	{

		if( tid < i)

		{

			printf ("tid %d, i %d, t[tid] = %f, t[tid+i] = %f, += %f\n", tid, i, temp[tid], temp[tid+i], temp[tid] + temp[tid+i]);

			temp[tid] += temp[tid+i];

		}

		__syncthreads();

		i = i/2;

	}

	if(tid == 0)

		r[blockIdx.x] = temp[0];

}

int main(int argc, char *argv[])

{

	float *hostA, *hostB, *res_partial_host, *res;

	float *devA, *devB, *res_partial_dev;

	int vlen;

	vlen = 16; // atoi(argv[1]);

	int i=0;

	int blockspergrid = vlen/1024;

	if(vlen%1024!=0)

		blockspergrid = blockspergrid + 1;

	hostA = (float *)malloc(vlen * sizeof(float));

	hostB = (float *)malloc(vlen * sizeof(float));

	res_partial_host = (float *)malloc(blockspergrid * sizeof(float));

	res = (float *)malloc(sizeof(float));

	res[0] = 0; 

	for(i=0; i<vlen; i++)

	{

		// hostA[i] = rand()+(RAND_MAX - rand())/RAND_MAX;

		// hostB[i] = rand()+(RAND_MAX - rand())/RAND_MAX;

		hostA[i] = hostB[i] = 2.00f;

	}

	cudaMalloc((void **)&devA, vlen * sizeof(float));

	cudaMalloc((void **)&devB, vlen * sizeof(float));

	cudaMalloc((void **)&res_partial_dev, blockspergrid * sizeof(float));

	cudaMemcpy((void*)devA,(void*)hostA,vlen * sizeof(float),cudaMemcpyHostToDevice);

	cudaMemcpy((void*)devB,(void*)hostB,vlen * sizeof(float),cudaMemcpyHostToDevice);

	printf("CPU result is\n");

	for(i=0; i<vlen; i++)

	{

		res[0] += hostA[i] * hostB[i];

	}

	printf("%f",res[0]);

	dim3 blocks(32,32);

	vectvectshared<<<blockspergrid, blocks>>>(devA, devB ,res_partial_dev,vlen);

	cudaMemcpy((void*)res_partial_host,(void*)res_partial_dev,blockspergrid * sizeof(float),cudaMemcpyDeviceToHost);

	printf("result on GPU\n");

	for(i=0; i<blockspergrid; i++)

		res_partial_host[0] += res_partial_host[i];

	printf("%f",res_partial_host[0]);

	cudaFree(devA);

	cudaFree(devB);

	cudaFree(res_partial_dev);

	free(hostA);

	free(hostB);

	free(res);

	free(res_partial_host);

}

@abcd: My first guess would be: probably indexing mistake… are you sure your index calculations are correct ? :)

Guys can I have some help with the following topic too:
I’m using VS2008 and I have some C++ files (.cpp and .h), that I need to parse .obj 3d model files.
My goal is to parse a 3d model in a c++ class, and with the use of some cuda kernels get some info about these
models. So, I need to add some .cu files in my project, but I don’t know how. In fact, here is my Main method (Main.cpp):

#include
using namespace std;

#include
#include
#include

#include “ObjectFileClass.h”
#include “MyVector.h”
#include “graph_adjacency_list.h”
#include “parallelAlgorithm.h”

int main(int argc, char** argv) {
ObjectFileClass model;
unsigned int Source = 1;
int** EdgesArray = NULL;

if (argv[1] == NULL) {
	cout << "Usage: ./progName <Object file> <source for SSSP>" << endl;
	return 1;
}
string fileName (argv[1]);
model.parseModelInfo(fileName);
model.computeEdges();

if(argv[2] != NULL) {
	Source = (unsigned int) atoi(argv[2]);
}

EdgesArray = (int**) malloc(model.getNumberOfVertices()*sizeof(int));
for(int i = 0; i < model.getNumberOfVertices(); i++) {
	EdgesArray[i] = (int*) malloc(model.getNumberOfVertices()*sizeof(int));
	for(int j = 0; j < model.getNumberOfVertices(); j++) {
		EdgesArray[i][j] = model.getEdge(i, j);
	}
}
[b]Parallel_Algorithm<int>(0, model.getNumberOfVertices(), EdgesArray);

[/b]
for(int i = 0; i < model.getNumberOfVertices(); i++)
free(EdgesArray[i]);
free(EdgesArray);

return 0;

}

Where Parallel_Algorithm is the wrapper function for calling the cuda kernels and the
#include “parallelAlgorithm.h” is the header file for the wrapper function. But when i try
to Build the Project I get the following error:
Error 72 error LNK2001: unresolved external symbol “bool __cdecl Parallel_Algorithm(unsigned int,unsigned int,int * *)” (??$Parallel_Algorithm@H@@YA_NIIPAPAH@Z) Main.obj

Does anyone know how am I going to solve this?

P.S.: I start as a Visual C++ Empty Project and the parallelAlgorithm.h is a Visual C++ header file and the file containing the
wrapper function for cuda kernels is called kernel.cu and is a v.4.0 cu file.

Thank you

I seemed to have solved the above(set Custom build rules), but now I get the error LNK2005 which I can’t figure out what is the cause of it.