Cumpute Max of Vector or Matrix

genetic_priest · June 2, 2011, 6:55pm

Hi everyone!

I’m trying to write kernel which finds max element of an array(this could me matrix or vector) using Reduce example from SDK. Here is code:

#include <iostream>

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cuda_runtime.h>

#include <device_functions.h>

#include <device_launch_parameters.h>

#include <stdio.h>

#define real double

#define BLOCK_SIZE 32

#define N 4096

#define GRID_SIZE N/BLOCK_SIZE

__device__ unsigned int retirementCount = 0;

__device__ real maxVals[GRID_SIZE/2];

template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>

__device__ void PartMAX(real* g_data)

{

	__shared__ real data[blockSize*2];

	__shared__ real maxVal;

	int tx = threadIdx.x;

	int bx = blockIdx.x;

	int step = gridDim.x*blockDim.x;	

	for(int i = 0; i < rows; i++)

	{

		int idx = bx*blockSize + tx + i*gridSize*blockSize;

		data[tx] = fabs(g_data[idx]);

		data[tx+blockSize] = fabs(g_data[idx+step]);

		if(tx==0)

			maxVal = fmax(data[0], data[blockSize]);//checking first max value

		maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]);//compare its valu with all others

	}

	if(tx==0) maxVals[bx] = maxVal;

}

template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>

__global__ void cuMAX(real *g_data, real *maximum)

{	

	PartMAX<gridSize,blockSize,rows>(g_data);

	

	__threadfence();

	__shared__ real maxVal;

	__shared__ bool isLast;

	int tx = threadIdx.x;

	

	if(tx==0)

	{

		int ticket = atomicInc(&retirementCount, gridDim.x);

		isLast = (ticket==gridDim.x-1);

	}

	if(isLast)

	{

		maxVal = maxVals[0];

		for(int i = 0; i < gridSize/2; i+=blockSize*2)

		{

			maxVal = fmax(maxVal,fmax(maxVals[tx + i], maxVals[tx + i + blockSize]));

		}

	}

	*maximum = maxVal;	

}

int main ()

{

	real* h_A = new real[N*N];

	real* d_A = new real[N*N];

	real* aNorm = new real();

	real* temp = new real();

	

	for(int i = 0; i < N; i++)

	{

		real t1 = 0.;

		for(int j = 0; j < N; j++)

		{

			h_A[i*N+j] = h_A[j*N+i] = (0.+ rand())/RAND_MAX;

			t1 += h_A[i*N+j];

		}

		h_A[i*N+i] += 0.05*t1;

	}

	size_t sizeA = N*N*sizeof(real);

	cudaMalloc((void**)&d_A, sizeA);

	cudaMalloc((void**)&aNorm,sizeof(real));

	cudaMalloc((void**)&temp,sizeof(real));

	

	cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice);

	dim3 dimBlock(BLOCK_SIZE,1,1);

	dim3 dimGrid(GRID_SIZE/2,1,1);

	cuMAX<GRID_SIZE,BLOCK_SIZE,N><<<dimGrid,dimBlock>>>(d_A, aNorm);

	cudaMemcpy(h_A, d_A, N*N*sizeof(real), cudaMemcpyDeviceToHost);

	real norm = MatNrm(h_A);

	

	cudaMemcpy(temp, aNorm, sizeof(real), cudaMemcpyDeviceToHost);

	std::cout << *temp <<;

}

But it seems that these two strings do not work correctly:

if(tx==0) maxVal = fmax(data[0], data[blockSize]);//checking first max value

maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]);//compare its valu with all others

result of the second comparison brings the same results. I thought that using shared variable may help in this case so that maxVal would be visible for all warp and every thread could calculate its own max number based on this max. Am I wrong ? did I miss smth ?

Thanks.

tera · June 2, 2011, 8:38pm

No, placing maxVal in shared memory just means that out of all the threads in the block one is randomly picked to succeed in placing its value there and the final result will be unpredictable. Reduction schemes are not just about leaving out the atomic operations in where they would be needed. Their important part is to have no two threads write to the same variable, so that no race conditions can evolve.

Also, you may be using [font=“Courier New”]isLast[/font] without assigning it a value first. Place a [font=“Courier New”]__syncthreads()[/font] between writing it from thread 0 and reading it from the other threads. Or, if you rely on having at most one warp (32 threads) per block, declare [font=“Courier New”]isLast[/font] as [font=“Courier New”]volatile[/font].

devkec · June 4, 2011, 8:56am

or simply use thrust:

thrust::device_vector<real> d_A(N*N);

thrust::copy(h_A, h_A + N*N, d_A.begin());

real max = thrust::reduce(d_A.begin(), d_A.end(), thrust::max<real>());

genetic_priest · June 5, 2011, 7:24am

Good idea, but it returns integer always %)

devkec · June 5, 2011, 7:54am

why integer?

thrust documentation reduce

in this example, yes it returns integer.

if you use it like

real result = thrust::reduce(data, data + 6,

                              real(*data),

                              thrust::maximum<real>()

);

it will return a real. it is a template-function

genetic_priest · June 5, 2011, 1:05pm

why integer?

thrust documentation reduce

in this example, yes it returns integer.

if you use it like
real result = thrust::reduce(data, data + 6,

                              real(*data),

                              thrust::maximum<real>()

);
it will return a real. it is a template-function

You may try this:

#include <iostream>

#include <cuda_runtime_api.h>

#include <cuda_runtime.h>

#include <device_functions.h>

#include <device_launch_parameters.h>

#include <thrust/reduce.h>

#include <thrust/device_vector.h>

#define real double

#define N 4096

int main ()

{

	real* h_A = new real[N*N];

	for(int i = 0; i < N; i++)		

		for(int j = 0; j < N; j++)

			h_A[i*N+j] = 1.5;

	thrust::device_vector<real> d_A(N*N);

	thrust::copy(h_A, h_A + N*N, d_A.begin());

	thrust::maximum<real> op;

	real max = thrust::reduce(d_A.begin(), d_A.end(), 0, op);	

	std::cout << max;

}

genetic_priest · June 5, 2011, 1:24pm

You may try this:

#include <iostream>

#include <cuda_runtime_api.h>

#include <cuda_runtime.h>

#include <device_functions.h>

#include <device_launch_parameters.h>

#include <thrust/reduce.h>

#include <thrust/device_vector.h>

#define real double

#define N 4096

int main ()

{

	real* h_A = new real[N*N];

	for(int i = 0; i < N; i++)		

		for(int j = 0; j < N; j++)

			h_A[i*N+j] = 1.5;

	thrust::device_vector<real> d_A(N*N);

	thrust::copy(h_A, h_A + N*N, d_A.begin());

	thrust::maximum<real> op;

	real max = thrust::reduce(d_A.begin(), d_A.end(), 0, op);

	std::cout << max;

}

I found what was the root cause of the problem - 3rd argument of thrust::reduce() is “T init” - initial value of type T. But when you use “0” it performs implicit convertion to int. So you should be careful with it and place “0.”. Code

real max = thrust::reduce(d_A.begin(), d_A.end(), 0., op);

works fine :)

devkec · June 6, 2011, 8:44am

even better:

real max = thrust::reduce(d_A.begin(), d_A.end(), real(0), op);

Topic		Replies	Views
Finding max in array CUDA Programming and Performance	15	42189	November 26, 2017
about finding a max number from a big array CUDA Programming and Performance	10	4691	January 7, 2020
Cuda : Reduce (max/min) function on matrix implementation CUDA Programming and Performance	1	1644	August 22, 2019
Reduction in kernel function Get max value in vector CUDA Programming and Performance	8	13758	May 5, 2011
find maximum value in an array along with index CUDA Programming and Performance	40	18653	October 11, 2010
Find maximum value from threads CUDA Programming and Performance	6	430	December 16, 2023
One question regarding shared memory CUDA Programming and Performance	5	1235	April 24, 2013
CUDA reduction CUDA Programming and Performance	10	51425	June 7, 2009
Reduction from SDK CUDA Programming and Performance	2	11602	March 12, 2009
First CUDA trial but unexpected behavior CUDA Programming and Performance	4	1004	March 29, 2012

Cumpute Max of Vector or Matrix

Related topics