Compute Max of Vector or Matrix

Hi everyone!

I’m trying to write a kernel which finds the max element of an array (this could be a matrix or a vector), based on the reduction example from the SDK. Here is the code:

#include <iostream>
#include <stdio.h>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>

#define real double
#define BLOCK_SIZE 32
#define N 4096
#define GRID_SIZE (N/BLOCK_SIZE)

__device__ unsigned int retirementCount = 0;
__device__ real maxVals[GRID_SIZE/2];

template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>
__device__ void PartMAX(real* g_data)
{
	__shared__ real data[blockSize*2];
	__shared__ real maxVal;
	int tx = threadIdx.x;
	int bx = blockIdx.x;
	int step = gridDim.x*blockDim.x;
	for(int i = 0; i < rows; i++)
	{
		int idx = bx*blockSize + tx + i*gridSize*blockSize;
		data[tx] = fabs(g_data[idx]);
		data[tx+blockSize] = fabs(g_data[idx+step]);
		if(tx==0)
			maxVal = fmax(data[0], data[blockSize]); // checking first max value
		maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]); // compare its value with all others
	}
	if(tx==0) maxVals[bx] = maxVal;
}

template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>
__global__ void cuMAX(real *g_data, real *maximum)
{
	PartMAX<gridSize,blockSize,rows>(g_data);

	__threadfence();
	__shared__ real maxVal;
	__shared__ bool isLast;
	int tx = threadIdx.x;

	if(tx==0)
	{
		int ticket = atomicInc(&retirementCount, gridDim.x);
		isLast = (ticket==gridDim.x-1);
	}
	if(isLast)
	{
		maxVal = maxVals[0];
		for(int i = 0; i < gridSize/2; i+=blockSize*2)
		{
			maxVal = fmax(maxVal, fmax(maxVals[tx + i], maxVals[tx + i + blockSize]));
		}
	}
	*maximum = maxVal;
}

int main ()
{
	real* h_A = new real[N*N];
	real* d_A = 0;           // device copy of the matrix
	real* aNorm = 0;         // device result
	real* temp = new real(); // host buffer for the result

	for(int i = 0; i < N; i++)
	{
		real t1 = 0.;
		for(int j = 0; j < N; j++)
		{
			h_A[i*N+j] = h_A[j*N+i] = (0. + rand())/RAND_MAX;
			t1 += h_A[i*N+j];
		}
		h_A[i*N+i] += 0.05*t1;
	}
	size_t sizeA = N*N*sizeof(real);
	cudaMalloc((void**)&d_A, sizeA);
	cudaMalloc((void**)&aNorm, sizeof(real));

	cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice);
	dim3 dimBlock(BLOCK_SIZE,1,1);
	dim3 dimGrid(GRID_SIZE/2,1,1);
	cuMAX<GRID_SIZE,BLOCK_SIZE,N><<<dimGrid,dimBlock>>>(d_A, aNorm);

	// cudaMemcpy(h_A, d_A, sizeA, cudaMemcpyDeviceToHost);
	// real norm = MatNrm(h_A); // host-side reference check (MatNrm() not shown here)

	cudaMemcpy(temp, aNorm, sizeof(real), cudaMemcpyDeviceToHost);
	std::cout << *temp << std::endl;
}

But it seems that these two lines do not work correctly:

if(tx==0) maxVal = fmax(data[0], data[blockSize]); // checking first max value

maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]); // compare its value with all others

The second comparison keeps producing the same result. I thought that using a shared variable would help here, so that maxVal would be visible to the whole warp and every thread could compute its own max based on it. Am I wrong? Did I miss something?

Thanks.

No, placing maxVal in shared memory just means that, out of all the threads in the block, one is effectively picked at random to succeed in placing its value there, and the final result will be unpredictable. Reduction schemes are not just about leaving out the atomic operations where they would otherwise be needed. Their essential property is that no two threads ever write to the same variable in the same step, so that no race conditions can arise.
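For illustration, here is a minimal sketch of a race-free block-level max reduction (my own example, not a fix of the kernel above; the names blockMaxKernel and g_partial are made up, the block size is assumed to be a power of two, and the per-block results still have to be combined in a second pass to get the global maximum):

__global__ void blockMaxKernel(const real* g_data, real* g_partial, int n)
{
	__shared__ real sdata[BLOCK_SIZE];
	int tx  = threadIdx.x;
	int idx = blockIdx.x*blockDim.x + tx;

	// Each thread loads one element; 0. is a safe identity because
	// the values go through fabs() and are therefore non-negative.
	sdata[tx] = (idx < n) ? fabs(g_data[idx]) : 0.;
	__syncthreads();

	// Tree reduction: in every step each active thread writes only its
	// own slot sdata[tx], so no two threads ever write the same variable.
	for(int s = blockDim.x/2; s > 0; s >>= 1)
	{
		if(tx < s)
			sdata[tx] = fmax(sdata[tx], sdata[tx + s]);
		__syncthreads();
	}

	if(tx == 0)
		g_partial[blockIdx.x] = sdata[0]; // one partial maximum per block
}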

Also, you may be using isLast without assigning it a value first. Place a __syncthreads() between writing it from thread 0 and reading it from the other threads. Or, if you rely on having at most one warp (32 threads) per block, declare isLast as volatile.
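Applied to the kernel above, that suggestion looks roughly like this (only the isLast handshake is sketched; the race inside PartMAX and the unconditional write to *maximum are separate issues):

	if(tx==0)
	{
		unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
		isLast = (ticket == gridDim.x-1);
	}
	__syncthreads(); // thread 0's write to isLast is now visible to the whole block

	if(isLast)
	{
		// ... let the last block reduce maxVals[] to the final result ...
	}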

or simply use thrust:

thrust::device_vector<real> d_A(N*N);
thrust::copy(h_A, h_A + N*N, d_A.begin());
real max = thrust::reduce(d_A.begin(), d_A.end(), 0, thrust::maximum<real>());

Good idea, but it always returns an integer %)

Why integer?

See the thrust documentation for reduce — in that example, yes, it returns an integer. But if you use it like

real result = thrust::reduce(data, data + 6,
                             real(*data),
                             thrust::maximum<real>());

it will return a real; it is a template function.

You may try this:

#include <iostream>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>

#define real double
#define N 4096

int main ()
{
	real* h_A = new real[N*N];
	for(int i = 0; i < N; i++)
		for(int j = 0; j < N; j++)
			h_A[i*N+j] = 1.5;

	thrust::device_vector<real> d_A(N*N);
	thrust::copy(h_A, h_A + N*N, d_A.begin());
	thrust::maximum<real> op;
	real max = thrust::reduce(d_A.begin(), d_A.end(), 0, op);
	std::cout << max;
}

I found the root cause of the problem: the 3rd argument of thrust::reduce() is “T init”, the initial value of type T. When you pass “0”, T is deduced as int and the result is implicitly converted to int. So you should be careful with it and write “0.” instead. The code

real max = thrust::reduce(d_A.begin(), d_A.end(), 0., op);

works fine :)

even better:

real max = thrust::reduce(d_A.begin(), d_A.end(), real(0), op);