Hi everyone!
I’m trying to write kernel which finds max element of an array(this could me matrix or vector) using Reduce example from SDK. Here is code:
#include <iostream>
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <stdio.h>
#define real double
#define BLOCK_SIZE 32
#define N 4096
#define GRID_SIZE N/BLOCK_SIZE
__device__ unsigned int retirementCount = 0;
__device__ real maxVals[GRID_SIZE/2];
template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>
__device__ void PartMAX(real* g_data)
{
__shared__ real data[blockSize*2];
__shared__ real maxVal;
int tx = threadIdx.x;
int bx = blockIdx.x;
int step = gridDim.x*blockDim.x;
for(int i = 0; i < rows; i++)
{
int idx = bx*blockSize + tx + i*gridSize*blockSize;
data[tx] = fabs(g_data[idx]);
data[tx+blockSize] = fabs(g_data[idx+step]);
if(tx==0)
maxVal = fmax(data[0], data[blockSize]);//checking first max value
maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]);//compare its valu with all others
}
if(tx==0) maxVals[bx] = maxVal;
}
template <unsigned int gridSize, unsigned int blockSize, unsigned int rows>
__global__ void cuMAX(real *g_data, real *maximum)
{
PartMAX<gridSize,blockSize,rows>(g_data);
__threadfence();
__shared__ real maxVal;
__shared__ bool isLast;
int tx = threadIdx.x;
if(tx==0)
{
int ticket = atomicInc(&retirementCount, gridDim.x);
isLast = (ticket==gridDim.x-1);
}
if(isLast)
{
maxVal = maxVals[0];
for(int i = 0; i < gridSize/2; i+=blockSize*2)
{
maxVal = fmax(maxVal,fmax(maxVals[tx + i], maxVals[tx + i + blockSize]));
}
}
*maximum = maxVal;
}
int main ()
{
real* h_A = new real[N*N];
real* d_A = new real[N*N];
real* aNorm = new real();
real* temp = new real();
for(int i = 0; i < N; i++)
{
real t1 = 0.;
for(int j = 0; j < N; j++)
{
h_A[i*N+j] = h_A[j*N+i] = (0.+ rand())/RAND_MAX;
t1 += h_A[i*N+j];
}
h_A[i*N+i] += 0.05*t1;
}
size_t sizeA = N*N*sizeof(real);
cudaMalloc((void**)&d_A, sizeA);
cudaMalloc((void**)&aNorm,sizeof(real));
cudaMalloc((void**)&temp,sizeof(real));
cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,1,1);
dim3 dimGrid(GRID_SIZE/2,1,1);
cuMAX<GRID_SIZE,BLOCK_SIZE,N><<<dimGrid,dimBlock>>>(d_A, aNorm);
cudaMemcpy(h_A, d_A, N*N*sizeof(real), cudaMemcpyDeviceToHost);
real norm = MatNrm(h_A);
cudaMemcpy(temp, aNorm, sizeof(real), cudaMemcpyDeviceToHost);
std::cout << *temp <<;
}
But it seems that these two strings do not work correctly:
if(tx==0) maxVal = fmax(data[0], data[blockSize]);//checking first max value
maxVal = fmax(fmax(maxVal, data[tx]), data[tx+blockSize]);//compare its valu with all others
result of the second comparison brings the same results. I thought that using shared variable may help in this case so that maxVal would be visible for all warp and every thread could calculate its own max number based on this max. Am I wrong ? did I miss smth ?
Thanks.