I am trying to find the maximum value in an array and have the following two versions:
//int blocksize = 16; //multiple of 32
//int nblocks = ((npix*npix)+blocksize-1)/blocksize; //round to max npix = 7
// printf("nblocks = %d\n", nblocks);
//dim3 dimBlock(blocksize,blocksize);
//dim3 dimGrid(nblocks,nblocks);
// find max value from the array
// find_corrmax <<< nblocks,blocksize >>> (max_val, x_shift, y_shift, xcout, npix, npix);
// Serial max search performed by a single thread (k == 0).
// Scans the Nx*Ny row-major array in1 and writes the maximum into
// max_val[0], with its column in x_pos[0] and its row in y_pos[0].
// NOTE: only one thread does any work, so launch as <<<1,1>>>; extra
// threads simply exit. This is the slow-but-correct reference version.
__global__ void find_corrmax(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)
{
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k < 1)
    {
        // Initialise from element (0,0) so the result does not depend on
        // whatever happened to be in max_val[k] before the launch.
        max_val[k] = in1[k * Nx * Ny];
        x_pos[k] = 0;
        y_pos[k] = 0;
        // Row-major scan: the row index must range over Ny and the column
        // index over Nx. (The original swapped the bounds, which is only
        // correct when Nx == Ny.)
        for (int row = 0; row < Ny; ++row)
        {
            for (int col = 0; col < Nx; ++col)
            {
                double v = in1[(k * Nx * Ny) + row * Nx + col];
                if (v > max_val[k])
                {
                    max_val[k] = v;
                    x_pos[k] = col;  // x = column within the row
                    y_pos[k] = row;  // y = row
                }
            }
        }
    }
}
// find_corrmax1 <<< dimGrid,dimBlock >>> (pmax1, x_shift1, y_shift1, out1_d, pix1, pix2, fftindx_d, fftindy_d);
// "Parallel" variant of the max search: thread (i, j) of a 2D launch
// compares elements of in1 against the running global maximum.
// NOTE(review): this kernel has a data race — every thread that observes
// a larger value writes max_val[0]/x_pos[0]/y_pos[0] with no
// synchronization, so the stored value and the stored indices can come
// from different threads (which is why the indices come out wrong even
// when the value happens to be right). A parallel reduction with
// per-block partial results is the correct structure here.
__global__ void find_corrmax1(double* max_val, int* x_pos, int* y_pos, double* in1, int Nx, int Ny)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k;
// NOTE(review): the element loaded below, in1[k*Nx+i], never involves j,
// and there is no bounds check on i (or on k*Nx+i against the array
// size); the loop bound 16 hard-codes the block size.
for(k = 0; k < 16; ++k)//BLOCK_SIZE=16
{
if(in1[k*Nx+i] > max_val[0])
{
max_val[0] = in1[k*Nx+i];
x_pos[0] = j; // NOTE(review): j does not identify the element read above
y_pos[0] = i;
}
}
}
the first kernel find_corrmax takes 139 usec and the second kernel find_corrmax1 takes 59 usec. However, the second kernel calculates the maximum value correctly but gives the wrong indices.
Any suggestions on how to improve this kernel?
I have looked at Thrust and CUDPP, but would like to implement it this way as I have a time constraint to meet.
The second example has a memory race - every thread will be trying to write into max_val[0], x_pos[0], and y_pos[0]. Use a parallel reduction for this instead.
The second example has a memory race - every thread will be trying to write into max_val[0], x_pos[0], and y_pos[0]. Use a parallel reduction for this instead.
No it doesn’t. You can use a parallel reduction for any size array you want. Something like this (for the L infinity norm of a vector):
// Grid-stride computation of the L-infinity norm (max |x0[i]|) of a vector.
// Writes one partial maximum per block into norms[blockIdx.x]; launch a
// second single-block pass (or finish on the host) to combine them.
// Requirements: blockDim.x must be a power of two, and the launch must
// supply blockDim.x * sizeof(ValueType) bytes of dynamic shared memory.
template <typename ValueType>
__global__ void
linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)
{
    // Raw byte buffer avoids the conflicting-type problem of declaring
    // `extern __shared__ ValueType buff[]` in multiple template
    // instantiations within one translation unit.
    extern __shared__ unsigned char smem_raw[];
    ValueType *buff = reinterpret_cast<ValueType *>(smem_raw);

    unsigned int tidx   = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int stride = gridDim.x * blockDim.x;

    // Each thread scans a strided slice of the vector for its local maximum.
    ValueType lumax = 0;
    for (unsigned int i = tidx; i < num_rows; i += stride) {
        lumax = fmax(lumax, fabs(x0[i]));
    }

    // Stage the per-thread partials in shared memory.
    buff[threadIdx.x] = lumax;
    __syncthreads();

    // Power-of-two tree reduction. Synchronizing at every step avoids the
    // implicit warp-synchronous assumption of the original code, which is
    // a race on GPUs with independent thread scheduling (Volta and newer)
    // and read out of bounds whenever blockDim.x < 32.
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x + s]);
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (threadIdx.x == 0) {
        norms[blockIdx.x] = buff[0];
    }
}
Launch it once to reduce any size array down to an array with the same number of elements as blocks in the grid. Launch it again with one block to get the final value, or copy the partial result back to the host and finish the reduction in host memory. I find zero copy memory works well for the second step if you choose to do it on the host.
No it doesn’t. You can use a parallel reduction for any size array you want. Something like this (for the L infinity norm of a vector):
// Grid-stride pass plus in-block reduction for the L-infinity norm of x0;
// writes one partial maximum per block into norms[blockIdx.x].
// NOTE(review): requires blockDim.x * sizeof(ValueType) bytes of dynamic
// shared memory at launch, and blockDim.x must be a multiple of warpSize
// (at least 32) or buff[threadIdx.x+16] etc. below read out of bounds.
template <typename ValueType>
__global__ void
linfty_norm(unsigned int num_rows, const ValueType * x0, ValueType * norms)
{
// Dynamic shared memory: 1 word per thread, sized by the launch config.
extern __shared__ ValueType buff[];
volatile int tidx = threadIdx.x + blockIdx.x * blockDim.x;
volatile int stride = gridDim.x * blockDim.x;
// Loop through vector to find local maximum
ValueType lumax = 0;
for( unsigned int i = tidx; i < num_rows; i+=stride ) {
lumax = fmax( lumax, fabs(x0[i]) );
}
// Load into shared memory
buff[threadIdx.x] = lumax; __syncthreads();
// First warp performs shared memory reduction
// NOTE(review): the steps below assume implicit warp-synchronous
// execution and use a non-volatile shared buffer; on Volta and newer
// GPUs this is a data race — insert __syncwarp() between steps (or make
// the buffer accesses volatile) before relying on this code.
if (threadIdx.x < warpSize) {
#pragma unroll
for(int i=warpSize; i<blockDim.x; i+=warpSize) {
buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+i]);
}
if (threadIdx.x < 16) {
buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+16]);
}
if (threadIdx.x < 8) {
buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+8]);
}
if (threadIdx.x < 4) {
buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+4]);
}
if (threadIdx.x < 2) {
buff[threadIdx.x] = fmax(buff[threadIdx.x], buff[threadIdx.x+2]);
}
// Finalise and write out the results to global memory
if (threadIdx.x == 0) {
norms[blockIdx.x] = fmax(buff[0], buff[1]);
}
}
}
Launch it once to reduce any size array down to an array with the same number of elements as blocks in the grid. Launch it again with one block to get the final value, or copy the partial result back to the host and finish the reduction in host memory. I find zero copy memory works well for the second step if you choose to do it on the host.
I ported some fast reduction code that I posted before so that it could be used to find the maximum and minimum instead. The following code achieved 83% of peak bandwidth on my card (reportedly above 90% on other cards) and finds both the max and min of a linear array. You just need to add some code to record the index.
I ported some fast reduction code that I posted before so that it could be used to find the maximum and minimum instead. The following code achieved 83% of peak bandwidth on my card (reportedly above 90% on other cards) and finds both the max and min of a linear array. You just need to add some code to record the index.
Wouldn’t using Thrust be the fastest way of doing this, from a development perspective? This is very easy to do with Thrust. thrust::max_element returns an iterator to the max/min element, so if you need the index it is very easy to determine, since the iterators are just pointers.
Wouldn’t using Thrust be the fastest way of doing this, from a development perspective? This is very easy to do with Thrust. thrust::max_element returns an iterator to the max/min element, so if you need the index it is very easy to determine, since the iterators are just pointers.
@eelseen: I did use Thrust, but thrust::max_element gave me an error. I did read that it’s more optimal to use/write custom kernels. Eventually, I will compare the execution time taken for both approaches and then include the best one in the implementation.
// Find the maximum value and its (x, y) position in the npix-by-npix
// correlation surface stored row-major in vec_sumh.
const int npix = 7;  // surface width — was a hard-coded 7 in two places
int position = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin();
double value = vec_sumh[position];
int shifted_y = position / npix;  // row of the maximum
int shifted_x = position % npix;  // column (modulo replaces the subtract-multiply)
@eelseen: I did use Thrust, but thrust::max_element gave me an error. I did read that it’s more optimal to use/write custom kernels. Eventually, I will compare the execution time taken for both approaches and then include the best one in the implementation.
//find max value and its index
// Linear offset of the maximum within the row-major correlation surface.
int position = thrust::max_element(vec_sumh.begin(), vec_sumh.end()) - vec_sumh.begin();
double value = vec_sumh[position];
// Convert the linear offset to (x, y); the hard-coded 7 is the surface
// width (npix) — keep it in sync with the array dimensions.
int shifted_y = position/7;
int shifted_x = position - (shifted_y*7);
The kernel requires runtime-declared shared memory, which you have not defined. It should be 1 word per thread. Also, use many threads per block (for example 128), and launch something like 4 blocks per MP of your GPU. If you need the maximum position, then expand the code to save the position of the maximum whenever a new maximum value is found. Note also that the code I posted is finding the absolute maximum, not the maximum. I posted it as an example of how a reduction could be done for an arbitrary array, not as a solution to your problem, with the expectation that you would modify it for your needs. If you can’t do that, then take eelsen’s very sound advice and stick with thrust.
The kernel requires runtime-declared shared memory, which you have not defined. It should be 1 word per thread. Also, use many threads per block (for example 128), and launch something like 4 blocks per MP of your GPU. If you need the maximum position, then expand the code to save the position of the maximum whenever a new maximum value is found. Note also that the code I posted is finding the absolute maximum, not the maximum. I posted it as an example of how a reduction could be done for an arbitrary array, not as a solution to your problem, with the expectation that you would modify it for your needs. If you can’t do that, then take eelsen’s very sound advice and stick with thrust.