I believe its possible to come up with a more work-efficient approach. This is based on the following observations:
- The diagonal of the result matrix is your input data
- The results below the diagonal are all zero
- For calculation of the results above the diagonal (the superdiagonals), we can observe that each item is simply the min of the value to the left of that item, and the value below that item
- If we arrange our threads to process diagonallly from the main diagonal “towards” the upper right, we can at each step have the array of threads compute a diagonal of the result.
This method will require synchronization and communication, so ideally we would use shared memory for this. Furthermore, since we are not calculating results a row at a time, but rather one superdiagonal at a time, we must give some consideration to a method to write out results.
What follows is my attempt to compare what you originally posted, both CPU and GPU versions, with my approach (gpu_test
). We can make some conclusions:
- for a single instance of this problem, your approach is faster.
- if we have multiple instances of this problem to compute at once, (16 is my example number) then my approach appears faster on my GTX960.
$ cat t165.cu
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
const int MY_MAX = 1000;
//const int MY_MIN = 0;
#define LOGLEN_MAX 150
__constant__ int constant_nLogLen;
__constant__ int constant_inArray[LOGLEN_MAX];
__global__ void calculateMin(
int* dev_outMatrix)
{
int length = threadIdx.x;
int startIndex = blockIdx.x;
int logLen = constant_nLogLen;
if (startIndex + length >= logLen || startIndex >= logLen || length >= logLen)
{
return;
}
int min = MY_MAX;
for (int j = startIndex; j <= startIndex + length;j++)
{
if (constant_inArray[j] < min)
{
min = constant_inArray[j];
}
}
dev_outMatrix[startIndex*logLen + length+startIndex] = min;
}
template <typename T>
void cpu_test(T *in_array, T *out_matrix, T *out_max, int length){
//Loop 1
for (int start= 0; start < length; ++start )
{
//Loop 2
for (int end= 0; end< length; ++end)
{
T nMin = MY_MAX;
// T nMax = MY_MIN;
//Loop 3
for (int n = start; n <= end; ++n)
{
if (in_array[n] < nMin)
{
nMin = in_array[n];
}
out_matrix[start*length+end] = nMin;
}
}
}
}
// assumes block consists of an even number of threads
template <typename T>
__global__ void gpu_test(const T * __restrict__ in, T * __restrict__ out_min, T * __restrict__ out_max){
__shared__ T s[LOGLEN_MAX*LOGLEN_MAX/2];
const int t = threadIdx.x;
T my_val = in[t];
const int my_l = blockDim.x;
const int my_half = my_l>>1;
const int my_l_m1 = my_l-1;
int my_oidx = (blockIdx.x*my_l+my_l_m1)*my_l + t;
int my_sidx = ((t - (t >= my_half)*my_half)*my_l)+t;
s[my_sidx] = my_val;
out_min[my_oidx] = (t == (my_l_m1))*my_val;
for (int i = 1; i < my_l; i++){
my_oidx -= my_l;
my_sidx -= my_l;
if (my_sidx < 0) my_sidx += my_half*my_l;
int my_osidx = t+(my_l_m1-i-((i < my_half)*my_half))*my_l;
__syncthreads();
if (t >= i)
s[my_sidx] = my_val = min(my_val,s[my_sidx-1]);
out_min[my_oidx] = (t > (my_l_m1-i-1))*s[my_osidx];
}
}
typedef int mt;
int main(){
const int nblk = 16;
const int length=150;
mt out_matrix[length][length];
mt in_array[length];
mt *out = &(out_matrix[0][0]);
for (int i = 0; i < length; i++){
in_array[i] = (rand()%(MY_MAX-1))+1;
for (int j = 0; j < length; j++) out_matrix[i][j] = 0;}
unsigned long long dt = dtime_usec(0);
for (int i = 0; i < nblk; i++) cpu_test(in_array, out, out, length);
dt = dtime_usec(dt);
std::cout << "cpu time: " << dt << "us" << std::endl;
cudaMemcpyToSymbol(constant_inArray, in_array, length*sizeof(mt));
cudaMemcpyToSymbol(constant_nLogLen, &length, sizeof(mt));
mt *d_out, *h_out;
cudaMalloc(&d_out, nblk*length*length*sizeof(mt));
h_out = new mt[length*length];
cudaMemset(d_out, 0, length*length*sizeof(mt));
for (int i = 0; i < nblk; i++)
calculateMin<<<length, length>>>(d_out);
cudaMemcpy(h_out, d_out, length*length*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < length; i++){
for (int j = 0; j < length; j++)
if (h_out[i*length+j] != out[i*length+j]) {std::cout << "mismatch0 at: " << i << "," << j << " was: " << h_out[i*length+j] << " should be: " << out[i*length+j] << std::endl; return 0;}
}
mt *d_in;
cudaMalloc(&d_in, length*sizeof(mt));
cudaMemcpy(d_in, in_array, length*sizeof(mt), cudaMemcpyHostToDevice);
gpu_test<<<nblk, length>>>(d_in, d_out, d_out);
cudaMemcpy(h_out, d_out, length*length*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < length; i++)
for (int j = 0; j < length; j++)
if (h_out[i*length+j] != out[i*length+j]) {std::cout << "mismatch1 at: " << i << "," << j << " was: " << h_out[i*length+j] << " should be: " << out[i*length+j] << std::endl; return 0;}
}
$ nvcc -O3 -o t165 t165.cu -lineinfo
$ nvprof ./t165
cpu time: 4282us
==2990== NVPROF is profiling process 2990, command: ./t165
==2990== Profiling application: ./t165
==2990== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 68.71% 144.26us 16 9.0160us 8.4480us 9.3120us calculateMin(int*)
13.96% 29.312us 2 14.656us 14.624us 14.688us [CUDA memcpy DtoH]
13.87% 29.120us 1 29.120us 29.120us 29.120us void gpu_test<int>(int const *, int*, int*)
1.74% 3.6480us 1 3.6480us 3.6480us 3.6480us [CUDA memset]
1.72% 3.6170us 3 1.2050us 1.0240us 1.4090us [CUDA memcpy HtoD]
API calls: 99.04% 141.83ms 2 70.915ms 6.7020us 141.82ms cudaMemcpyToSymbol
0.40% 566.04us 202 2.8020us 170ns 159.97us cuDeviceGetAttribute
0.17% 243.11us 2 121.55us 120.67us 122.44us cudaMalloc
0.14% 204.36us 3 68.118us 11.442us 110.78us cudaMemcpy
0.10% 145.35us 2 72.675us 51.758us 93.593us cuDeviceTotalMem
0.08% 117.60us 17 6.9170us 5.6480us 13.981us cudaLaunchKernel
0.04% 58.518us 2 29.259us 26.122us 32.396us cuDeviceGetName
0.02% 27.103us 1 27.103us 27.103us 27.103us cudaMemset
0.01% 10.098us 2 5.0490us 1.7830us 8.3150us cuDeviceGetPCIBusId
0.00% 1.2910us 3 430ns 274ns 721ns cuDeviceGetCount
0.00% 1.2760us 4 319ns 192ns 689ns cuDeviceGet
0.00% 586ns 2 293ns 256ns 330ns cuDeviceGetUuid
$
In order to reduce shared memory footprint of my code, I “folded” the shared usage. This causes “lower” rows that will be written out to global memory first to use the “right” half of the shared storage that is allocated for upper rows. By the time the upper rows get around to using this region, the lower row results have already been written out to global memory.
Just to summarize, the timings I see for 16 iterations are:
cpu_test: ~4ms (this could probably be improved e.g. with OpenMP)
calculateMin: ~144us
gpu_test: ~29us