Hi everyone,
I am trying to understand how to use the CUB library functions efficiently. Consider the following example, which computes the sum of the elements of each warp using a warp-level reduction.
#include <cub/cub.cuh>
#include <cuda.h>
#include <vector>
using std::vector;
#include <iostream>
using std::cout;
using std::endl;
// Number of input/output elements; main() also uses it as the thread count of
// its single-block launch, and the kernel uses it as its bounds guard.
const int N = 128;
// Per-warp sum of `indata` using cub::WarpReduce.
//
// Each thread with global index id < N contributes indata[id] to a reduction
// across its warp and stores the value returned by Sum() in outdata[id].
//
// Preconditions:
//  - 1-D launch with at most 128 threads (4 warps) per block, because exactly
//    four TempStorage slots are reserved per block.
//  - N should be a multiple of 32 so that no warp is only partially inside the
//    `id < N` guard when it enters the collective.
//
// NOTE(review): per the CUB documentation the value returned by Sum() is only
// defined in lane 0 of each warp; the other lanes still store whatever Sum()
// returned to them, as in the original code.
__global__ void sum(float *indata, float *outdata) {
    typedef cub::WarpReduce<float> WarpReduce;

    // One TempStorage slot per warp of this block (shared memory is per block).
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Index the shared slots by the warp's position inside the block. Using
    // the global id here would run past the array for any block other than
    // block 0 as soon as N exceeds 128.
    int warp_id = threadIdx.x / 32;

    if (id < N) {
        outdata[id] = WarpReduce(temp_storage[warp_id]).Sum(indata[id]);
    }
}
// Reports a failed CUDA runtime call on stderr; returns true when `status`
// is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    }
    return true;
}

// Host driver: fills an N-element input with 0..N-1, runs the `sum` kernel in
// a single block of N threads, and prints the input and the kernel output.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    vector<float> y(N), sol(N);
    float *dev_y = 0, *dev_sol = 0;
    const size_t bytes = N * sizeof(float);

    bool ok = cudaOk(cudaMalloc((void**)&dev_y, bytes), "cudaMalloc(dev_y)")
           && cudaOk(cudaMalloc((void**)&dev_sol, bytes), "cudaMalloc(dev_sol)");

    for (int i = 0; i < N; i++) {
        y[i] = (float)i;
    }

    cout << "input: ";
    for (int i = 0; i < N; i++) cout << y[i] << "\t";
    cout << endl;

    ok = ok && cudaOk(cudaMemcpy(dev_y, &y[0], bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host -> device)");

    if (ok) {
        sum<<<1, N>>>(dev_y, dev_sol);
        // A launch does not return an error itself: bad configurations are
        // reported by cudaGetLastError(), and faults during execution are
        // reported by the blocking copy that follows.
        ok = cudaOk(cudaGetLastError(), "sum kernel launch")
          && cudaOk(cudaMemcpy(&sol[0], dev_sol, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device -> host)");
    }

    if (ok) {
        cout << "output: ";
        for (int i = 0; i < N; i++) cout << sol[i] << "\t";
        cout << endl;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dev_y);
    cudaFree(dev_sol);
    return ok ? 0 : 1;
}
My question is: what happens when temp_storage[4] is changed to temp_storage[1]? (I get the correct answer in both cases.) And how do I determine the optimal temp_storage size?