Hi all,
I am trying to optimize a kernel to its maximum performance on my Jetson TX2, using JetPack 4.3 (CUDA Toolkit 10.0).
The kernel includes shared memory usage, float to int calculations and conversions and finally some for loops.
I compiled it (using fastmath on nvcc) achieving roughly 300ms per run (measured through nvprof).
I then realized that I hadn’t specified any compute capability, so Nsight Eclipse Edition was defaulting to compute capability 3.0 and PTX/SASS 2.0 for compilation.
When I switched to compute capability 6.2, my kernel slowed down by a factor of more than four, averaging 1.3 seconds under the same conditions as before.
I would have expected an enhancement instead.
I am struggling to find an answer to this, does anybody have any suggestion or pointer, or general optimization advice?
The code of my kernel is the following (it’s an interpolation and modulation of float input values to uint8 output):
/******************************************
Function Call:
******************************************/
int block_1d = 256;                                // threads per block (1-D launch)
int n_cols = 256*205;                              // total columns; one thread per column
int n_pre_rows = 13;                               // input samples per column
int n_post_rows = 3105;                            // output samples per column
int grid_1d = (n_cols + block_1d -1) / block_1d;   // ceil-division so every column gets a thread
// Dynamic shared memory: one float per input row per thread
// (block_1d * n_pre_rows floats). Launched asynchronously on my_stream;
// ptr_u8 / ptr_f / downsamplingFactor / samplingFactor / subf are defined
// elsewhere in the surrounding code.
my_kernel<<<grid_1d, block_1d, block_1d*n_pre_rows*sizeof(float), my_stream>>>(
(uint8_t*)ptr_u8,(const float*)ptr_f,
n_pre_rows, n_post_rows, n_cols, (float)downsamplingFactor, (1.f/(float)downsamplingFactor), samplingFactor, subf);
/******************************************/
// Interpolation + 1-bit modulation kernel: each thread handles one of the
// n_cols columns, linearly interpolating its n_pre_rows input floats up by
// `downsamplingFactor` and emitting 0/1 bytes into out_buff.
//
// Launch expectations (from the host snippet): 1-D grid and block, dynamic
// shared memory of blockDim.x * n_pre_rows floats, and
// downsamplingFactorMul == 1.f / downsamplingFactor.
// Each thread reads/writes only its own shared-memory slice, which is why
// no __syncthreads() appears anywhere in the kernel.
__global__ void my_kernel(
uint8_t* __restrict__ out_buff, const float* __restrict__ input_buff,
const uint32_t n_pre_rows, const uint32_t n_post_rows, const uint32_t n_cols,
const float downsamplingFactor, const float downsamplingFactorMul, const float samplingFactor, const float subf)
{
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;  // global column index
extern uint8_t __shared__ s[];  // dynamic shared memory, sized by 3rd <<<>>> argument
float m;  // slope of the current interpolation segment
float * pre_data = (float*) s;  // view the shared byte buffer as floats
uint32_t shared_idx = threadIdx.x*n_pre_rows;  // start of this thread's private slice
uint32_t accumulation;  // running reference level updated by half of `increments` each sample
int increments;  // doubled step; nudged by +/-1 (lastData) every output sample
int lastData;  // last decision: +1 emits byte 1, -1 emits byte 0
uint32_t col_idx;  // base offset of this column in out_buff
uint32_t ds_factor_u16;  // downsampling factor truncated to an integer
uint32_t subloop_idx;  // output offset of the current segment within the column
uint32_t post_data_init_0;  // rounded value of the 1st interpolated point
uint32_t post_data_init_1;  // rounded value of the 2nd interpolated point
uint32_t post_data_current;  // rounded value of the current interpolated point
const float round_factor = 0.5f;  // floorf(v + 0.5f) == round-half-up
if (x < (n_cols)){  // bounds guard: grid may overshoot n_cols
// NOTE(review): assumes downsamplingFactor holds an integral value that
// fits in 16 bits -- confirm against the caller.
ds_factor_u16 = (uint16_t) (downsamplingFactor);
col_idx = x*n_post_rows;
// Stage the first input sample, rescaled and offset, into shared memory.
pre_data[shared_idx] = samplingFactor * (input_buff[x*n_pre_rows] - subf);
// One iteration per segment between consecutive input samples.
for (int i=0; i<n_pre_rows-1; i++){
subloop_idx = i*ds_factor_u16;  // where this segment's outputs start
pre_data[shared_idx+i+1] = (samplingFactor * (input_buff[x*n_pre_rows+i+1] - subf));
m = (pre_data[shared_idx+i+1] - pre_data[shared_idx+i]) * downsamplingFactorMul;  // slope = delta / downsamplingFactor
if (i==0){
// Seed the modulator state from the first two interpolated points.
// NOTE(review): post_data_init_* are uint32_t, so a negative rounded
// value would wrap around -- presumably samplingFactor/subf keep the
// scaled inputs non-negative; verify against the data range.
post_data_init_0 = floorf(pre_data[shared_idx] + round_factor);
post_data_init_1 = floorf(pre_data[shared_idx]+ m + round_factor);
increments = 2*(post_data_init_1 - post_data_init_0);
accumulation = post_data_init_0;
lastData = -1;
}
// j starts at 1 for the first segment (its j==0 sample seeded the state
// above) and at 0 thereafter. NOTE(review): the int counter j is compared
// against the *float* downsamplingFactor here, while indexing uses the
// truncated ds_factor_u16 -- if the factor is ever non-integral these
// disagree; consider comparing against ds_factor_u16.
for (int j = (i==0); j<downsamplingFactor; j++){
post_data_current = floorf(pre_data[shared_idx+i]+ (float)j* m + round_factor);  // target level at this sub-sample
accumulation += (increments >> 1);  // advance reference by half the doubled step (arithmetic shift; signed)
if (increments > 128) {
lastData = -1;  // step cap: above 128 always output 0 and shrink the step
} else {
if (post_data_current > accumulation) {
lastData = -1;  // target above reference -> output 0, decrement step
} else if (post_data_current < accumulation) {
lastData = 1;  // target below reference -> output 1, increment step
}
// equal: keep the previous decision unchanged
}
out_buff[col_idx + subloop_idx + j-1] = (lastData == 1);  // emit one 0/1 byte
increments += lastData;  // adapt step size for the next sample
}
}
}
}
Thanks,
Andrea