Hello,
I’ve got a problem in a CUDA kernel, which uses half precision floats (by including cuda_fp16.h).
This is quite difficult to post about, because the code looks ok (I believe), and it’s hard to debug exactly what is in each variable when running the kernel, and it’s hard for me to prove that I am sending sensible data into the algorithm. However, apart from the PROBLEM_SECTION, this code reproduces the values which another implementation of this algorithm in OpenCV produces on the same hardware, so the majority of this code works as expected.
The problem shows up when I allow line 107 (counting in the main kernel listing, below) to compile. The half2 variable th should contain 1.0 and 1.0. This comes from the lines:
half thrshhi = __hgt (__high2half (a_h2[i]), dbthresh) ? oneh : zeroh;
half thrshlo = __hgt (__low2half (a_h2[i]), dbthresh_alt) ? oneh : zeroh;
Both halves of a_h2[i] contains a 0.1 or a 1.0; dbthresh and dbthresh_alt contain 0.001, so thrshhi and thrshlo should both end up containing the content of oneh, which is 1.0. (On a side note: I had to use dbthresh_alt (which is essentially a copy of dbthresh) in the second __hgt() call to avoid strange behaviour which showed up in CUDA 8 but not CUDA 9, so I have already had to work through some oddness on this platform).
So, now in line 107 (in the big listing), we should get 1.0 and 1.0 being written into the variable th:
half2 th = __halves2half2 (thrshlo, thrshhi); // Ok
Then a_real_h2[i] should be unchanged by this operation:
// L1 wipe out black regions
a_real_h2[i] = __hmul2 (a_real_h2[i], th); // Bad: a_real_h2 is set to 0; th should contain 1.0 and 1.0
However, when this line is enabled, it behaves as if th contained 0.0 and 0.0.
Can anyone suggest a way forward to understanding what’s going on here?
kind regards,
Seb James
Here’s the whole kernel:
#define USE_DRONE_BLANKING 1 // This IS defined when compiling
__global__ void
avdu_monolith_fp16_update (float* in_sep_fl,
half2* in_sep_h2,
half2* in_scaled2,
half2* a_h2,
half2* a_real_h2,
half2* b_h2,
#ifdef USE_DRONE_BLANKING
half2* prevthresh_h2,
half2 dbthreshh2,
#endif
half2 oneh2,
half2 zeroh2,
half2 scale_pr_slow_h2,
half2 scale_pr_fast_h2,
half2 inv_scale_pr_slow_h2,
half2 inv_scale_pr_fast_h2,
half2 scale_no_delay_h2,
half2 scale_fast_delay_h2,
half2 scale_slow_delay_h2,
half2 inv_scale_no_delay_h2,
half2 inv_scale_fast_delay_h2,
half2 inv_scale_slow_delay_h2)
{
// For LIN maps, blockIdx.y gives the map ID.
int mapid = blockIdx.y;
// i is an iterator into the 4xLINlayer memory
// regions. mapid*gridDim.x*blockDim.x is the number of threads in
// one of the four maps.
int iin = (blockIdx.x * blockDim.x) + threadIdx.x;
int imapid = mapid*gridDim.x*blockDim.x;
int i = imapid + iin;
// Input is from pr.a for most of the layers and from separate
// memory for the pr layer. Input needs an individual pointer index,
// which is iin.
half2* inp = &a_h2[0];
if (mapid == 0) {
// if mapid is 0, then we're computing the PR; use separate input.
inp = in_sep_h2;
// If necessary, convert from float to half2 for PR layer's input:
if (in_sep_fl != (float*)0) {
int in_fl_i = iin*2; // or << 1
half in_high = __float2half (in_sep_fl[in_fl_i]);
in_fl_i++;
half in_low = __float2half (in_sep_fl[in_fl_i]);
inp[iin] = __halves2half2 (in_high, in_low);
}
// And do all the PR specific stuff.
// For use later. Can potentially put zeroh, oneh into arg list to
// avoid these operations.
half zeroh = __low2half (zeroh2);
half zeroh_alt = __high2half (zeroh2); // Used in CUDA-8.0 __hgt() workaround, later.
half oneh = __low2half (oneh2);
#ifdef USE_DRONE_BLANKING
half dbthresh = __low2half (dbthreshh2);
half dbthresh_alt = __high2half (dbthreshh2); // CUDA-8.0 __hgt() workaround
// threshold binary.
half thrshhi = __hgt (__high2half (a_h2[i]), dbthresh) ? oneh : zeroh;
half thrshlo = __hgt (__low2half (a_h2[i]), dbthresh_alt) ? oneh : zeroh;
half maskhi = __hlt (__high2half (prevthresh_h2[i]), thrshhi) ? oneh : zeroh;
half masklo = __hlt (__low2half (prevthresh_h2[i]), thrshlo) ? oneh : zeroh;
#endif // USE_DRONE_BLANKING
// Add weighted (__hfma2 is arg1*arg2 + arg3, so this is inp*inv_slow + b*slow
b_h2[i] = __hfma2 (b_h2[i], scale_pr_slow_h2, __hmul2 (inp[iin], inv_scale_pr_slow_h2));
#ifdef USE_DRONE_BLANKING
half b_h2hi = __high2half (b_h2[i]);
half b_h2lo = __low2half (b_h2[i]);
half inputhi = __high2half (inp[iin]);
half inputlo = __low2half (inp[iin]);
b_h2hi = __hgt (maskhi, zeroh) ? inputhi : b_h2hi;
b_h2lo = __hgt (masklo, zeroh_alt) ? inputlo : b_h2lo;
b_h2[i] = __halves2half2 (b_h2lo, b_h2hi);
#endif // USE_DRONE_BLANKING
// now combine inp with b. Result of (inp - b_h2) written into
// in_scaled2:
in_scaled2[i] = __hsub2 (inp[iin], b_h2[i]);
// Another add weighted
a_real_h2[i] = __hfma2 (a_real_h2[i], scale_pr_fast_h2, __hmul2 (in_scaled2[i], inv_scale_pr_fast_h2));
#define PROBLEM_SECTION 1
#ifdef PROBLEM_SECTION
// Another CUDA bug - the line: prevthresh_h2[i] =
// thresholded_h2[i]; upsets what gets written into a_h2 after
// here. Also, the first line: a_real_h2[i] = __hmul2
// (a_real_h2[i], thresholded_h2[i]); also mucks the operation
// up. These problems occur using CUDA 9 on a desktop PC as
// well as on the Tegra TX2.
#ifdef USE_DRONE_BLANKING
// L0 Place two halves into a temporary variable
half2 th = __halves2half2 (thrshlo, thrshhi); // Ok
// L1 wipe out black regions
a_real_h2[i] = __hmul2 (a_real_h2[i], th); // Bad: a_real_h2 is set to 0; th should contain 1.0 and 1.0
// L2 store old threshold values to detect transitions
prevthresh_h2[i] = th;
#endif // USE_DRONE_BLANKING
#endif // PROBLEM_SECTION
// compensate for up / down - negate
a_h2[i] = __hneg2 (a_real_h2[i]);
// src, dst, threshold, value. If > 0.0 then a_h2 unchanged, else, set a_h2 to value.
half a_h2hi = __high2half (a_h2[i]);
half a_h2lo = __low2half (a_h2[i]);
// Note that we have to do a separate comparison of the high part
// and the low part of the half2; I don't think there's any
// getting around this.
a_h2hi = (__hgt (a_h2hi, zeroh) == true) ? a_h2hi : zeroh;
// NB: Here I use a DIFFERENT variable containing 0 for the
// comparison (zeroh_alt). This is a workaround for a bug which
// manifests with CUDA-8.0. The bug does not show when I run this
// code on CUDA-9.0 on a Desktop GPU (sm_61; GTX1070)
a_h2lo = (__hgt (a_h2lo, zeroh_alt) == true) ? a_h2lo : zeroh;
// a_h2[i] is the result of the linlayer process and forms inputs
// for the shift multiply process.
//
// The CUDA-8 bug *shows* here; the correct value appears in
// a_h2[i] for a_h2lo, but 0 always appeared for a_h2hi.
a_h2[i] = __halves2half2 (a_h2lo, a_h2hi);
} else if (mapid == 1) {
// No Delay
a_h2[i] = __hfma2(inp[iin], inv_scale_no_delay_h2, __hmul2(a_h2[i], scale_no_delay_h2));
} else if (mapid == 2) {
// Fast Delay
a_h2[i] = __hfma2(inp[iin], inv_scale_fast_delay_h2, __hmul2(a_h2[i], scale_fast_delay_h2));
} else if (mapid == 3) {
// Slow Delay
a_h2[i] = __hfma2(inp[iin], inv_scale_slow_delay_h2, __hmul2(a_h2[i], scale_slow_delay_h2));
}
} // avdu_monolith_fp16_update()