TensorRT engine giving wrong/different output in DeepStream

  1. You will get the wrong number of outputs if the output parser is not correct, so I would suggest double-checking the logic there. Keep in mind that your bbox parsing function receives the buffer for one frame of the batch per call, not the buffer for the entire batch, in case you were not already aware. Also double-check the preprocessing config params: do they match your other setup as well?

  2. You need to decouple a few things here. Either do the clustering in your custom bbox parser and set "cluster-mode=4" ("NONE", i.e. no clustering) so that the plugin does not perform another round of clustering with its default OpenCV groupRectangles algorithm, or perform no clustering in the bbox parsing library and let the nvinfer plugin handle everything.
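For the first option, the relevant nvinfer config keys might look like the fragment below (key names per the DeepStream 5.0 nvinfer plugin; the library path is a placeholder, not from this thread):

```ini
[property]
# 4 = no clustering inside nvinfer; the custom parser's own NMS is final
cluster-mode=4
# bbox parsing function exported with C linkage from your library
parse-bbox-func-name=NvDsInferParseCustomFD
custom-lib-path=<path to your parser .so>
```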


@CJR Also, I checked the logic in the output parser; it is exactly the same as the way I am doing inference in TensorRT, and the output in TRT with that parsing is perfect.

I have double-checked my bbox parsing function, which mainly does NMS, but the outputs fetched from the buffer seem to be incorrect. I did not use any parser to create the TRT engine; I used the tensorrtx/yolov5 repo. Or you can give me a direction to move in.
@CJR Below is the only config file in the DeepStream app for the yolov5 detector (I added cluster-mode=4, which did not make any difference). I am using exactly the same method in the bbox parser to parse the output of the TRT engine as I used for TensorRT inference:



This is the parsing function. For the NMS to work, the buffer must contain correct values, and I don't think it currently does, as the number of bboxes detected is way too high:

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iostream>
#include <map>
#include <vector>
#include "nvdsinfer_custom_impl.h"

#define NMS_THRESH 0.5
#define CONF_THRESH 0.4

static constexpr int LOCATIONS = 4;
struct alignas(float) Detection{
    //center_x center_y w h
    float bbox[LOCATIONS];
    float conf;  // bbox_conf * cls_conf
    float class_id;
};

// stuff we know about the network and the input/output blobs
static const int INPUT_H = 608;
static const int INPUT_W = 608;
static const int MAX_OUTPUT_BBOX_COUNT = 1000;
static const int OUTPUT_SIZE = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than 1000 boxes with conf >= 0.1

/* C-linkage to prevent name-mangling */
extern "C"
bool NvDsInferParseCustomFD (
         std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
         NvDsInferNetworkInfo  const &networkInfo,
         NvDsInferParseDetectionParams const &detectionParams,
         std::vector<NvDsInferObjectDetectionInfo> &objectList);

static float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
    };

    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
    return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] - interBoxS);
}

static bool cmp(const Detection& a, const Detection& b) {
    return a.conf > b.conf;
}

static void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::cout << "detected before nms -> " << output[0] << std::endl;
    std::map<float, std::vector<Detection>> m;
    // output[0] holds the candidate count; the packed Detection structs follow.
    for (int i = 0; i < output[0] && i < MAX_OUTPUT_BBOX_COUNT; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh) continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    // Per-class NMS: keep the highest-confidence box, drop overlapping ones.
    for (auto it = m.begin(); it != m.end(); it++) {
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t i = 0; i < dets.size(); ++i) {
            auto& item = dets[i];
            res.push_back(item);
            for (size_t n = i + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

extern "C"
bool NvDsInferParseCustomFD (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                   NvDsInferNetworkInfo  const &networkInfo,
                                   NvDsInferParseDetectionParams const &detectionParams,
                                   std::vector<NvDsInferObjectDetectionInfo> &objectList) {
    static int decodeIndex = -1;

    /* Find the decode layer */
    if (decodeIndex == -1) {
        for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
            if (strcmp(outputLayersInfo[i].layerName, "prob") == 0) {
                decodeIndex = i;
                std::cout << "Found decode layer buffer while parsing" << decodeIndex << std::endl;
            }
            std::cout << outputLayersInfo[i].layerName << " " << std::endl;
        }
        if (decodeIndex == -1) {
            std::cerr << "Could not find decode layer buffer while parsing" << std::endl;
            return false;
        }
    }

    // Host memory for "decode"
    float* out_decode = (float *) outputLayersInfo[decodeIndex].buffer;
    const int batch_id = 0;
    const int out_class_size = detectionParams.numClassesConfigured;
    const float threshold = detectionParams.perClassThreshold[0];
    std::cout << "out_class_size: " << out_class_size << std::endl;
    std::cout << "threshold: " << threshold << std::endl;

    std::vector<Detection> res;
    nms(res, &out_decode[0], CONF_THRESH, NMS_THRESH);
    std::cout << "after nms -> " << res.size() << std::endl;

    for (size_t j = 0; j < res.size(); j++) {
        if (res[j].conf < 0.1) continue;
        NvDsInferObjectDetectionInfo object;
        object.classId = res[j].class_id;
        object.detectionConfidence = res[j].conf;

        /* Convert center-format box to top-left format at network resolution */
        float left = res[j].bbox[0] - res[j].bbox[2]/2.f;
        float top = res[j].bbox[1] - res[j].bbox[3]/2.f;
        object.left = left;
        object.top = top;
        object.width = res[j].bbox[2];
        object.height = res[j].bbox[3];
        objectList.push_back(object);
    }
    return true;
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomFD);
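As a sanity check on the parser logic, the center-format IoU used inside the NMS can be exercised in isolation. This is a standalone sketch that duplicates the iou() helper rather than linking the parser library:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>

// Same center_x/center_y/w/h IoU as the parser's iou() helper.
static float iou(const float lbox[4], const float rbox[4]) {
    float left   = std::max(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    float right  = std::min(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    float top    = std::max(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    float bottom = std::min(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);
    if (top > bottom || left > right) return 0.0f;  // no overlap
    float inter = (right - left) * (bottom - top);
    return inter / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - inter);
}
```

With boxes {0,0,2,2} and {1,0,2,2}, the intersection is 1x2 = 2 and the union is 4 + 4 - 2 = 6, so the IoU is 1/3; if that does not hold in your build, the decode buffer layout, not the NMS, is the place to look.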

Below is the terminal output:

root@91fbcc5a3a74:/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img# ./deepstream-custom -c yolo_pgie_config.txt -i samples/bus.jpg 
Now playing: yolo_pgie_config.txt
WARNING: ../nvdsinfer/nvdsinfer_func_utils.cpp:34 [TRT]: Current optimization profile is: 0. Please ensure there are no enqueued operations pending in this context prior to switching profiles
0:00:02.430127300 14944 0x563e38873360 INFO                 nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::deserializeEngineAndBackend() <nvdsinfer_context_impl.cpp:1577> [UID = 1]: deserialized trt engine from :/opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img/models/yolov5s.engine
INFO: ../nvdsinfer/nvdsinfer_model_builder.cpp:685 [Implicit Engine Info]: layers num: 2
0   INPUT  kFLOAT data            3x608x608       
1   OUTPUT kFLOAT prob            6001x1x1        

0:00:02.430234100 14944 0x563e38873360 INFO                 nvinfer gstnvinfer.cpp:602:gst_nvinfer_logger:<primary-nvinference-engine> NvDsInferContext[UID 1]: Info from NvDsInferContextImpl::generateBackendContext() <nvdsinfer_context_impl.cpp:1681> [UID = 1]: Use deserialized engine model: /opt/nvidia/deepstream/deepstream-5.0/sources/apps/deepstream-yolov5-img/models/yolov5s.engine
0:00:02.431288464 14944 0x563e38873360 INFO                 nvinfer gstnvinfer_impl.cpp:311:notifyLoadModelStatus:<primary-nvinference-engine> [UID 1]: Load new model:yolo_pgie_config.txt sucessfully
Found decode layer buffer while parsing0
out_class_size: 80
threshold: 0.3
detected before nms -> 8295
after nms -> 0
End of stream
Returned, stopping playback
Deleting pipeline
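The [Implicit Engine Info] line above is at least consistent with the parser's constants: a Detection packs 6 floats (4 box coordinates, conf, class_id), and the yololayer emits one count float plus up to 1000 structs, i.e. 1000 x 6 + 1 = 6001, which matches the "prob 6001x1x1" output binding. A standalone check of that arithmetic:

```cpp
#include <cassert>

static constexpr int LOCATIONS = 4;
struct alignas(float) Detection {
    float bbox[LOCATIONS];  // center_x, center_y, w, h
    float conf;
    float class_id;
};

static const int MAX_OUTPUT_BBOX_COUNT = 1000;
// Index 0 carries the detection count; 1000 packed Detection structs follow.
static const int OUTPUT_SIZE =
    MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float) + 1;
```

So the binding shape is right; the problem is the values inside the buffer, not its size.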

TRT inference:

detected before NMS => 84

Thanks. Let me know if you have any suggestions!!

@CJR cluster-mode=2 doesn't work on my end either. With cluster-mode=2, do I still need a parsing function?


  1. I checked the preprocessing step; the setup is the same as in TRT inference. I double-checked the parsing function, which is also the same as in TRT inference.

  2. I tried cluster-mode=4, which did not make any difference to the wrong output.

Let me know if you have any pointers for me!!

There seems to be a bug in the open-source code you are following. The kernels in the plugin layer should use the same incoming CUDA stream that the plugin receives in the call to "enqueue". This ensures in-order execution of all the kernels in the entire network.
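The shape of that fix can be sketched as follows. This is an illustrative standalone sketch, not the actual tensorrtx plugin code; the kernel and function names are placeholders:

```cuda
#include <cuda_runtime.h>

// Illustrative kernel standing in for the plugin's decode kernel.
__global__ void CalDetection(const float* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}

// Every operation inside enqueue() must use the cudaStream_t that TensorRT
// passes in, so the plugin's work stays ordered with the rest of the network.
int enqueueSketch(int n, const float* d_in, float* d_out, cudaStream_t stream) {
    // Async memset on the SAME stream. A plain cudaMemset() would execute on
    // the default stream and could race with kernels enqueued on `stream`.
    cudaMemsetAsync(d_out, 0, n * sizeof(float), stream);
    const int threads = 256;
    // Launch on the incoming stream, never on the default stream.
    CalDetection<<<(n + threads - 1) / threads, threads, 0, stream>>>(d_in, d_out, n);
    return 0;
}
```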

@CJR This is the change that I made in the yolov5 TensorRT kernel, referring to objectDetector_Yolo in the DS sample apps:

CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
            (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 

But this did not make any difference! I don't know if I did this the right way, but this change gave me correct outputs in TensorRT.

Can you also make the memset op there async and use the same CUDA stream for that operation?

@CJR Can you tell me more about this? I don't actually know how to make memset async; is there some resource or something?

Replace that call to memset with cudaMemsetAsync. See the API documentation here.

@CJR I made this change, which gives correct results in TensorRT but strange results in DeepStream:

for (int idx = 0; idx < batchSize; ++idx) {
    CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
}

But in DeepStream the output buffer gives 0 boxes:

float* out_decode = (float *) outputLayersInfo[yoloLayerIndex].buffer;
std::cout << "detected before nms -> " << out_decode[0] << std::endl;


detected before nms -> 0

Does the TensorRT inference work if you change this line to

CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
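For background on why this flag matters (my reading of the suggestion, not stated explicitly in the thread): a stream created with cudaStreamNonBlocking does not implicitly synchronize with the legacy default stream, so any plugin operation accidentally issued on the default stream is no longer ordered with the network's kernels. DeepStream's nvinfer runs inference on its own streams, which is how a hidden default-stream dependency can pass in a standalone TensorRT app yet fail in DeepStream. A minimal sketch:

```cuda
#include <cuda_runtime.h>

// Work on a non-blocking stream is NOT implicitly ordered against the
// legacy default stream, so a stray cudaMemset() or a kernel launched
// without a stream argument can race with it. Testing the standalone app
// with such a stream reproduces the conditions nvinfer runs under.
cudaStream_t makeNonBlockingStream() {
    cudaStream_t stream = nullptr;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    return stream;
}
```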

@CJR Yes! Keeping the previous changes as is, it works perfectly fine in TensorRT.

@CJR Hey, so I am trying to debug this by trial and error. If you have any suggestions, please let me know! Thanks.

@y14uc339 Did you solve the problem? Did you have any luck?

Thanks in advance.

@hidayat.rhman Nope. Not yet.

@y14uc339 I am working on the same issue.
I tried to export a yolov5 TRT engine using tensorrtx. When I load this engine in DeepStream, I get this error:
ERROR: [TRT]: INVALID_ARGUMENT: getPluginCreator could not find plugin YoloLayer_TRT version 1
Have you encountered similar problems? Or could you give me some advice about how to load a yolov5 engine file in DeepStream?


@y14uc339 @hidayat.rhman Any luck with using the tensorrtx repo with DeepStream? I too am finding that it works fine in TRT, but in DeepStream the 'prob' data passed to the custom parser does not match the data in TRT.


@kbass Nope

+1 Need help!