I am trying to convert my network into FP16 mode where the network can also run cuda kernel for IPluginExt in FP16 mode too for which I am using __half datatype but context.enqeue doesnt give me the right values after inference, it seems like it gives garbage values but when I use float instead of __half, the model works fine and Inference gives me correct output. My Inference Code is given below:
void doInference(IExecutionContext& context, __half* input, __half* output, int batch_size)
{
const ICudaEngine& engine = context.getEngine();
assert(engine.getNbBindings() == 2);
void* buffers[2];
const int input_bind = engine.getBindingIndex(INPUT_NAME);
const int output_bind = engine.getBindingIndex(OUTPUT_NAME);
cout << "Size of __half data type is : " << sizeof(__half) << endl;
CudaSafeCall(cudaMalloc(&buffers[input_bind], INPUT_SIZE*sizeof(__half)));
CudaSafeCall(cudaMalloc(&buffers[output_bind], OUTPUT_SIZE*sizeof(__half)));
cudaStream_t stream;
cudaStreamCreate(&stream);
cout << "Address of buffer[input_bind] is " << buffers[input_bind] << endl;
cout << "Address of buffer[output_bind] is " << buffers[output_bind] << endl;
CudaSafeCall(cudaMemcpyAsync(buffers[input_bind], input, INPUT_SIZE*sizeof(__half), cudaMemcpyHostToDevice, stream));
bool result = context.enqueue(batch_size, buffers, stream, nullptr);
if (result)
cout << "Enqeue was successful" << endl;
cout << "Address of buffer[output_bind] after enqeue is " << buffers[output_bind] << endl;
cout << "Size of buffers[output_bind] is : " << sizeof(buffers[output_bind]) << endl;
CudaSafeCall(cudaMemcpyAsync(output, buffers[output_bind], OUTPUT_SIZE*sizeof(__half), cudaMemcpyDeviceToHost, stream));
for (int i=0; i<OUTPUT_SIZE; i++)
cout << fp16::__half2float(output[i]) << endl;
CudaCheckError();
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);
cudaFree(buffers[input_bind]);
cudaFree(buffers[output_bind]);
}
And my enqueue of IPluginExt is like this:
virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override
{
cout << "Address inside kernel for INPUT: " << inputs[0] << endl;
cout << "Address inside kernel for OUTPUT: " << outputs[0] << endl;
interp_gpu( (const __half*)inputs[0], mInputDims.d[3], mInputDims.d[2], mInputDims.d[1], batchSize, (__half *)outputs[0], b_x, b_y, b_z, stream );
return 0;
}
where inter_gpu is my custom kernel which takes in __half input