Hello,
I am very new to multi-threaded/parallel programming with CUDA. I am experimenting with one of the code bases developed by NVIDIA, available on GitHub: dusty-nv/jetson-inference (the "Hello AI World" guide to deploying deep-learning inference networks and deep vision primitives with TensorRT and NVIDIA Jetson).
I want to use the data produced inside the main processing loop of one of the example programs, for example https://github.com/dusty-nv/jetson-inference/blob/master/detectnet-camera/detectnet-camera.cpp
Here is the code inside the loop:
// Main processing loop: runs until signal_recieved becomes true.
// Each iteration captures a frame, converts it to RGBA, runs detectNet on it,
// draws the detected boxes into the image and, if a display exists, renders it.
while( !signal_recieved )
{
	void* imgCPU  = NULL;
	void* imgCUDA = NULL;

	// get the latest frame
	// NOTE(review): the third argument (1000) is presumably a timeout in milliseconds -- confirm.
	if( !camera->Capture(&imgCPU, &imgCUDA, 1000) )
	{
		printf("\ndetectnet-camera: failed to capture frame\n");
		continue;	// imgCUDA may still be NULL; skip this frame instead of processing it
	}

	// convert from YUV to RGBA
	void* imgRGBA = NULL;

	if( !camera->ConvertRGBA(imgCUDA, &imgRGBA) )
	{
		printf("detectnet-camera: failed to convert from NV12 to RGBA\n");
		continue;	// imgRGBA may still be NULL; skip detection and rendering for this frame
	}

	// classify image with detectNet
	// numBoundingBoxes is passed by pointer: it goes in as maxBoxes and is read back
	// below as the number of boxes to iterate over.
	int numBoundingBoxes = maxBoxes;

	if( net->Detect((float*)imgRGBA, camera->GetWidth(), camera->GetHeight(), bbCPU, &numBoundingBoxes, confCPU))
	{
		printf("%i bounding boxes detected\n", numBoundingBoxes);

		// Boxes are drawn in runs: lastStart is the index where the current run began
		// and lastClass is the class value that run is drawn with.
		int lastClass = 0;
		int lastStart = 0;

		for( int n=0; n < numBoundingBoxes; n++ )
		{
			// value used as the class of box n (second entry of each pair in confCPU)
			const int nc = confCPU[n*2+1];

			// four floats per box; printed below as (bb[0], bb[1]) (bb[2], bb[3])
			float* bb = bbCPU + (n * 4);

			printf("bounding box %i (%f, %f) (%f, %f) w=%f h=%f\n", n, bb[0], bb[1], bb[2], bb[3], bb[2] - bb[0], bb[3] - bb[1]);

			// flush the current run when the class changes or the last box is reached
			if( nc != lastClass || n == (numBoundingBoxes - 1) )
			{
				if( !net->DrawBoxes((float*)imgRGBA, (float*)imgRGBA, camera->GetWidth(), camera->GetHeight(),
								bbCUDA + (lastStart * 4), (n - lastStart) + 1, lastClass) )
					printf("detectnet-camera: failed to draw boxes\n");

				lastClass = nc;
				lastStart = n;

				/* ADD SOME CPU PROCESSING CODE HERE !! */

				// Block the host thread until all GPU work issued so far has completed.
				// NOTE(review): DrawBoxes presumably queues GPU work that reads bbCUDA; if so,
				// this wait ensures drawing has finished before the box data can be
				// overwritten by the next Detect() call -- confirm against detectNet.
				CUDA(cudaDeviceSynchronize());
			}
		}

		/*if( font != NULL )
		{
			char str[256];
			sprintf(str, "%05.2f%% %s", confidence * 100.0f, net->GetClassDesc(img_class));
			font->RenderOverlay((float4*)imgRGBA, (float4*)imgRGBA, camera->GetWidth(), camera->GetHeight(),
							str, 10, 10, make_float4(255.0f, 255.0f, 255.0f, 255.0f));
		}*/

		if( display != NULL )
		{
			// update the window title; snprintf bounds the write to the buffer size
			char str[256];
			snprintf(str, sizeof(str), "TensorRT build %x | %s | %04.1f FPS", NV_GIE_VERSION, net->HasFP16() ? "FP16" : "FP32", display->GetFPS());
			//sprintf(str, "GIE build %x | %s | %04.1f FPS | %05.2f%% %s", NV_GIE_VERSION, net->GetNetworkName(), display->GetFPS(), confidence * 100.0f, net->GetClassDesc(img_class));
			display->SetTitle(str);
		}
	}

	// update display
	if( display != NULL )
	{
		display->UserEvents();
		display->BeginRender();

		if( texture != NULL )
		{
			// rescale image pixel intensities for display (in place, [0,255] -> [0,1])
			CUDA(cudaNormalizeRGBA((float4*)imgRGBA, make_float2(0.0f, 255.0f),
							   (float4*)imgRGBA, make_float2(0.0f, 1.0f),
							   camera->GetWidth(), camera->GetHeight()));

			// map from CUDA to openGL using GL interop
			void* tex_map = texture->MapCUDA();

			if( tex_map != NULL )
			{
				// check the copy like every other CUDA runtime call in this loop
				CUDA(cudaMemcpy(tex_map, imgRGBA, texture->GetSize(), cudaMemcpyDeviceToDevice));
				texture->Unmap();
			}

			// draw the texture
			texture->Render(100,100);
		}

		display->EndRender();
	}
}
- What does the cudaDeviceSynchronize() call (line 42, inside the bounding-box loop) do? I added some code at line 45 and everything seems to run asynchronously.
- What happens if I add a piece of code that runs on the CPU at line 41, i.e., before the call to cudaDeviceSynchronize()? Can I ensure that the data obtained before line 41 (e.g., the value of 'bb' here) will not change until my CPU code has run? If so, how? Any 'pointers' (pun intended) would help.
Sorry for the complicated post. I am very confused as I have no CUDA experience/exposure.
Many thanks in advance.