CUDA beginner: understanding the workflow of CUDA kernels and cudaDeviceSynchronize()


I am very new to multi-threaded/parallel programming with CUDA. I am experimenting with one of the sample programs developed by NVIDIA, available at

I want to use the data produced inside the main processing loop of one of the files, for example

Here is the code inside the loop:

while( !signal_recieved )
		void* imgCPU  = NULL;
		void* imgCUDA = NULL;
		// get the latest frame
		if( !camera->Capture(&imgCPU, &imgCUDA, 1000) )
			printf("\ndetectnet-camera:  failed to capture frame\n");

		// convert from YUV to RGBA
		void* imgRGBA = NULL;
		if( !camera->ConvertRGBA(imgCUDA, &imgRGBA) )
			printf("detectnet-camera:  failed to convert from NV12 to RGBA\n");

		// classify image with detectNet
		int numBoundingBoxes = maxBoxes;
		if( net->Detect((float*)imgRGBA, camera->GetWidth(), camera->GetHeight(), bbCPU, &numBoundingBoxes, confCPU))
			printf("%i bounding boxes detected\n", numBoundingBoxes);
			int lastClass = 0;
			int lastStart = 0;
			for( int n=0; n < numBoundingBoxes; n++ )
				const int nc = confCPU[n*2+1];
				float* bb = bbCPU + (n * 4);
				printf("bounding box %i   (%f, %f)  (%f, %f)  w=%f  h=%f\n", n, bb[0], bb[1], bb[2], bb[3], bb[2] - bb[0], bb[3] - bb[1]); 
				if( nc != lastClass || n == (numBoundingBoxes - 1) )
					if( !net->DrawBoxes((float*)imgRGBA, (float*)imgRGBA, camera->GetWidth(), camera->GetHeight(), 
						                        bbCUDA + (lastStart * 4), (n - lastStart) + 1, lastClass) )
						printf("detectnet-console:  failed to draw boxes\n");
					lastClass = nc;
					lastStart = n;
			/*if( font != NULL )
				char str[256];
				sprintf(str, "%05.2f%% %s", confidence * 100.0f, net->GetClassDesc(img_class));
				font->RenderOverlay((float4*)imgRGBA, (float4*)imgRGBA, camera->GetWidth(), camera->GetHeight(),
								    str, 10, 10, make_float4(255.0f, 255.0f, 255.0f, 255.0f));
			if( display != NULL )
				char str[256];
				sprintf(str, "TensorRT build %x | %s | %04.1f FPS", NV_GIE_VERSION, net->HasFP16() ? "FP16" : "FP32", display->GetFPS());
				//sprintf(str, "GIE build %x | %s | %04.1f FPS | %05.2f%% %s", NV_GIE_VERSION, net->GetNetworkName(), display->GetFPS(), confidence * 100.0f, net->GetClassDesc(img_class));

		// update display
		if( display != NULL )

			if( texture != NULL )
				// rescale image pixel intensities for display
				CUDA(cudaNormalizeRGBA((float4*)imgRGBA, make_float2(0.0f, 255.0f), 
								   (float4*)imgRGBA, make_float2(0.0f, 1.0f), 
		 						   camera->GetWidth(), camera->GetHeight()));

				// map from CUDA to openGL using GL interop
				void* tex_map = texture->MapCUDA();

				if( tex_map != NULL )
					cudaMemcpy(tex_map, imgRGBA, texture->GetSize(), cudaMemcpyDeviceToDevice);

				// draw the texture

  1. What does cudaDeviceSynchronize() on line 42 do? I added some code at line 45 and everything seems to run asynchronously.
  2. What happens if I add a piece of code that runs on the CPU at line 41, i.e., before the call to cudaDeviceSynchronize()? Can I ensure that the data obtained before line 41, e.g. the value of 'bb' here, will not change until my CPU code has run? If so, how? Any 'pointers' (pun intended) would help.

Sorry for the complicated post. I am very confused as I have no CUDA experience/exposure.

Many thanks in advance.