NVCC: Variable missing when compiling with nvcc? Renaming from cu to cpp and compiling with VS works

Hi,

I have some code here that sets up the memory on the device and launches only a copy kernel.

The problem is that the variables dimBlock and dimGrid are not set correctly. In debug mode

the debugger does not execute the lines where the variables are set.

Instead of x = 32, y = 16, z = 1, Visual Studio shows the following values when the file is compiled with nvcc:

  •   dimBlock	{x=0 y=1 z=16777216 }	dim3
    
      x	0	unsigned int
    
      y	1	unsigned int
    
      z	16777216	unsigned int
    

However, the variables are set correctly when I rename my file to .cpp, comment out the kernel call,

and compile it with the Visual Studio compiler.

I think that it might have something to do with the optimization performed by the nvcc compiler, or is there some other

reason for this behavior?

#include "acceleratedSegmentation.h"

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

#include <math.h>

#include "GrowCut3D_Kernel.cu"

// Logs the CUDA error string to stderr when `status` is not cudaSuccess.
// Returns true on success and false on failure.
static bool cudaSucceeded(cudaError_t status)
{
	if (status == cudaSuccess)
		return true;

	fprintf(stderr, "%s\n", cudaGetErrorString(status));
	return false;
}

// Allocates a pitched 3D device volume of `extent` and fills it with zero bytes.
// On allocation failure `volume->ptr` is reset to 0 so a later cudaFree is a no-op.
static bool allocZeroedVolume(cudaPitchedPtr* volume, cudaExtent extent)
{
	if (!cudaSucceeded(cudaMalloc3D(volume, extent)))
	{
		volume->ptr = 0;
		return false;
	}
	return cudaSucceeded(cudaMemset3D(*volume, 0, extent));
}

// Copies one whole volume (offset 0,0,0 on both sides) with cudaMemcpy3D.
static bool copyVolume(cudaPitchedPtr src, cudaPitchedPtr dst,
					   cudaExtent extent, cudaMemcpyKind kind)
{
	cudaMemcpy3DParms parms = {0};

	parms.srcPos = make_cudaPos(0, 0, 0);
	parms.srcPtr = src;
	parms.dstPos = make_cudaPos(0, 0, 0);
	parms.dstPtr = dst;
	parms.extent = extent;
	parms.kind   = kind;

	return cudaSucceeded(cudaMemcpy3D(&parms));
}

// Uploads the input, seed and label volumes to the device, runs the copy3D
// kernel (input volume -> first label volume) and copies the first label
// volume back to the host.
//
// Parameters:
//   inputVoxelVolume  host volume of dim_x * dim_y * dim_z unsigned chars, tightly packed
//   labelVoxelVolume  host label volume, same layout. NOTE: the result is written
//                     back through this pointer (const is cast away), so the caller
//                     must pass writable memory.
//   seedVoxelVolume   host seed volume, same layout
//   dim_x/y/z         volume size in voxels; must be positive
//
// Errors are reported on stderr. After the first failing step the remaining
// steps are skipped, but all device memory allocated so far is still released.
extern "C" void acceleratedSegmentation(const unsigned char* inputVoxelVolume,
										const unsigned char* labelVoxelVolume,
										const unsigned char* seedVoxelVolume,
										int dim_x, int dim_y, int dim_z)
{
	if (inputVoxelVolume == 0 || labelVoxelVolume == 0 || seedVoxelVolume == 0 ||
		dim_x <= 0 || dim_y <= 0 || dim_z <= 0)
	{
		fprintf(stderr, "acceleratedSegmentation: invalid arguments\n");
		return;
	}

	// Extents: width is in bytes, height and depth are in rows / slices.
	const cudaExtent voxelVolumeDim =
		make_cudaExtent(dim_x * sizeof(unsigned char), dim_y, dim_z);
	const cudaExtent voxelVolumeDistanceDim =
		make_cudaExtent(dim_x * sizeof(float1) + 128,  // border
						dim_y,
						dim_z);

	// Host volumes are tightly packed, so the pitch equals the row width in bytes.
	// Each descriptor wraps the buffer its name refers to; label and seed must not
	// be crossed because the result is copied back through hostLabelVolumePitch3D.
	const size_t hostRowBytes = dim_x * sizeof(unsigned char);
	cudaPitchedPtr hostInputVolumePitch3D =
		make_cudaPitchedPtr((void*)inputVoxelVolume, hostRowBytes, hostRowBytes, dim_y);
	cudaPitchedPtr hostLabelVolumePitch3D =
		make_cudaPitchedPtr((void*)labelVoxelVolume, hostRowBytes, hostRowBytes, dim_y);
	cudaPitchedPtr hostSeedVolumePitch3D =
		make_cudaPitchedPtr((void*)seedVoxelVolume, hostRowBytes, hostRowBytes, dim_y);

	// Device volumes start zeroed so the cleanup below is safe on every path.
	cudaPitchedPtr deviceInputVolumePitch3D      = {0};
	cudaPitchedPtr deviceSeedVolumePitch3D       = {0};
	cudaPitchedPtr deviceLabelVolumePitch3D_1    = {0};
	cudaPitchedPtr deviceLabelVolumePitch3D_2    = {0};
	cudaPitchedPtr deviceDistanceVolumePitch3D_1 = {0};
	cudaPitchedPtr deviceDistanceVolumePitch3D_2 = {0};

	// Allocate and clear all device volumes; && stops at the first failure.
	bool ok = allocZeroedVolume(&deviceInputVolumePitch3D,      voxelVolumeDim)
		   && allocZeroedVolume(&deviceSeedVolumePitch3D,       voxelVolumeDim)
		   && allocZeroedVolume(&deviceLabelVolumePitch3D_1,    voxelVolumeDim)
		   && allocZeroedVolume(&deviceLabelVolumePitch3D_2,    voxelVolumeDim)
		   && allocZeroedVolume(&deviceDistanceVolumePitch3D_1, voxelVolumeDistanceDim)
		   && allocZeroedVolume(&deviceDistanceVolumePitch3D_2, voxelVolumeDistanceDim);

	// Host -> device uploads.
	ok = ok
	  && copyVolume(hostInputVolumePitch3D, deviceInputVolumePitch3D,
					voxelVolumeDim, cudaMemcpyHostToDevice)
	  && copyVolume(hostSeedVolumePitch3D, deviceSeedVolumePitch3D,
					voxelVolumeDim, cudaMemcpyHostToDevice)
	  && copyVolume(hostLabelVolumePitch3D, deviceLabelVolumePitch3D_1,
					voxelVolumeDim, cudaMemcpyHostToDevice);

	if (ok)
	{
		// Block and grid size are set through the dim3 constructors.
		dim3 dimBlock(32, 16, 1);

		// NOTE(review): integer division truncates, so voxels beyond the last full
		// 32x16 tile are not covered and a dimension smaller than the block size
		// yields a grid size of 0 (invalid launch). Rounding up is only safe if
		// copy3D bounds-checks its indices - confirm against GrowCut3D_Kernel.cu.
		// NOTE(review): devices of compute capability 1.x require gridDim.z == 1;
		// presumably dim_z > 1 fails to launch there - confirm the target hardware.
		dim3 dimGrid(dim_x / dimBlock.x, dim_y / dimBlock.y, dim_z / dimBlock.z);

		copy3D<<<dimGrid, dimBlock>>>((unsigned char*)deviceInputVolumePitch3D.ptr,
									  (unsigned char*)deviceLabelVolumePitch3D_1.ptr,
									  deviceInputVolumePitch3D.pitch);

		// A kernel launch returns no status; launch errors must be fetched explicitly.
		ok = cudaSucceeded(cudaGetLastError());
	}

	// Device -> host: copy back the calculated label volume.
	if (ok)
	{
		copyVolume(deviceLabelVolumePitch3D_1, hostLabelVolumePitch3D,
				   voxelVolumeDim, cudaMemcpyDeviceToHost);
	}

	// Release all device memory; cudaFree on a null pointer performs no operation.
	cudaSucceeded(cudaFree(deviceInputVolumePitch3D.ptr));
	cudaSucceeded(cudaFree(deviceSeedVolumePitch3D.ptr));
	cudaSucceeded(cudaFree(deviceLabelVolumePitch3D_1.ptr));
	cudaSucceeded(cudaFree(deviceLabelVolumePitch3D_2.ptr));
	cudaSucceeded(cudaFree(deviceDistanceVolumePitch3D_1.ptr));
	cudaSucceeded(cudaFree(deviceDistanceVolumePitch3D_2.ptr));
}

I found the problem … as I guessed, it has something to do with the optimization performed by nvcc.

The combination of nvcc and the Visual Studio compiler regards the variables dimBlock and dimGrid
as unused or never read in my program, so the memory for them is probably not allocated and my program
fails.

This only happens when the optimization is set to “/O2” (the default) in the “CUDA Build Rule”. After changing
this setting to “/Od” (no optimization), my code works.

Is there something wrong with my code or is this a problem of “nvcc” or the “build file”?