Hi,
I have some code here that sets up the memory on the device and launches only a copy kernel.
The problem is that the variables dimBlock and dimGrid are not set correctly. In debug mode,
the debugger doesn't execute the lines where the variables are set.
Instead of x = 32, y = 16, z = 1, Visual Studio shows the following values when the file is compiled with nvcc:
-
dimBlock {x=0 y=1 z=16777216 } dim3 x 0 unsigned int y 1 unsigned int z 16777216 unsigned int
However, the variables are set correctly when I rename my file to .cpp, comment out the kernel call,
and compile it with the Visual Studio compiler.
I think it might have something to do with optimization by the nvcc compiler, or is there some other
reason for this behavior?
#include "acceleratedSegmentation.h"
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>
#include "GrowCut3D_Kernel.cu"
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess so that calls can be chained with &&.
static bool cudaSucceeded(cudaError_t status)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s\n", cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Allocates a pitched 3D device buffer of the given extent and fills it with zero bytes.
// On allocation failure the pointer is reset to NULL so a later cudaFree is a no-op.
static bool allocateZeroedVolume(cudaPitchedPtr* volume, cudaExtent extent)
{
    if (!cudaSucceeded(cudaMalloc3D(volume, extent)))
    {
        volume->ptr = NULL;
        return false;
    }
    return cudaSucceeded(cudaMemset3D(*volume, 0, extent));
}

// Copies a region of size `extent` from `src` to `dst`, both starting at position (0, 0, 0).
static bool copyVolume(cudaPitchedPtr src, cudaPitchedPtr dst,
                       cudaExtent extent, cudaMemcpyKind kind)
{
    cudaMemcpy3DParms parms = {0};
    parms.srcPos = make_cudaPos(0, 0, 0);
    parms.srcPtr = src;
    parms.dstPos = make_cudaPos(0, 0, 0);
    parms.dstPtr = dst;
    parms.extent = extent;
    parms.kind   = kind;
    return cudaSucceeded(cudaMemcpy3D(&parms));
}

/*
 * Uploads the input, seed and label volumes to the device, launches the copy3D
 * kernel on the input and first label buffer, and copies the first label buffer
 * back into the host label volume.
 *
 * Parameters:
 *   inputVoxelVolume  host volume of dim_x * dim_y * dim_z bytes, tightly packed (read)
 *   labelVoxelVolume  host volume of the same size; uploaded, then OVERWRITTEN with
 *                     the device label buffer after the kernel has run
 *   seedVoxelVolume   host volume of the same size (read)
 *   dim_x/y/z         volume dimensions in voxels; must all be positive
 *
 * NOTE(review): labelVoxelVolume is declared const in the exported signature but is
 * written to; the signature is kept unchanged for existing callers.
 *
 * Errors are reported on stderr. After the first failing CUDA call the remaining
 * work is skipped, and all device buffers are released before returning.
 */
extern "C" void acceleratedSegmentation(const unsigned char* inputVoxelVolume,
                                        const unsigned char* labelVoxelVolume,
                                        const unsigned char* seedVoxelVolume,
                                        int dim_x, int dim_y, int dim_z)
{
    if (inputVoxelVolume == NULL || labelVoxelVolume == NULL || seedVoxelVolume == NULL)
    {
        fprintf(stderr, "acceleratedSegmentation: volume pointers must not be NULL\n");
        return;
    }
    if (dim_x <= 0 || dim_y <= 0 || dim_z <= 0)
    {
        fprintf(stderr, "acceleratedSegmentation: volume dimensions must be positive\n");
        return;
    }

    // Launch configuration, built through dim3's constructor instead of
    // member-by-member assignment.
    const dim3 dimBlock(32, 16, 1);
    // The grid uses truncating division because copy3D is not given the volume
    // dimensions; rounding up could launch threads outside the volume.
    // NOTE(review): pass the dimensions to the kernel and switch to ceil-division
    // once copy3D bounds-checks its indices.
    const dim3 dimGrid(dim_x / dimBlock.x, dim_y / dimBlock.y, dim_z / dimBlock.z);
    if (dim_x % dimBlock.x != 0 || dim_y % dimBlock.y != 0 || dim_z % dimBlock.z != 0)
    {
        fprintf(stderr, "acceleratedSegmentation: dimensions are not multiples of the "
                        "block size (%u x %u x %u); trailing voxels are not covered by the grid\n",
                dimBlock.x, dimBlock.y, dimBlock.z);
    }

    // Extents: width is in bytes, height and depth are in elements.
    const cudaExtent voxelVolumeDim =
        make_cudaExtent(dim_x * sizeof(unsigned char), dim_y, dim_z);
    const cudaExtent voxelVolumeDistanceDim =
        make_cudaExtent(dim_x * sizeof(float1) + 128 /* border */, dim_y, dim_z);

    // Host volumes are tightly packed: row pitch == row width == dim_x bytes.
    const size_t hostRowBytes = dim_x * sizeof(unsigned char);
    cudaPitchedPtr hostInputVolumePitch3D =
        make_cudaPitchedPtr((void*)inputVoxelVolume, hostRowBytes, hostRowBytes, dim_y);
    // Each wrapper refers to the parameter of the same name (they were crossed before).
    cudaPitchedPtr hostLabelVolumePitch3D =
        make_cudaPitchedPtr((void*)labelVoxelVolume, hostRowBytes, hostRowBytes, dim_y);
    cudaPitchedPtr hostSeedVolumePitch3D =
        make_cudaPitchedPtr((void*)seedVoxelVolume, hostRowBytes, hostRowBytes, dim_y);

    // Device buffers; zero-initialised so cleanup can free them unconditionally.
    cudaPitchedPtr deviceInputVolumePitch3D      = {0};
    cudaPitchedPtr deviceSeedVolumePitch3D       = {0};
    cudaPitchedPtr deviceLabelVolumePitch3D_1    = {0};
    cudaPitchedPtr deviceLabelVolumePitch3D_2    = {0};
    cudaPitchedPtr deviceDistanceVolumePitch3D_1 = {0};
    cudaPitchedPtr deviceDistanceVolumePitch3D_2 = {0};

    // Allocate everything, then upload; && stops at the first failure.
    bool ok = allocateZeroedVolume(&deviceInputVolumePitch3D, voxelVolumeDim)
           && allocateZeroedVolume(&deviceSeedVolumePitch3D, voxelVolumeDim)
           && allocateZeroedVolume(&deviceLabelVolumePitch3D_1, voxelVolumeDim)
           && allocateZeroedVolume(&deviceLabelVolumePitch3D_2, voxelVolumeDim)
           && allocateZeroedVolume(&deviceDistanceVolumePitch3D_1, voxelVolumeDistanceDim)
           && allocateZeroedVolume(&deviceDistanceVolumePitch3D_2, voxelVolumeDistanceDim)
           && copyVolume(hostInputVolumePitch3D, deviceInputVolumePitch3D,
                         voxelVolumeDim, cudaMemcpyHostToDevice)
           && copyVolume(hostSeedVolumePitch3D, deviceSeedVolumePitch3D,
                         voxelVolumeDim, cudaMemcpyHostToDevice)
           && copyVolume(hostLabelVolumePitch3D, deviceLabelVolumePitch3D_1,
                         voxelVolumeDim, cudaMemcpyHostToDevice);

    if (ok)
    {
        // copy3D is defined outside this function (presumably in GrowCut3D_Kernel.cu).
        copy3D<<<dimGrid, dimBlock>>>((unsigned char*)deviceInputVolumePitch3D.ptr,
                                      (unsigned char*)deviceLabelVolumePitch3D_1.ptr,
                                      deviceInputVolumePitch3D.pitch);

        // A kernel launch returns no status itself; an invalid configuration
        // (e.g. a zero-sized grid) is only visible through cudaGetLastError().
        // The copy back to the host runs only if the launch was accepted.
        ok = cudaSucceeded(cudaGetLastError())
          && copyVolume(deviceLabelVolumePitch3D_1, hostLabelVolumePitch3D,
                        voxelVolumeDim, cudaMemcpyDeviceToHost);
    }

    // Release all device memory on every path; cudaFree(NULL) is a no-op.
    cudaSucceeded(cudaFree(deviceInputVolumePitch3D.ptr));
    cudaSucceeded(cudaFree(deviceSeedVolumePitch3D.ptr));
    cudaSucceeded(cudaFree(deviceLabelVolumePitch3D_1.ptr));
    cudaSucceeded(cudaFree(deviceLabelVolumePitch3D_2.ptr));
    cudaSucceeded(cudaFree(deviceDistanceVolumePitch3D_1.ptr));
    cudaSucceeded(cudaFree(deviceDistanceVolumePitch3D_2.ptr));
}