Invalid value for referenced variable

Hi everyone,
I'm trying to get the correct value from the function GetArea, but every time the value is 0 instead of 3.
I tried inlining the function and using a pointer to the variable, but nothing changes.

Can someone explain what is happening?

I’m using cuda 9.0

Thank you, Fabio.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Host-side helper that allocates the device buffers, launches addKernel and
// copies the result back (defined further down in this file).
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Device helper that returns its result through the `area` reference.
// NOTE(review): declared __forceinline__ here, while the definition below has
// that qualifier commented out — confirm which one is intended.
__device__ __forceinline__ void GetArea(float dx, float dy, float &area);

// Device test routine invoked by addKernel.
__device__ __forceinline__ void testArea();

// Element-wise addition kernel: thread t writes c[t] = a[t] + b[t] and then
// runs the testArea() device routine.
// The index is taken from threadIdx.x only, so the kernel is written for a
// single-block launch with one thread per element.
__global__ void addKernel(int *c, const int *a, const int *b)
{
	const int tid = threadIdx.x;
	const int sum = a[tid] + b[tid];
	c[tid] = sum;

	testArea();
}

// Exercises GetArea with two fixed offsets.
//
// The computed area is neither stored to global memory nor printed, so it does
// not contribute to any observable state: an optimizing build is free to remove
// the whole computation, and a debugger may then show a stale value for `area`.
// To observe the value, print it with a device-side printf or write it to a
// global buffer.
__device__ __forceinline__ void testArea(){
	// The 'f' suffix keeps the constants single precision instead of
	// converting double literals on assignment.
	const float dxd = 0.7272f;
	const float dyu = 0.2727f;

	float area = 0.0f;
	GetArea(dxd, dyu, area);
}

// Calculates the coefficient that describes the area of interest of this pixel
// and returns it through `area`.
// Currently stubbed to the constant 3; dx and dy are not used by the stub.
__device__ /*__forceinline__*/ void GetArea(float dx, float dy, float &area)
{
	// Formula disabled in this test build: sqrtf(2) - sqrtf(dx * dx + dy * dy)
	area = 3.0f;
}


// Entry point: adds two fixed five-element vectors on the GPU via addWithCuda
// and prints the result. Returns 1 if the addition or the device reset fails.
int main()
{
    const int kElementCount = 5;
    const int lhs[kElementCount] = { 1, 2, 3, 4, 5 };
    const int rhs[kElementCount] = { 10, 20, 30, 40, 50 };
    int sum[kElementCount] = { 0 };

    // Run the element-wise addition on the device.
    if (addWithCuda(sum, lhs, rhs, kElementCount) != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        sum[0], sum[1], sum[2], sum[3], sum[4]);

    // The device is reset before exiting so that profiling and tracing tools
    // (Nsight, Visual Profiler) can show complete traces.
    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Adds two host vectors on the GPU.
//
// Allocates three device buffers, uploads `a` and `b`, launches addKernel in a
// single block with one thread per element, waits for it to finish and
// downloads the result into `c`. The device buffers are released on every
// path. Returns the status of the first CUDA call that failed, or the status
// of the final copy when everything succeeded.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *d_a = 0;
    int *d_b = 0;
    int *d_c = 0;
    const size_t bytes = size * sizeof(int);
    cudaError_t status;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        // Select the GPU to run on (change this on a multi-GPU system).
        status = cudaSetDevice(0);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            break;
        }

        // Device buffers: one output vector, then the two input vectors.
        status = cudaMalloc((void**)&d_c, bytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        status = cudaMalloc((void**)&d_a, bytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        status = cudaMalloc((void**)&d_b, bytes);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            break;
        }

        // Upload both inputs.
        status = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        status = cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }

        // One block, one thread per element.
        addKernel<<<1, size>>>(d_c, d_a, d_b);

        // Launch-time errors are reported by cudaGetLastError.
        status = cudaGetLastError();
        if (status != cudaSuccess) {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(status));
            break;
        }

        // Wait for the kernel; errors raised while it ran are reported here.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", status);
            break;
        }

        // Download the result.
        status = cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            break;
        }
    } while (false);

    // Shared cleanup for the success and the failure paths.
    cudaFree(d_c);
    cudaFree(d_a);
    cudaFree(d_b);

    return status;
}

when I add the following:

GetArea(dxd, dyu, area);
        printf("area = %f\n", area); // add this line

I get the expected printout:

area = 3.000000
area = 3.000000
area = 3.000000
area = 3.000000
area = 3.000000

Since your code modifies no global state for the area calculation, the compiler may optimize that out unless you store it in global memory or print it out.

Thanks txbob,
I understand the problem. However, I reported only one case of an invalid value here; in my code there are multiple situations in which it happens.
Is there a way to make sure that the compiler doesn't "cut" these parts of the code, without reducing performance by using a printf?

Thanks

If you compile your code in debug mode, all optimizations are turned off and I would assume this includes dead code elimination (all device code that does not contribute to the alteration of global state will normally be considered “dead” by the compiler and is eliminated, as txbob explained).

OK, I understand the problem, and I was wrong to put that code in my post because it's misleading.
Here is the code that I'm actually using.
In the function Distort, when the function UndistortPixel returns, the values of undistortedPixelX and undistortedPixelY are wrong, but inside UndistortPixel the values are right.

As you can see, the compiler shouldn't remove that code, because its result is used afterwards.
Is that right, or am I wrong?

Thanks, Fabio.

/// <summary>
/// Calculates the undistorted position of a distorted pixel; (0,0) is the
/// center of the image. The result is returned through undistortedPointX and
/// undistortedPointY.
/// </summary>
/// <returns></returns>
__device__ void UndistortPixel(const int distortedPointX, const int distortedPointY, float &undistortedPointX, float &undistortedPointY, ProcessingPars *pars)
{
	// Offset of the pixel from the center (Xc, Yc).
	const float offX = distortedPointX - pars->Xc;
	const float offY = distortedPointY - pars->Yc;

	// Even powers of the distance from the center.
	const float rad2 = offX * offX + offY * offY;
	const float rad4 = rad2 * rad2;
	const float rad6 = rad4 * rad2;

	// Division model using the negated coefficients K1..K3. P1 and P2 are not
	// part of this model; an earlier additive variant that used them was
	// disabled in the original code.
	const float denom = 1 + (-pars->K1) * rad2 + (-pars->K2) * rad4 + (-pars->K3) * rad6;

	// Both coordinates are computed before either output is written.
	const float outX = pars->Xc + offX / denom;
	const float outY = pars->Yc + offY / denom;

	undistortedPointX = outX;
	undistortedPointY = outY;
}


/// Collects the integer pixels surrounding a fractional position together
/// with the weight GetArea assigns to each of them.
/// A candidate is stored only when its weight is positive and it lies inside
/// [XMinInputToDistortImage, XMaxInputToDistortImage) x
/// [YMinInputToDistortImage, YMaxInputToDistortImage).
/// Up to four entries are written, so points and areas must each have room
/// for four elements; *numPoints receives the number of entries stored.
__device__ void GetPixelsForDistortedPixel(float *undistortedPixelX, float *undistortedPixelY, int2 *points, float * areas, ProcessingPars *pars, int *numPoints)
{
	const float posX = *undistortedPixelX;
	const float posY = *undistortedPixelY;

	// Integer columns / rows bracketing the position.
	const int xLo = floorf(posX);
	const int xHi = ceilf(posX);
	const int yLo = floorf(posY);
	const int yHi = ceilf(posY);

	// Candidates are visited in the order
	// (xLo,yHi), (xLo,yLo), (xHi,yHi), (xHi,yLo).
	const int candX[2] = { xLo, xHi };
	const int candY[2] = { yHi, yLo };
	const float distX[2] = { posX - xLo, xHi - posX };
	const float distY[2] = { yHi - posY, posY - yLo };

	int stored = 0;
	float area = 0;

	for (int ix = 0; ix < 2; ++ix)
	{
		// The position sits exactly on a column: the second column is the same pixel.
		if (ix == 1 && xLo == xHi)
			continue;

		for (int iy = 0; iy < 2; ++iy)
		{
			// Same for an exact row.
			if (iy == 1 && yLo == yHi)
				continue;

			const int cx = candX[ix];
			const int cy = candY[iy];

			GetArea(distX[ix], distY[iy], area);

			const bool insideX = pars->XMaxInputToDistortImage > cx && cx >= pars->XMinInputToDistortImage;
			const bool insideY = pars->YMaxInputToDistortImage > cy && cy >= pars->YMinInputToDistortImage;
			if (area > 0 && insideX && insideY)
			{
				areas[stored] = area;
				points[stored].x = cx;
				points[stored].y = cy;
				++stored;
			}
		}
	}

	*numPoints = stored;
}


/// <summary>
/// Kernel that computes one output pixel per thread.
/// Each thread maps its output pixel through UndistortPixel, gathers the
/// surrounding input pixels with GetPixelsForDistortedPixel and writes their
/// weighted average to outputBitmap. Pixels with no valid neighbour are
/// written as 0.
/// Launch layout: 2D grid of 2D blocks; thread (x, y) handles output pixel
/// (x, y), and threads outside OutputDistortWidth x OutputDistortHeight exit.
/// Both bitmaps are indexed with one byte per pixel.
/// </summary>
__global__ void Distort(unsigned char *inputBitmap, unsigned char *outputBitmap, ProcessingPars *pars)
{
	int x = blockIdx.x*blockDim.x + threadIdx.x;
	int y = blockIdx.y*blockDim.y + threadIdx.y;

	// Valid pixel indices are 0..Width-1 and 0..Height-1, so the guard must
	// use '>='; with '>' the thread at x == Width (or y == Height) would write
	// outside its row or past the end of the output buffer.
	if (x >= pars->OutputDistortWidth || y >= pars->OutputDistortHeight)
		return;

	// Switch to coordinates whose origin is the image center.
	x -= pars->OutputDistortWidth / 2;
	y -= pars->OutputDistortHeight / 2;

	// Position in the input image that corresponds to this output pixel.
	float undistortedPixelX = 0;
	float undistortedPixelY = 0;
	UndistortPixel(x, y, undistortedPixelX, undistortedPixelY, pars);

	// Up to four surrounding input pixels and their weights.
	int2 nearPoints[4];
	float areas[4];
	int numPoints = 0;
	GetPixelsForDistortedPixel(&undistortedPixelX, &undistortedPixelY, nearPoints, areas, pars, &numPoints);

	// The weights are fractional, so they are summed in float; an int
	// accumulator would truncate every addition.
	float totArea = 0.0f;
	float value = 0.0f;
	int id = 0;

	for (int i = 0; i < numPoints; i++)
	{
		// Sample neighbour i (not always neighbour 0).
		GetIdOfInputImage(nearPoints[i].x, nearPoints[i].y, 1, pars, &id);
		value += inputBitmap[id] * areas[i];
		totArea += areas[i];
	}

	GetIdInOutputImage(x, y, 1, pars, &id);

	// No valid neighbour means a zero total weight: write 0 rather than
	// dividing by zero.
	outputBitmap[id] = (totArea > 0.0f) ? (unsigned char)(value / totArea) : (unsigned char)0;
}

Good evening,
here I have prepared a code sample with which you can understand what the problem is.

Thanks, Fabio.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <math.h>
#include <cuda.h>

#include <cuda_runtime_api.h>

#include <device_functions.h>

#include "math_functions.h"

#include <stdio.h>;

#include "cuda_fp16.h"

// Parameter block passed to the device functions and the Distort kernel.
//
// It is a plain aggregate of floats and ints, so the same definition is usable
// from host and device code. Linkage specifications, dllexport and
// execution-space qualifiers apply to functions and variables, not to a bare
// struct definition, so none are needed here.
struct ProcessingPars
{
	// Coefficients negated and applied to r^2, r^4 and r^6 in UndistortPixel.
	float K1;
	float K2;
	float K3;
	// Read by UndistortPixel, but not used in its active formula.
	float P1;
	float P2;
	// Center subtracted from the pixel coordinates in UndistortPixel.
	float Xc;
	float Yc;
	// Size of the output image in pixels.
	int OutputDistortWidth;
	int OutputDistortHeight;
	// Not referenced by the code in this file.
	int WidthInputFrom3DImage;
	int HeightInputFrom3DImage;
	// Valid input window: XMin <= x < XMax and YMin <= y < YMax.
	int XMinInputToDistortImage;
	int XMaxInputToDistortImage;
	int YMinInputToDistortImage;
	int YMaxInputToDistortImage;
	// Size of the input image; the width is the row stride in GetIdOfInputImage.
	int WidthInputToDistortImage;
	int HeightInputToDistortImage;
};


#define MAX_BLOCK_SIZE 32


static void checkCUDAError(cudaError_t err, const char *file, int line);
#define CUDA_ERROR( err ) ( checkCUDAError( err, __FILE__, __LINE__ ) )


// Reports a failed CUDA runtime call on stderr, with the file and line
// supplied by the CUDA_ERROR macro. Does nothing when err is cudaSuccess.
// The program is not terminated: the exit call below is left disabled.
static void checkCUDAError(cudaError_t err, const char *file, int line)
{
	if (err == cudaSuccess)
		return;

	fprintf(stderr, "Cuda error: %s in %s at line %d\n", cudaGetErrorString(err), file, line);
	//exit(EXIT_FAILURE);
}











/// Calculates the coefficient that describes the area of interest of a pixel:
/// sqrt(2) minus the Euclidean length of the offset (dx, dy).
/// The result is returned through area.
__device__ __forceinline__ void GetArea(float dx, float dy, float &area)
{
	const float diagonal = sqrtf(2);
	const float distance = sqrtf(dx * dx + dy * dy);
	area = diagonal - distance;
}



/// <summary>
/// Calculates the undistorted position of a distorted pixel; (0,0) is the
/// center of the image. The result is returned through undistortedPointX and
/// undistortedPointY.
/// </summary>
/// <returns></returns>
__device__ void UndistortPixel(const int distortedPointX, const int distortedPointY, float &undistortedPointX, float &undistortedPointY, ProcessingPars *pars)
{
	// Offset of the pixel from the center (Xc, Yc).
	const float offX = distortedPointX - pars->Xc;
	const float offY = distortedPointY - pars->Yc;

	// Even powers of the distance from the center.
	const float rad2 = offX * offX + offY * offY;
	const float rad4 = rad2 * rad2;
	const float rad6 = rad4 * rad2;

	// Division model using the negated coefficients K1..K3. P1 and P2 are not
	// part of this model; an earlier additive variant that used them was
	// disabled in the original code.
	const float denom = 1 + (-pars->K1) * rad2 + (-pars->K2) * rad4 + (-pars->K3) * rad6;

	// Both coordinates are computed before either output is written.
	const float outX = pars->Xc + offX / denom;
	const float outY = pars->Yc + offY / denom;

	undistortedPointX = outX;
	undistortedPointY = outY;
}

/// Calculates the index, in the one-dimensional input array, of pixel (x, y).
/// Coordinates are relative to the image center; the input window starts at
/// (XMinInputToDistortImage, YMinInputToDistortImage). The index is returned
/// through id.
__device__ void GetIdOfInputImage(int x, int y, int numBytesPerPixel, ProcessingPars *pars, int *id)
{
	const int columnOffset = (x - pars->XMinInputToDistortImage) * numBytesPerPixel;
	const int rowOffset = (y - pars->YMinInputToDistortImage) * numBytesPerPixel * pars->WidthInputToDistortImage;

	*id = columnOffset + rowOffset;
}


/// Stores candidate pixel (px, py) and its weight in the output arrays when
/// the weight is positive and the pixel lies inside the input window
/// [XMin, XMax) x [YMin, YMax) described by pars; otherwise does nothing.
static __device__ void AppendPixelIfInside(int px, int py, float area, int2 *points, float *areas, const ProcessingPars *pars, int *numPoints)
{
	const bool insideX = pars->XMaxInputToDistortImage > px && px >= pars->XMinInputToDistortImage;
	const bool insideY = pars->YMaxInputToDistortImage > py && py >= pars->YMinInputToDistortImage;

	if (area > 0 && insideX && insideY)
	{
		const int slot = *numPoints;
		areas[slot] = area;
		points[slot].x = px;
		points[slot].y = py;
		*numPoints = slot + 1;
	}
}


/// Collects the integer pixels surrounding a fractional position together
/// with the weight GetArea assigns to each of them.
/// Candidates are visited in the order (xd,yu), (xd,yd), (xu,yu), (xu,yd),
/// where d/u are the floor/ceil of the coordinate. When the position lies
/// exactly on a column or row, the duplicate candidates are skipped.
/// Up to four entries are written, so points and areas must each have room
/// for four elements; *numPoints receives the number of entries stored.
__device__ void GetPixelsForDistortedPixel(float *undistortedPixelX, float *undistortedPixelY, int2 *points, float * areas, ProcessingPars *pars, int *numPoints)
{
	*numPoints = 0;

	const float posX = *undistortedPixelX;
	const float posY = *undistortedPixelY;

	// Integer columns / rows bracketing the position.
	const int xd = floorf(posX);
	const int xu = ceilf(posX);
	const int yd = floorf(posY);
	const int yu = ceilf(posY);

	// Distance from the position to each bracketing column / row.
	const float dxd = posX - xd;
	const float dxu = xu - posX;
	const float dyd = posY - yd;
	const float dyu = yu - posY;

	float area = 0;

	GetArea(dxd, dyu, area);
	AppendPixelIfInside(xd, yu, area, points, areas, pars, numPoints);

	if (yd != yu)
	{
		GetArea(dxd, dyd, area);
		AppendPixelIfInside(xd, yd, area, points, areas, pars, numPoints);
	}

	if (xd != xu)
	{
		GetArea(dxu, dyu, area);
		AppendPixelIfInside(xu, yu, area, points, areas, pars, numPoints);

		if (yd != yu)
		{
			GetArea(dxu, dyd, area);
			AppendPixelIfInside(xu, yd, area, points, areas, pars, numPoints);
		}
	}
}


/// Calculates the index, in the one-dimensional output array, of pixel (x, y).
/// The incoming coordinates are relative to the image center; half the output
/// width and height are added back so that the index counts from the
/// upper-left corner. The index is returned through id.
__device__ void GetIdInOutputImage(int x, int y, int numBytesPerPixel, ProcessingPars *pars, int *id)
{
	const int columnOffset = (x + pars->OutputDistortWidth / 2) * numBytesPerPixel;
	const int rowOffset = pars->OutputDistortWidth * numBytesPerPixel * (y + pars->OutputDistortHeight / 2);

	*id = columnOffset + rowOffset;
}


/// <summary>
/// Kernel that computes one output pixel per thread.
/// Each thread maps its output pixel through UndistortPixel, gathers the
/// surrounding input pixels with GetPixelsForDistortedPixel and writes their
/// weighted average to outputBitmap. Pixels with no valid neighbour are
/// written as 0.
/// Launch layout: 2D grid of 2D blocks; thread (x, y) handles output pixel
/// (x, y), and threads outside OutputDistortWidth x OutputDistortHeight exit.
/// Both bitmaps are indexed with one byte per pixel.
/// </summary>
__global__ void Distort(unsigned char *inputBitmap, unsigned char *outputBitmap, ProcessingPars *pars)
{
	int x = blockIdx.x*blockDim.x + threadIdx.x;
	int y = blockIdx.y*blockDim.y + threadIdx.y;

	// Valid pixel indices are 0..Width-1 and 0..Height-1, so the guard must
	// use '>='; with '>' the thread at x == Width (or y == Height) would write
	// outside its row or past the end of the output buffer.
	if (x >= pars->OutputDistortWidth || y >= pars->OutputDistortHeight)
		return;

	// Switch to coordinates whose origin is the image center.
	x -= pars->OutputDistortWidth / 2;
	y -= pars->OutputDistortHeight / 2;

	// Position in the input image that corresponds to this output pixel.
	float undistortedPixelX = 0;
	float undistortedPixelY = 0;
	UndistortPixel(x, y, undistortedPixelX, undistortedPixelY, pars);

	// Up to four surrounding input pixels and their weights.
	int2 nearPoints[4];
	float areas[4];
	int numPoints = 0;
	GetPixelsForDistortedPixel(&undistortedPixelX, &undistortedPixelY, nearPoints, areas, pars, &numPoints);

	// The weights are fractional, so they are summed in float; an int
	// accumulator would truncate every addition.
	float totArea = 0.0f;
	float value = 0.0f;
	int id = 0;

	for (int i = 0; i < numPoints; i++)
	{
		// Sample neighbour i (not always neighbour 0).
		GetIdOfInputImage(nearPoints[i].x, nearPoints[i].y, 1, pars, &id);
		value += inputBitmap[id] * areas[i];
		totArea += areas[i];
	}

	GetIdInOutputImage(x, y, 1, pars, &id);

	// No valid neighbour means a zero total weight: write 0 rather than
	// dividing by zero.
	outputBitmap[id] = (totArea > 0.0f) ? (unsigned char)(value / totArea) : (unsigned char)0;
}


// Host driver for the Distort kernel.
//
// Uploads a fixed set of distortion parameters and a constant-valued input
// image, launches Distort, copies the output image back to the host and
// reports the elapsed GPU time. CUDA errors are reported through CUDA_ERROR.
int main()
{
	// cudaFree(0) is the usual idiom to force CUDA runtime initialization up
	// front, so that the cost is not attributed to the first real call.
	CUDA_ERROR(cudaFree(0));
	/*
	size_t val = 0;
	CUDA_ERROR(cudaDeviceGetLimit(&val, cudaLimitStackSize));

	if (val < 4096)
		CUDA_ERROR(cudaDeviceSetLimit(cudaLimitStackSize, 4096));
	*/

	ProcessingPars pars;
	pars.K1 = 0.00001f;
	pars.K2 = 0;
	pars.K3 = 0;
	pars.P1 = 0;
	pars.P2 = 0;
	pars.Xc = 0;
	pars.Yc = 0;
	pars.OutputDistortWidth = 300;
	pars.OutputDistortHeight = 300;
	pars.WidthInputFrom3DImage = 1920;
	pars.HeightInputFrom3DImage = 1080;
	pars.XMinInputToDistortImage = -273;
	pars.XMaxInputToDistortImage = 273;
	pars.YMinInputToDistortImage = -273;
	pars.YMaxInputToDistortImage = 273;
	pars.WidthInputToDistortImage = 546;
	pars.HeightInputToDistortImage = 546;

	// Both images use one byte per pixel.
	const size_t sizeInputBin = (size_t)pars.WidthInputToDistortImage * pars.HeightInputToDistortImage * sizeof(unsigned char);
	const size_t sizeOutput = (size_t)pars.OutputDistortHeight * pars.OutputDistortWidth * sizeof(unsigned char);

	cudaEvent_t start, stop;

#pragma region start timer
	CUDA_ERROR(cudaEventCreate(&start));
	CUDA_ERROR(cudaEventCreate(&stop));
	CUDA_ERROR(cudaEventRecord(start, 0));
#pragma endregion

	ProcessingPars *pars_dev = 0;
	CUDA_ERROR(cudaMalloc(&pars_dev, sizeof(ProcessingPars)));
	CUDA_ERROR(cudaMemcpy(pars_dev, &pars, sizeof(ProcessingPars), cudaMemcpyHostToDevice));

	// Input image filled with the constant byte value 100.
	unsigned char *dataImageResize_dev = 0;
	CUDA_ERROR(cudaMalloc((void **)&dataImageResize_dev, sizeInputBin));
	CUDA_ERROR(cudaMemset(dataImageResize_dev, 100, sizeInputBin));

	// The launch below writes a single pixel, so the rest of the output is
	// cleared first to make the bytes copied back well defined.
	unsigned char *output_dev = 0;
	CUDA_ERROR(cudaMalloc((void **)&output_dev, sizeOutput));
	CUDA_ERROR(cudaMemset(output_dev, 0, sizeOutput));

	// The host buffer must hold the whole output image, not a single byte.
	unsigned char *output = new unsigned char[sizeOutput];

	// NOTE(review): one block of one thread computes a single output pixel
	// only (kept from the original reproduction case). A full-image run needs
	// a 2D grid covering OutputDistortWidth x OutputDistortHeight and a kernel
	// bounds check that rejects the padding threads.
	Distort << <1, 1 >> >(dataImageResize_dev, output_dev, pars_dev);
	CUDA_ERROR(cudaGetLastError());
	// Surfaces errors raised while the kernel was executing.
	CUDA_ERROR(cudaDeviceSynchronize());

	// cudaMemcpy takes (dst, src, ...): the destination is the host buffer.
	CUDA_ERROR(cudaMemcpy(output, output_dev, sizeOutput, cudaMemcpyDeviceToHost));

#pragma region stop timer
	float elapsedTime0 = 0;
	CUDA_ERROR(cudaEventRecord(stop, 0));
	CUDA_ERROR(cudaEventSynchronize(stop));
	CUDA_ERROR(cudaEventElapsedTime(&elapsedTime0, start, stop));
	printf("Elapsed time: %f ms\n", elapsedTime0);
	CUDA_ERROR(cudaEventDestroy(start));
	CUDA_ERROR(cudaEventDestroy(stop));
#pragma endregion

	CUDA_ERROR(cudaFree(dataImageResize_dev));
	CUDA_ERROR(cudaFree(pars_dev));
	CUDA_ERROR(cudaFree(output_dev));

	// Allocated with new[], so it must be released with delete[].
	delete[] output;

	return 0;
}