Hi all,
I’ve been working on a kernel for a few days already and I am always getting the same compiler error message which I don’t really understand External Image It is:
First-chance exception at 0x7c812a5b in HoloPipelineApp.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012dd88…
First-chance exception at 0x7c812a5b in HoloPipelineApp.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012dd90…
First-chance exception at 0x7c812a5b in HoloPipelineApp.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0012dd88…
When I compile my code, I get an Advisory warning, that some memory is not clearly located:
… line 228: Advisory: Cannot tell what pointer points to, assuming global memory space
The kernel works on a texture and each thread should actually calculate one pixel of the result data. I would be very happy for any suggestions as how to find and/or fix the error :) The important lines have been commented in color. I am trying to use as much global memory (device) as possible to increase the performance. d_resultData contains about 250’000 float2 elements. By the way, doesn’t CUDA have operators for the vector types, e.g. to add a float3 in a straightforward manner:
float3 f3A = make_float3(0.0f,0.0f,0.0f);
float3 f3B = make_float3(0.2f,0.4f,0.8f);
float3 f3Result = f3A + f3B;
futhermore, operators like length(TYPE) would be nice External Image
Anyway…here is my code, which is called from my framework:
#ifndef HOLO_CUDA_EYE_EVALUATOR_H
#define HOLO_CUDA_EYE_EVALUATOR_H
// define CUDA block size
#define BLOCK_SIZE 16
//! includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
//! includes, CUDA
#include <cuda.h>
#include <cutil.h>
// forward declarations
extern “C” global void evaluatePointSources_kernel(float2* d_resultData, int iNumPointSources, int iWidth, int iHeight);
extern “C” device float3 interpolatePosition(float x, float y);
extern “C” device float3 interpolateDirection(float3 f3PositionSource, float x, float y);
extern “C” device float f3Length(float3 vec);
texture<float2, 2, cudaReadModeElementType> sourceTexture;
// these for corners are used for billinear interpolation
// in the simulated pixel shader
device float3 cornerLL; // lower left
device float3 cornerLR; // lower right
device float3 cornerUL; // upper left
device float3 cornerUR; // upper right
// these four directions are used for billinear interpolation
// in the simulated pixel shader
device float3 directionLL;
device float3 directionLR;
device float3 directionUL;
device float3 directionUR;
// the point positions and (amplitude,phase,k) data in global memory
device float3* d_pointPositions;
device float3* d_f3APhaseKs;
// the host part
extern “C” host bool evaluatePointSources(int iNumPointSources,
int iWidth,
int iHeight,
float* targetData,
float* sourceData,
float3* pointPositions,
float3* f3APhaseKs,
float3 aPosf3[4],
float3 aDirf3[4])
{
// initialize memory on the device for the source texture
// the CUDA channel description for the source array
cudaChannelFormatDesc sourceChannelDesc = cudaCreateChannelDesc(32,32,0,0,cudaChannelFormatKindFloat);
// the CUDA array for the source texture
cudaArray* sourceArray;
CUDA_SAFE_CALL(cudaMallocArray(&sourceArray, &sourceChannelDesc, iWidth, iHeight));
CUT_CHECK_ERROR("cudaMallocArray failed");
CUDA_SAFE_CALL(cudaMemcpyToArray(sourceArray, 0, 0, sourceData, iWidth*iHeight*2*sizeof(float), cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToArray");
// set texture parameters for source texture
sourceTexture.addressMode[0] = cudaAddressModeClamp;
sourceTexture.addressMode[1] = cudaAddressModeClamp;
sourceTexture.filterMode = cudaFilterModeLinear;
sourceTexture.normalized = false;
// bind the source array to the source texture
CUDA_SAFE_CALL(cudaBindTextureToArray(sourceTexture, sourceArray, sourceChannelDesc));
CUT_CHECK_ERROR("cudaBindTextureToArray failed");
// copy the point locations and (amplitudue,phase,k) triples to the device
// obtain the correct pointer for the point positions
float3* devPtr = NULL;
CUDA_SAFE_CALL(cudaGetSymbolAddress((void**)&devPtr, d_pointPositions));
CUT_CHECK_ERROR("cudaGetSymbolAddress failed");
CUDA_SAFE_CALL(cudaMalloc((void**)&devPtr, iNumPointSources*sizeof(float3)));
CUT_CHECK_ERROR("cudaMalloc failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_pointPositions, pointPositions, iNumPointSources, 0, cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpy failed");
// obtain the correct pointer for the (Amplitude,Phase,K) triples
devPtr = NULL;
CUDA_SAFE_CALL(cudaGetSymbolAddress((void**)&devPtr, d_f3APhaseKs));
CUT_CHECK_ERROR("cudaGetSymbolAddress failed");
CUDA_SAFE_CALL(cudaMalloc((void**)&devPtr, iNumPointSources*sizeof(float3)));
CUT_CHECK_ERROR("cudaMalloc failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_f3APhaseKs, f3APhaseKs, iNumPointSources, 0, cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpy failed");
// allocate memory for the result data
float2* d_resultData = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_resultData, iNumPointSources*sizeof(float2)));
CUT_CHECK_ERROR("cudaMalloc failed");
// copy the corner positions and directions to the (global) device memory
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cornerLL,&aPosf3[0],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cornerLR,&aPosf3[1],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cornerUL,&aPosf3[2],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(cornerUR,&aPosf3[3],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(directionLL,&aDirf3[0],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(directionLR,&aDirf3[1],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(directionUL,&aDirf3[2],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
CUDA_SAFE_CALL(cudaMemcpyToSymbol(directionUR,&aDirf3[3],sizeof(float3),0,cudaMemcpyHostToDevice));
CUT_CHECK_ERROR("cudaMemcpyToSymbol failed");
// calculate the grid and block size
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(iWidth / dimBlock.x, iHeight / dimBlock.y);
// call the CUDA kernel with the appropriate grid and block size
evaluatePointSources_kernel<<<dimGrid, dimBlock>>>(d_resultData, iNumPointSources, iWidth, iHeight);
//copy result data back
CUDA_SAFE_CALL(cudaMemcpy(targetData, d_resultData, iNumPointSources*sizeof(float2), cudaMemcpyDeviceToHost));
CUT_CHECK_ERROR("cudaMemcpy failed");
// free memory on the device
CUDA_SAFE_CALL(cudaFreeArray(sourceArray));
CUT_CHECK_ERROR("cudaFreeArray failed");
CUDA_SAFE_CALL(cudaFree(d_pointPositions));
CUT_CHECK_ERROR("cudaFree failed");
CUDA_SAFE_CALL(cudaFree(d_f3APhaseKs));
CUT_CHECK_ERROR("cudaFree failed");
CUDA_SAFE_CALL(cudaFree(d_resultData));
CUT_CHECK_ERROR("cudaFree failed");
// if all went well return TRUE
return true;
}
// the device part
extern “C” global void evaluatePointSources_kernel(float2* d_resultData, int iNumPointSources, int iWidth, int iHeight)
{
// CUDA block index
int bX = blockIdx.x;
int bY = blockIdx.y;
// CUDA thread index
int tX = threadIdx.x;
int tY = threadIdx.y;
// calculate total coordinate of current thread which acutally corresponds to a single pixel
// being evaluated in the current thread
int iX = bX*blockDim.x + tX;
int iY = bY*blockDim.y + tY;
float2 resTexel = tex2D(sourceTexture, iX, iY);
// for all point sources...
for(int i=0; i<iNumPointSources; i++)
{
// ...evaluate the current texel
float3 positionSource = d_pointPositions[i];
float3 f3APhaseK = d_f3APhaseKs[i]; // this memory location seems to be unknown, but strangely, only if the line commented by purple color is uncommented...
//float3 interpVPosition = interpolatePosition(iX/iWidth, iY/iHeight);
float3 interpVDirection = interpolateDirection(positionSource, iX/iWidth, iY/iHeight);
float fR = f3Length(interpVDirection);
float fSin, fCos;
__sincosf(fR*f3APhaseK.z + f3APhaseK.y, &fSin, &fCos);
float fAmp = f3APhaseK.x / fR;
float2 curTexel = make_float2(fAmp*fCos,fAmp*fSin);
resTexel = make_float2(resTexel.x+curTexel.x, resTexel.y+curTexel.y); // This line makes the program hang, without it, it runs fine, but of course then the whole loop maybe even doesn't get evaluated, since no result is written or used, so the compiler simply strips this code then I guess.
}
// write the resulting color value to the summation memory
//__syncthreads(); // no change if uncommented
d_resultData[iY*iWidth+iX] = resTexel;
}
/*
-
This function replaces the fragment interpolation which is usually done
-
in the rendering pipeline between vertex and fragment processing.
-
It is a bilinear interpolation of the vertex position.
*/
extern “C” device float3 interpolatePosition(float x, float y)
{
float3 position = make_float3(cornerLL.x*(1-x)*(1-y) + cornerLR.x*x*(1-y) + cornerUL.x*(1-x)*y + cornerUR.x*x*y,
cornerLL.y*(1-x)*(1-y) + cornerLR.y*x*(1-y) + cornerUL.y*(1-x)*y + cornerUR.y*x*y,
cornerLL.z*(1-x)*(1-y) + cornerLR.z*x*(1-y) + cornerUL.z*(1-x)*y + cornerUR.z*x*y );
return position;
}
/*
-
This function replaces the fragment interpolation which is usually done
-
in the rendering pipeline between vertex and fragment processing.
-
It is a bilinear interpolation of the vertex direction relative to the point of evaluation \c f3PositionSource
*/
extern “C” device float3 interpolateDirection(float3 f3PositionSource, float x, float y)
{
float3 newDirectionLL = make_float3(directionLL.x - f3PositionSource.x,
directionLL.y - f3PositionSource.y,
directionLL.z - f3PositionSource.z);
float3 newDirectionLR = make_float3(directionLR.x - f3PositionSource.x,
directionLR.y - f3PositionSource.y,
directionLR.z - f3PositionSource.z);
float3 newDirectionUL = make_float3(directionUL.x - f3PositionSource.x,
directionUL.y - f3PositionSource.y,
directionUL.z - f3PositionSource.z);
float3 newDirectionUR = make_float3(directionUR.x - f3PositionSource.x,
directionUR.y - f3PositionSource.y,
directionUR.z - f3PositionSource.z);
float3 direction = make_float3(directionLL.x*(1-x)*(1-y) + directionLR.x*x*(1-y) + directionUL.x*(1-x)*y + directionUR.x*x*y,
directionLL.y*(1-x)*(1-y) + directionLR.y*x*(1-y) + directionUL.y*(1-x)*y + directionUR.y*x*y,
directionLL.z*(1-x)*(1-y) + directionLR.z*x*(1-y) + directionUL.z*(1-x)*y + directionUR.z*x*y );
return direction;
}
/*
-
This small helper function computes the length of a float3 vector.
*/
extern “C” device float f3Length(float3 vec)
{
float length = vec.x*vec.x + vec.y*vec.y + vec.z*vec.z;
return sqrtf(length);
}