Could not find an applicable discussion about our specific problem after searching online and in this forum.
This is our first attempt at real-world CUDA programming. We have a C++ CUDA project which is loaded and called by a C# application using ManagedCuda. (Yes, the GPU usage is inefficient, but first we need to get it to just work.)
The CUDA function works fine when called during unit testing of the C# method.
But when the CUDA function is called in the application it always crashes with CUDA_ERROR_ILLEGAL_ADDRESS.
We debugged it using Nsight CUDA debugging, which reports that all of the input array parameters point to invalid memory, and we do not understand why.
The first two int parameters have correct values.
Any ideas why the memory pointers are wrong? TIA!
Here are the calling function and the CUDA function:
// Builds a per-row table of line start/end X coordinates, launches the cudaPeak kernel
// with one thread per frame row, and reads the per-row results back into a host array.
// NOTE(review): the remainder of the method (result handling, return value, cleanup) is
// elided in this excerpt.
private bool CalculateOnCUDAThread()
{
// NOTE(review): _lines.Data is declared elsewhere. If it is a host array, this assignment
// presumably allocates a device buffer and uploads it through ManagedCuda's implicit
// conversion; if it is already a CudaDeviceVariable it is just aliased — confirm, and confirm
// it was created in the CUDA context that is current on this thread.
CudaDeviceVariable<int> d_data = _lines.Data;
// Two ints per frame row (start X, end X); the kernel reads lines[y * 2 + 0] and
// lines[y * 2 + 1] for every row y < FrameHeight.
int[] h_lines = new int[_lines.FrameHeight * 2];
// Entries are packed in line order, not keyed by row.
// NOTE(review): this assumes line i belongs to row i and LineCount <= FrameHeight; with more
// lines than rows, j would index past the end of h_lines — confirm. Rows without a line
// keep the default (0, 0).
for (int i = 0, j = 0; i < _lines.LineCount; i++)
{
h_lines[j++] = (int)_lines.Lines[i].Start.X;
h_lines[j++] = (int)_lines.Lines[i].End.X;
}
// Presumably allocates a device buffer and copies h_lines into it (implicit conversion) — confirm.
CudaDeviceVariable<int> d_lines = h_lines;
// Number of Int64 result slots per row; matches the y * 10 stride used by the kernel.
int dimResults = 10;
CudaDeviceVariable<Int64> d_results = new CudaDeviceVariable<Int64>(_lines.FrameHeight * dimResults);
// One thread per row, at most 256 threads per block.
int threadsPerBlock = Math.Min(256, _lines.FrameHeight);
cudaPeak.BlockDimensions = threadsPerBlock;
// Ceiling division so the grid provides at least FrameHeight threads.
cudaPeak.GridDimensions = (_lines.FrameHeight + threadsPerBlock - 1) / threadsPerBlock;
// Arguments are passed in the kernel's parameter order: width, height, data, lines, result.
cudaPeak.Run(_lines.FrameWidth, _lines.FrameHeight, d_data.DevicePointer, d_lines.DevicePointer, d_results.DevicePointer);
// Presumably copies the device results back into a new host array (implicit conversion) — confirm.
Int64[] h_results = d_results;
// NOTE(review): d_lines and d_results are not disposed in the visible part of the method —
// confirm they are released in the elided code below.
...
}
/**
 * Computes per-row statistics over one horizontal segment of a 2D int image.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per image row. Any launch that
 * provides at least `height` threads is valid; surplus threads return immediately.
 * No shared memory is used.
 *
 * @param width   Row length of `data` in elements; row y starts at data[y * width].
 * @param height  Number of rows (and of entries in `lines` / `result`).
 * @param data    Device pointer to width * height ints.
 * @param lines   Device pointer to 2 * height ints: inclusive [xstart, xend] per row.
 *                The segment is clamped to [0, width - 1] so that out-of-range
 *                coordinates cannot read outside the row.
 * @param result  Device pointer to 10 * height 64-bit slots. Per row:
 *                [0] sum of values, [1] sum of value * x, [2] sum of value * y,
 *                [3] minimum value, [4] maximum value, [5] x of the first maximum,
 *                [6] threadIdx.x, [7] blockIdx.x, [8] blockDim.x (launch diagnostics).
 *                Slot [9] is not written.
 *                For an empty segment the sums are 0, min is INT_MAX, max is INT_MIN
 *                and the maximum position is -1.
 */
__global__ void cudaPeak(int width, int height, int *data, int *lines, __int64 *result)
{
    const int y = threadIdx.x + blockIdx.x * blockDim.x;
    if (y >= height) return;

    // 64-bit row offset: y * width overflows a 32-bit int on large frames.
    const __int64 offset = (__int64)y * width;

    int xstart = lines[(size_t)y * 2 + 0];
    int xend = lines[(size_t)y * 2 + 1];

    // Keep the segment inside the row; an inverted or fully out-of-range segment
    // simply yields an empty loop.
    if (xstart < 0) xstart = 0;
    if (xend >= width) xend = width - 1;

    __int64 total = 0;
    __int64 totalX = 0;
    int minValue = INT_MAX;
    int maxValue = INT_MIN;
    int maxX = -1;

    for (int x = xstart; x <= xend; x++)
    {
        const int dataValue = data[offset + x];

        total += dataValue;
        // Widen before multiplying: int * int would overflow before the 64-bit add.
        totalX += (__int64)dataValue * x;

        if (dataValue < minValue)
            minValue = dataValue;
        if (dataValue > maxValue)
        {
            // Strict comparison keeps the position of the first maximum.
            maxValue = dataValue;
            maxX = x;
        }
    }

    // y is constant for this thread, so sum(value * y) == y * sum(value).
    const __int64 totalY = total * (__int64)y;

    __int64 *out = result + (size_t)y * 10;
    out[0] = total;
    out[1] = totalX;
    out[2] = totalY;
    out[3] = minValue;
    out[4] = maxValue;
    out[5] = maxX;
    out[6] = threadIdx.x;
    out[7] = blockIdx.x;
    out[8] = blockDim.x;
}