Access violation during kernel invocation: possible misunderstanding of memory usage

I’m relatively new to CUDA programming, so I assume this is due to something I’ve missed or misunderstood in the documentation.

In my image processing system, I invoke kernels repeatedly in a loop until the clustering is accurate enough. In EmuDebug mode, GetLiklihoodValuesKernel below causes an ‘Access violation exception writing…’; it doesn’t enter the kernel at all, so I presume there’s no point posting all of its code. It even fails with trivially small amounts of data, so I can rule out running out of memory (I think that throws its own exception anyway).

Is it OK to use device memory pointers like I am doing?
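To clarify what I mean by that, here is the pattern boiled down to a minimal sketch (the kernel, names, and sizes here are made up for illustration, not my real code):

```cuda
// Hypothetical kernel: allocate with cudaMalloc, pass the raw device
// pointer straight into the <<<>>> launch, copy results back afterwards.
__global__ void scaleKernel(float *d_data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                // guard threads past the end of the data
        d_data[i] *= 2.0f;
}

void runScale(float *h_data, int n)
{
    float *d_data = NULL;
    size_t bytes = n * sizeof(float);

    CUDA_SAFE_CALL( cudaMalloc((void **)&d_data, bytes) );
    CUDA_SAFE_CALL( cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice) );

    scaleKernel<<<(n + 255) / 256, 256>>>(d_data, n);

    CUDA_SAFE_CALL( cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost) );
    CUDA_SAFE_CALL( cudaFree(d_data) );
}
```

As far as I can tell from the programming guide this is the normal pattern, but please correct me if I’ve misread it.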

In Debug mode, EMKernel is executed repeatedly; I presume this is because GetLiklihoodValuesKernel is not working properly. The host code is:

	CUDA_SAFE_CALL( cudaMalloc( (void **)&deviceInitialData, dataSize) );
	CUDA_SAFE_CALL( cudaMalloc( (void **)&dKProcessedData, CLUSTER_DATA_SIZE) );
	CUDA_SAFE_CALL( cudaMemcpy(deviceInitialData, hostInitialData, dataSize, cudaMemcpyHostToDevice) );

	while (newTotalDistance < oldTotalDistance)
	{
		clustering<<<grid, threads>>>( /* arguments omitted */ );

		CUDA_SAFE_CALL( cudaMemcpy(hostResultData, dKProcessedData, CLUSTER_DATA_SIZE, cudaMemcpyDeviceToHost) );

		// use hostResultData to update cluster1 and cluster2 for each kernel iteration
	}

	CUDA_SAFE_CALL( cudaFree(dKProcessedData) );
	CUT_CHECK_ERROR("kmeans clustering() execution failed\n");

	emHostResult = (structExpectationReturn *)malloc(NUM_BLOCKS * sizeof(structExpectationReturn));
	liklihoodHostResultData = (float *)malloc(NUM_BLOCKS);

	CUDA_SAFE_CALL( cudaMalloc( (void **)&emDeviceResult, NUM_BLOCKS * sizeof(structExpectationReturn)) );

	do
	{
		EMKernel<<<grid, threads>>>( /* arguments omitted */ );

		CUDA_SAFE_CALL( cudaMemcpy(emHostResult, emDeviceResult, CLUSTER_DATA_SIZE, cudaMemcpyDeviceToHost) );

		// use emHostResult to update cluster1 and cluster2 for each kernel iteration

		CUDA_SAFE_CALL( cudaMalloc( (void **)&liklihoodDeviceData, NUM_BLOCKS * sizeof(float)) );

		// secondary kernel to get an accuracy value
		GetLiklihoodValuesKernel<<<grid, threads>>>( /* arguments omitted */ );

		CUDA_SAFE_CALL( cudaMemcpy(liklihoodHostResultData, liklihoodDeviceData, NUM_BLOCKS * sizeof(float), cudaMemcpyDeviceToHost) );

		// use liklihoodHostResultData to find newLiklihoodValue each time

	} while (abs(oldLiklihoodValue / newLiklihoodValue - 1) > 0.001);
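If it helps narrow things down, this is the kind of launch check I understand can go straight after each kernel call (a sketch using the runtime API’s cudaGetLastError and cudaThreadSynchronize, not something already in my code):

```cuda
// Sketch: check for launch/configuration errors and then for errors that
// occur during execution, rather than waiting for the next CUDA_SAFE_CALL.
GetLiklihoodValuesKernel<<<grid, threads>>>( /* arguments as above */ );

cudaError_t err = cudaGetLastError();      // invalid configuration, bad params, ...
if (err != cudaSuccess)
    fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));

err = cudaThreadSynchronize();             // faults that happen while the kernel runs
if (err != cudaSuccess)
    fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
```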

The signature of the kernel and its shared data is:

__global__ void GetLiklihoodValuesKernel(
	float *d_Data,
	float *totals,
	componentArrays compArrays,
	int dataW,
	int dataH)
{
	float blockTotal;

	__shared__ float pixelTotals[blockSize];

	// ... rest of kernel body omitted ...
}
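For context, the shared array is meant to accumulate per-pixel values into a per-block total, roughly like this simplified sketch (the blockSize value and the reduction body here are illustrative, not my real kernel):

```cuda
#define blockSize 256   // assumed compile-time constant; my real value may differ

// Launched with blockDim.x == blockSize; writes one total per block.
__global__ void blockTotalsSketch(float *d_Data, float *totals, int n)
{
    __shared__ float pixelTotals[blockSize];

    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;

    // each thread loads one value (0 for threads past the end of the data)
    pixelTotals[tid] = (i < n) ? d_Data[i] : 0.0f;
    __syncthreads();

    // tree reduction within the block (blockSize must be a power of two)
    for (int stride = blockSize / 2; stride > 0; stride /= 2)
    {
        if (tid < stride)
            pixelTotals[tid] += pixelTotals[tid + stride];
        __syncthreads();
    }

    if (tid == 0)
        totals[blockIdx.x] = pixelTotals[0];
}
```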

This is probably something quite simple, but I’ve spent hours looking at it and just can’t see what’s happening. Any ideas greatly appreciated.