Access violation during kernel invocation — possible misunderstanding of memory usage

I’m relatively new to CUDA programming so I assume this is due to something I’ve missed or misunderstood in the documentation.

In my image processing system, I invoke kernels repeatedly in a loop until clustering is accurate enough. In EmuDebug mode, GetLiklihoodValuesKernel below causes an ‘Access violation exception writing…’, it doesn’t enter the kernel at all so I presume there’s no point posting all of it’s code. This even fails with trivially small amounts of data so I can rule out running out of memory (I think this throws it’s own exception anyway).

Is it OK to use device memory pointers like I am doing?

In Debug mode, EMKernel is executed repeatedly, I presume because GetLiklihoodValuesKernel is not working properly.

CUDA_SAFE_CALL( cudaMalloc( (void **)&deviceInitialData, dataSize));

CUDA_SAFE_CALL( cudaMalloc( (void **)&dKProcessedData, CLUSTER_DATA_SIZE));

CUDA_SAFE_CALL( cudaMemcpy(deviceInitialData, hostInitialData, dataSize, cudaMemcpyHostToDevice) );

while (newTotalDistance < oldTotalDistance)

    {

	clustering<<<grid, threads>>>(

        dKProcessedData,

        deviceInitialData,

        width,

        height,

        2,

        cluster1,

        cluster2,

        first

	);

  	

  CUDA_SAFE_CALL( cudaMemcpy(hostResultData, dKProcessedData, CLUSTER_DATA_SIZE, cudaMemcpyDeviceToHost) );

  	

  // use hostResultData to update cluster1 and cluster2 for each kernel iteration

  

     }	

  

	CUDA_SAFE_CALL( cudaFree(dKProcessedData) );

	CUT_CHECK_ERROR("kmeans clustering() execution failed\n");

      	

  

	emHostResult = (structExpectationReturn *)malloc(NUM_BLOCKS * sizeof(structExpectationReturn));

	liklihoodHostResultData = (float *)malloc(NUM_BLOCKS);

	

	CUDA_SAFE_CALL( cudaMalloc( (void **)&emDeviceResult, NUM_BLOCKS * sizeof(structExpectationReturn)));

	

	do

	{

  EMKernel<<<grid, threads>>>(

  	deviceInitialData,

  	emDeviceResult,

              initialComponentArray,

              width,

              height

  	);

  

  CUDA_SAFE_CALL( cudaMemcpy(emHostResult, emDeviceResult, CLUSTER_DATA_SIZE, cudaMemcpyDeviceToHost) );

    	

  // use emHostResult to update cluster1 and cluster2 for each kernel iteration

  

  CUDA_SAFE_CALL( cudaMalloc( (void **)&liklihoodDeviceData, NUM_BLOCKS * sizeof(float)));

    

  // secondary kernel to get an accuracy value

  

  GetLiklihoodValuesKernel<<<grid, threads>>>(

  	deviceInitialData,

  	liklihoodDeviceData,

              initialComponentArray,

              width,

              height

  	);

  

  CUDA_SAFE_CALL( cudaMemcpy(liklihoodHostResultData, liklihoodDeviceData, NUM_BLOCKS * sizeof(float), cudaMemcpyDeviceToHost) );

  

  // use liklihoodHostResultData to find newLiklihoodValue each time

  

	}

	while(abs(oldLiklihoodValue / newLiklihoodValue - 1) > 0.001);

The signature of the kernel and its shared data is

__global__ void GetLiklihoodValuesKernel(

	float *d_Data,

	float *totals,

	componentArrays compArrays,

	int dataW,

    int dataH

	)

{

	float blockTotal;

	

	__shared__ float pixelTotals[blockSize];

This is probably something quite simple, but I’ve spent hours looking at it and just can’t see what’s happening. Any ideas greatly appreciated.