why does my card hang?

once my card gets to the bolded cutilSafeCall( cudaThreadSynchronize() ); my card hangs. the display goes blank and comes up with a message on windows 7 saying my nvidia driver stopped and was recovered?

any thoughts?

p.s. incase it help my card is inno3D GTS250

[codebox]global void create_mask (float* map, int rows, int cols);

device float getMinDistance (char *edge_map, float *map, int i, int j, int rows, int cols, int step);

global void getDistanceMap(float *map, float dist_map, char edge_map, int rows, int cols, int step);

void distanceTransform_GPU(char *edge_map, float *distanceMap, int rows, int cols, int step){

dim3 dimBlock(16,16);

float *map, *dist_map_GPU;

unsigned int hTimer=0;

char *edge_map_GPU;

double gpuTime;



static int initial_call=1;

if (initial_call){

	

	dim3 dimGrid((int)ceil((float)(2*rows-1)/(float)dimBlock.x),(int)ceil((float)(2*cols-1)/(float)dimBlock.y));

	cudaSetDevice( cutGetMaxGflopsDeviceId() );					//intialise fastest available device

	//allocate memory

	cudaMalloc((void **)&map, (2*rows-1)*(2*cols-1)*sizeof(float));	//allocate memory for map on GPU

	cudaMalloc((void **)&dist_map_GPU, rows*cols*sizeof(float));		//allocate memory for distance map on GPU

	cudaMalloc((void **)&edge_map_GPU, rows*cols*sizeof(char));			//allocate memory for edge map on GPU

	cutilCheckError( cutCreateTimer(&hTimer) );

	

	cutilCheckError( cutResetTimer(hTimer) );

            cutilCheckError( cutStartTimer(hTimer) );

	cutilSafeCall( cudaThreadSynchronize() );

	create_mask<<<dimGrid, dimBlock>>>(map, rows,cols);

	cutilSafeCall( cudaThreadSynchronize() );

	cutilCheckError( cutStopTimer(hTimer) );

            gpuTime = cutGetTimerValue(hTimer);

	printf("d_map () time    : %f msec\n", gpuTime);

	//cudaMemcpy(distanceMap,map, (2*rows-1)*(2*cols-1)*sizeof(float),cudaMemcpyDeviceToHost);	//copy distance transform back to CPU

	initial_call=0;

}

dim3 dimGrid((int)ceil((float)rows/(float)dimBlock.x),(int)ceil((float)cols/(float)dimBlock.y));

cudaMemcpy(edge_map_GPU,edge_map, rows*step*sizeof(char),cudaMemcpyHostToDevice);	//copy edge information to GPU

cutilCheckError( cutResetTimer(hTimer) );

    cutilCheckError( cutStartTimer(hTimer) );

cutilSafeCall( cudaThreadSynchronize() );

getDistanceMap<<<dimGrid, dimBlock>>>(map, dist_map_GPU, edge_map_GPU, rows, cols, step);

<b>cutilSafeCall( cudaThreadSynchronize() );</b>

cutilCheckError( cutStopTimer(hTimer) );

    gpuTime = cutGetTimerValue(hTimer);

printf("final distance transform time    : %f msec\n", gpuTime);

cudaMemcpy(distanceMap,dist_map_GPU, rows*cols*sizeof(float),cudaMemcpyDeviceToHost);	//copy distance transform back to CPU

}

global void create_mask (float* map, signed int rows, signed int cols){

signed int i=blockIdx.x * blockDim.x + threadIdx.x;

signed int j=blockIdx.y * blockDim.y + threadIdx.y;



int map_rows=cols*2-1;

if (i<(2*rows-1) && j<(2*cols-1))

	*(float *)(map+i*map_rows+j)=sqrtf(powf((float)(i-(rows-1)),2.0f)+powf((float)(j-(cols-1)),2.0f));

	//*(float *)(map+i*map_rows+j)=sqrtf(abs((float)(i-(rows-1)))+abs((float)(j-(cols-1))));

}

global void getDistanceMap(float *map, float dist_map, char edge_map, int rows, int cols, int step){

int i=blockIdx.x * blockDim.x + threadIdx.x;

int j=blockIdx.y * blockDim.y + threadIdx.y;

if (i<rows && j<cols)

	//*(float *)(dist_map+i*rows+j)=5.0;

	*(float *)(dist_map+i*cols+j)=getMinDistance(edge_map, map, i, j, rows, cols, step);

}

device float getMinDistance (char *edge_map, float *map, int i, int j, int rows, int cols, int step){

int k=0, l=0;

int map_cols=cols*2-1;

float minval=LARGE_VAL, distance=LARGE_VAL;

for (k=0; k<rows; k++)

	for (l=0; l<rows; l++)

		if (*(edge_map+k*step+l)){	/*if there is an edge pixel at (k, l)*/

			distance=*(float *)(map+(rows-i+k)*map_cols+(cols-j+l));/*take note of distance from current pixel to this edge pixel*/

			minval=fminf(distance,minval);

		}

return minval;

}[/codebox]

So in getDistance kernel? I’m not getting down & dirty dry-running it, but my guess is buffer overrun:

if (i<rows && j<cols)

	//*(float *)(dist_map+i*rows+j)=5.0;

	*(float *)(dist_map+i*cols+j)=getMinDistance(edge_map, map, i, j, rows, cols, step);

you’ve got a very nasty chunk here:

*(float )(dist_map+icols+j)

Whats the evaluation order of that? Put some brackets, dry run, check its not writing outside the array. I’ve had all sorts of wonderful crashes writing into random GPU mem.

James