MMU fault when debugging .. any clue

I have a problem in my CUDA program.
I am trying to solve the sequence alignment problem using GPGPU. I’m working on the protein DB which contains about 191Million character forming about 580000 DB sequence, and I partition it to fit my memory size. I’m running CUDA 4.1 on nVIDIA GT 540M with 1 GB memory. I’m using only 50% of my card memory.
I make each thread compares one sequence from the database against a known query sequence.

This is my kernel:

const int GLOBAL_MAX_BYTES = 500000;
 
__constant__ char *query = "MPKIIEAIYENGVFKPLQKVDLKEGEKIRILLKKIDVEKFIMAKLPEEKIRELERRFEDENLYX"; 
__constant__ const int queryLength = 64;
const int hostQueryLength = 64;
__global__ void Smith_Waterman_Kernel(char *records, int *lengths, int *offset, int *results, int recCounts)
{
	int tid=threadIdx.x+blockDim.x*blockIdx.x ;
	if(tid>=recCounts)
		return;

		int bid = blockIdx.x;
		int iOffset = offset[tid];
		int recLen = lengths[tid];
		int i, j;
		char rec[120];// I fixed it to the maximum sequence length of the first round
		int similar=0;

		int bdk1[120];//= (int*) malloc(sizeof(int) *compTimes);
		int bdk2[120];//= (int*) malloc(sizeof(int) *compTimes);
		int *dk1 = bdk1;
		int *dk2 = bdk2;
		int *tmp;
		memcpy(rec, &records[iOffset], recLen*sizeof(char));
		
		int maxvalue=0;

	// calclate first row, such that we avoid to insert boundary conditions on the later loop.
		for (j = 0; j < recLen; j++)
		{
			//int value=0;
			similar = 0;
			i = 0;
			// fetch and compare
			char ch1 = query[i];
			char ch2 = rec[j];
			if (ch1 == ch2)
				similar = 2;

			dk1[j]=similar;
			maxvalue=max(maxvalue,dk1[j]);	
		
		}
		// Calculate the rest of the cells
		for (i = 1; i < queryLength; i++)
		{
			//int value=0;
			similar = 0;
			
			j = 0; // column

			// fetch and compare
			char ch1 = query[i];
			char ch2 = rec[j];
			if (ch1 == ch2)
				similar = 2;

			dk2[0]=max(dk1[0]-1, similar);
			maxvalue=max(maxvalue,dk2[0]);

			for (j = 1; j < recLen; j++)
			{
				//int value=0;
				similar = 0;
				// fetch and compare
				char ch1 = query[i];
				char ch2 = rec[j];
				if (ch1 == ch2)
					similar = 2;

		
				int upValue = dk1[j];//calc(i - 1, j, dk_1, recLen);
				int leftValue = dk2[j-1];//calc(i, j - 1, dk_1, recLen);
				int diagonalValue = dk1[j-1];//calc(i - 1, j - 1, dk_2, recLen);
			
				dk2[j]= max(upValue - 1, max(leftValue - 1, diagonalValue + similar));
				maxvalue=max(maxvalue,dk2[j]);
			}
// swap the 2 resulting rows of the matrix before starting a new comparison.
			tmp = dk2;
			dk2 = dk1;
			dk1 = tmp;
		}
		results[tid] =maxvalue;
}

then here is the main function:

void main()
{
// read the sorted sequences, lengths, and offsets data  files

char* host_1D_Array_pre = (char*)FReadFile("chars.bin", NULL);
	int* protein_Offset_pre= (int*) FReadFile("offset.bin", NULL);
	int* protein_length_pre = (int*) FReadFile("lenth.bin", numSeq);
	*numSeq /= sizeof(int); as it was ina binary file

int it = 0;
	char *currentDBChar = host_1D_Array_pre;
	float total_elapsed=0.0;
	
	int numrun=0;
	
	while (it < *numSeq)
	{
		numrun++;
		
		int globalBytesSum = 0;
		
		int recordsBytes = 0;
		int maxRecordLength = 0;
		float elapsedTime=0.0;

		int begin = it;		

		while (it < *numSeq)
		{
			int recordLength=protein_length_pre[it];
			int globalBytes= recordLength+ 3*sizeof(int);
			int blocksCountTillNow = it - begin;
			if(globalBytesSum + globalBytes  > GLOBAL_MAX_BYTES)
				break;
			recordsBytes += recordLength;
			globalBytesSum += globalBytes;
			maxRecordLength = recordLength;
			it++;
		}
		 
			int recordsCount = it - begin;
			
// Call the kernel

	int threadsCount = 256;
	int blocksCount  = (recordsCount+(threadsCount-1))/threadsCount;
			
	char *dev_records;
	cudaMalloc(&dev_records, GLOBAL_MAX_BYTES * sizeof(char));
	cudaMemcpy(dev_records, currentDBChar, recordsBytes * sizeof(char), cudaMemcpyHostToDevice);
			
	currentDBChar += recordsBytes ;

	int *dev_lengths;
	cudaMalloc(&dev_lengths, recordsCount * sizeof(int));
	cudaMemcpy(dev_lengths, protein_length_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
			
	int *dev_offsets;
	cudaMalloc(&dev_offsets, recordsCount * sizeof(int));
	cudaMemcpy(dev_offsets, protein_Offset_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
			
        int *host_results = (int*)malloc(recordsCount * sizeof(int));
	int *dev_results;
	cudaMalloc(&dev_results, recordsCount * sizeof(int));

	//Device Timing
			cudaEvent_t start, stop;
			cudaError_t cudaStatus;

			 cudaEventCreate(&start);
			 cudaEventRecord(start,0);

			//Calling the kernel
		Smith_Waterman_Kernel<<<blocksCount, threadsCount>>>(dev_records, dev_lengths, dev_offsets, dev_results, recordsCount);

			cudaEventCreate(&stop);
			cudaEventRecord(stop,0);
			cudaEventSynchronize(stop);
			cudaMemcpy(host_results, dev_results,recordsCount * sizeof(int), cudaMemcpyDeviceToHost);

			cudaEventElapsedTime(&elapsedTime, start,stop);
			printf("Elapsed time : %f ms\n" ,elapsedTime);

		
		cudaEventDestroy(start);
		cudaEventDestroy(stop);

		total_elapsed+=elapsedTime;

		free(host_results);

		cudaFree(dev_records);
		cudaFree(dev_lengths);
		cudaFree(dev_offsets);
		cudaFree(dev_results);

	}
	cout<<"----------------------------------records done="<<it<<"--------------------------- \n";

	//Free Host Memory
    free( numSeq);
  
    free( protein_Offset_pre);
    free( protein_length_pre);
    free( host_1D_Array_pre);	
}

the kernel is running in a while loop it should complete 396 run but I just run 4 times and at the 5 th run when Debug using the Parallel Nsight it gives me that error:

Parallel Nsight Debug
Detected MMU fault on Warp #0
Operation : Global Load
Page Address : 0xff00000000
blockIdx : {1,0,0}
First threadIdx : {0,0,0}