I have a problem in my CUDA program.
I am trying to solve the sequence alignment problem using GPGPU. I’m working on the protein DB which contains about 191Million character forming about 580000 DB sequence, and I partition it to fit my memory size. I’m running CUDA 4.1 on nVIDIA GT 540M with 1 GB memory. I’m using only 50% of my card memory.
I make each thread compares one sequence from the database against a known query sequence.
This is my kernel:
const int GLOBAL_MAX_BYTES = 500000;
__constant__ char *query = "MPKIIEAIYENGVFKPLQKVDLKEGEKIRILLKKIDVEKFIMAKLPEEKIRELERRFEDENLYX";
__constant__ const int queryLength = 64;
const int hostQueryLength = 64;
__global__ void Smith_Waterman_Kernel(char *records, int *lengths, int *offset, int *results, int recCounts)
{
int tid=threadIdx.x+blockDim.x*blockIdx.x ;
if(tid>=recCounts)
return;
int bid = blockIdx.x;
int iOffset = offset[tid];
int recLen = lengths[tid];
int i, j;
char rec[120];// I fixed it to the maximum sequence length of the first round
int similar=0;
int bdk1[120];//= (int*) malloc(sizeof(int) *compTimes);
int bdk2[120];//= (int*) malloc(sizeof(int) *compTimes);
int *dk1 = bdk1;
int *dk2 = bdk2;
int *tmp;
memcpy(rec, &records[iOffset], recLen*sizeof(char));
int maxvalue=0;
// calclate first row, such that we avoid to insert boundary conditions on the later loop.
for (j = 0; j < recLen; j++)
{
//int value=0;
similar = 0;
i = 0;
// fetch and compare
char ch1 = query[i];
char ch2 = rec[j];
if (ch1 == ch2)
similar = 2;
dk1[j]=similar;
maxvalue=max(maxvalue,dk1[j]);
}
// Calculate the rest of the cells
for (i = 1; i < queryLength; i++)
{
//int value=0;
similar = 0;
j = 0; // column
// fetch and compare
char ch1 = query[i];
char ch2 = rec[j];
if (ch1 == ch2)
similar = 2;
dk2[0]=max(dk1[0]-1, similar);
maxvalue=max(maxvalue,dk2[0]);
for (j = 1; j < recLen; j++)
{
//int value=0;
similar = 0;
// fetch and compare
char ch1 = query[i];
char ch2 = rec[j];
if (ch1 == ch2)
similar = 2;
int upValue = dk1[j];//calc(i - 1, j, dk_1, recLen);
int leftValue = dk2[j-1];//calc(i, j - 1, dk_1, recLen);
int diagonalValue = dk1[j-1];//calc(i - 1, j - 1, dk_2, recLen);
dk2[j]= max(upValue - 1, max(leftValue - 1, diagonalValue + similar));
maxvalue=max(maxvalue,dk2[j]);
}
// swap the 2 resulting rows of the matrix before starting a new comparison.
tmp = dk2;
dk2 = dk1;
dk1 = tmp;
}
results[tid] =maxvalue;
}
then here is the main function:
void main()
{
// read the sorted sequences, lengths, and offsets data files
char* host_1D_Array_pre = (char*)FReadFile("chars.bin", NULL);
int* protein_Offset_pre= (int*) FReadFile("offset.bin", NULL);
int* protein_length_pre = (int*) FReadFile("lenth.bin", numSeq);
*numSeq /= sizeof(int); as it was ina binary file
int it = 0;
char *currentDBChar = host_1D_Array_pre;
float total_elapsed=0.0;
int numrun=0;
while (it < *numSeq)
{
numrun++;
int globalBytesSum = 0;
int recordsBytes = 0;
int maxRecordLength = 0;
float elapsedTime=0.0;
int begin = it;
while (it < *numSeq)
{
int recordLength=protein_length_pre[it];
int globalBytes= recordLength+ 3*sizeof(int);
int blocksCountTillNow = it - begin;
if(globalBytesSum + globalBytes > GLOBAL_MAX_BYTES)
break;
recordsBytes += recordLength;
globalBytesSum += globalBytes;
maxRecordLength = recordLength;
it++;
}
int recordsCount = it - begin;
// Call the kernel
int threadsCount = 256;
int blocksCount = (recordsCount+(threadsCount-1))/threadsCount;
char *dev_records;
cudaMalloc(&dev_records, GLOBAL_MAX_BYTES * sizeof(char));
cudaMemcpy(dev_records, currentDBChar, recordsBytes * sizeof(char), cudaMemcpyHostToDevice);
currentDBChar += recordsBytes ;
int *dev_lengths;
cudaMalloc(&dev_lengths, recordsCount * sizeof(int));
cudaMemcpy(dev_lengths, protein_length_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
int *dev_offsets;
cudaMalloc(&dev_offsets, recordsCount * sizeof(int));
cudaMemcpy(dev_offsets, protein_Offset_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
int *host_results = (int*)malloc(recordsCount * sizeof(int));
int *dev_results;
cudaMalloc(&dev_results, recordsCount * sizeof(int));
//Device Timing
cudaEvent_t start, stop;
cudaError_t cudaStatus;
cudaEventCreate(&start);
cudaEventRecord(start,0);
//Calling the kernel
Smith_Waterman_Kernel<<<blocksCount, threadsCount>>>(dev_records, dev_lengths, dev_offsets, dev_results, recordsCount);
cudaEventCreate(&stop);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaMemcpy(host_results, dev_results,recordsCount * sizeof(int), cudaMemcpyDeviceToHost);
cudaEventElapsedTime(&elapsedTime, start,stop);
printf("Elapsed time : %f ms\n" ,elapsedTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
total_elapsed+=elapsedTime;
free(host_results);
cudaFree(dev_records);
cudaFree(dev_lengths);
cudaFree(dev_offsets);
cudaFree(dev_results);
}
cout<<"----------------------------------records done="<<it<<"--------------------------- \n";
//Free Host Memory
free( numSeq);
free( protein_Offset_pre);
free( protein_length_pre);
free( host_1D_Array_pre);
}
the kernel is running in a while loop it should complete 396 run but I just run 4 times and at the 5 th run when Debug using the Parallel Nsight it gives me that error:
Parallel Nsight Debug
Detected MMU fault on Warp #0
Operation : Global Load
Page Address : 0xff00000000
blockIdx : {1,0,0}
First threadIdx : {0,0,0}