my program calls the kernel in a for loop, I make sure that the max. data passed to be shared in less than 15K and the global data to be less 1GB in each iteration oft he kernel execution. the data at the device i then copied from the global memory to shared for each block to start the execution… the first iteration is the only one that gets results from the device kernel, those results are not all right… I think the problem is in the memory accessing inside the kernel any suggestions??
const int GLOBAL_MAX_BYTES = 900000; // global size
const int SHARED_MAX_BYTES = 15900; // shared size
__constant__ char *query = "MPKIIEAIYENGVFKPLQKVDLKEGEKIRILLKKIDVEKFIMAKLPEEKIRELERRFEDENLY"; //hardcoded
__constant__ const int queryLength = 63; //hardcoded
const int hostQueryLength = 63;
__global__ void Smith_Waterman_Kernel(char *records, int *lengths, int *offset, int *results, int maxRecLen)
{
int bid = blockIdx.x;
int tid;
int idx = 0;
int i, j;
extern __shared__ char allShared[];// for passing multiple arrays of shared memory
__shared__ char *rec;
__shared__ int *dk_1;
__shared__ int *dk_2;
rec = allShared;
dk_1 = (int*)(allShared + maxRecLen);
dk_2 = dk_1 + queryLength;
int recLen = lengths[bid];
tid = threadIdx.x * recLen;
for (int x = threadIdx.x; x < recLen; x += blockDim.x)
rec[x] = records[offset[bid] + x];
__syncthreads();
int similar = 0 ;
int end = queryLength + recLen - 2 - (threadIdx.x * 2);
for(int k=0; k <= queryLength + recLen - 2; k++)
{
idx = k - threadIdx.x;
int value=0;
similar = 0;
if((idx < 0) || (idx > end)) ;
else
{
i = tid / recLen;
j = tid % recLen;
// fetch and compare
char ch1 = query[i];
char ch2 = rec[j];
if (ch1 == ch2)
similar = 2;
int upValue = calc(i - 1, j, dk_1, recLen); // another simple kernel that calculats the value given 3 paramertes
int leftValue = calc(i, j - 1, dk_1, recLen);
int diagonalValue = calc(i - 1, j - 1, dk_2, recLen);
value = max2Values(upValue - 1, max2Values(leftValue - 1, diagonalValue + similar));// get the max of two values
results[bid] = max2Values(results[bid], value);
}//else
__syncthreads();
if (idx < 0 || idx > end)
continue;
// swap dk_1 and dk_2
dk_2[threadIdx.x] = dk_1[threadIdx.x];
dk_1[threadIdx.x] = value;
int d = i + j;
//increment the tid
if(d < recLen - 1)
tid++;
else
tid += recLen;
// Synch
__syncthreads();
}
}
int main()
{
// some code
int it = 0;
char *currentDBChar = host_1D_Array_pre;
float total_elapsed=0.0;
while (it < *numSeq)
{
int globalBytesSum = 0;
int sharedBytes = 0;
int recordsBytes = 0;
int maxRecordLength = 0;
float elapsedTime=0.0;
int begin = it;
while (it < *numSeq)
{
int recordLength = protein_length_pre[it];
int globalBytes = recordLength + 3 * 4; //the recordlength+ passing 3 int arrays
int blocksCountTillNow = it - begin;
int temp = (recordLength + min(hostQueryLength, recordLength) * 4 * 2)*(blocksCountTillNow + 1);
if (temp > SHARED_MAX_BYTES || globalBytesSum + globalBytes > GLOBAL_MAX_BYTES)
break;
recordsBytes += recordLength;
sharedBytes = temp;
globalBytesSum += globalBytes;
maxRecordLength = recordLength;
it++;
}
// Call the kernel
int blocksCount = it - begin;
int recordsCount = blocksCount;
int threadsCount = hostQueryLength;
char *dev_records;
cudaMalloc(&dev_records, recordsBytes * sizeof(char));
cudaMemcpy(dev_records, currentDBChar, recordsBytes * sizeof(char), cudaMemcpyHostToDevice);
currentDBChar += recordsBytes;
int *dev_lengths;
cudaMalloc(&dev_lengths, recordsCount * sizeof(int));
cudaMemcpy(dev_lengths, protein_length_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
int *dev_offsets;
cudaMalloc(&dev_offsets, recordsCount * sizeof(int));
cudaMemcpy(dev_offsets, protein_Offset_pre + begin, recordsCount * sizeof(int), cudaMemcpyHostToDevice);
int *host_results = (int*)malloc(recordsCount * sizeof(int));
int *dev_results;
cudaMalloc(&dev_results, recordsCount * sizeof(int));
//Calling the kernel
Smith_Waterman_Kernel(dev_records, dev_lengths, dev_offsets, dev_results, maxRecordLength);
cudaMemcpy(host_results, dev_results, recordsCount * sizeof(int), cudaMemcpyDeviceToHost);
free(host_results);
cudaFree(dev_records);
cudaFree(dev_lengths);
cudaFree(dev_offsets);
cudaFree(dev_results);
}
printf("Total Elapsed time : %f ms
" ,total_elapsed);
//Free Host Memory
return 0;
}