I am new to CUDA and I am trying to write an efficient kernel function for parsing a packet containing ‘k’ messages, each of a different length (the lengths are known to me). This is what I have done so far:
//head ----> pointer to page-locked (pinned) host memory, i.e. the start of the packet
//k -----> number of messages in the packet
// Launches the parsing kernel over the packet at `head` and waits for it to finish.
//
// Uses names declared outside this block: `head`, `k`, `d_dataPtr`, `d_bytes`.
// The byte counter and the message count are handed to the GPU through mapped
// (zero-copy) pinned host memory, so no explicit cudaMemcpy is needed.
//
// Any failing CUDA call stops the sequence; buffers allocated here are always
// released before returning.
// NOTE(review): the failing status is not reported anywhere because this
// function returns nothing — log it or return it once the caller's needs are known.
void host_func()
{
    int *h_bytes = NULL;   // host view of the "bytes processed" counter
    int *h_count = NULL;   // host view of the message count read by the kernel
    int *d_count = NULL;   // device alias of h_count

    cudaError_t status = cudaHostGetDevicePointer(&d_dataPtr, head, 0);

    // Mapped only, not write-combined: the GPU writes this counter and the CPU
    // reads it back, and write-combined memory is slow for CPU reads.
    if (status == cudaSuccess)
        status = cudaHostAlloc((void**)&h_bytes, sizeof(int), cudaHostAllocMapped);
    if (status == cudaSuccess)
    {
        *h_bytes = 0;
        status = cudaHostGetDevicePointer(&d_bytes, h_bytes, 0);
    }

    // The kernel takes the message count through a pointer, so the count must
    // live in memory the device can dereference.
    // Assumes `k` is an integer count, as the comment above this function says.
    if (status == cudaSuccess)
        status = cudaHostAlloc((void**)&h_count, sizeof(int), cudaHostAllocMapped);
    if (status == cudaSuccess)
    {
        *h_count = k;
        status = cudaHostGetDevicePointer((void**)&d_count, h_count, 0);
    }

    if (status == cudaSuccess)
    {
        kernel<<<1,26>>>(d_dataPtr, d_bytes, d_count);
        status = cudaGetLastError();        // launch-configuration errors
    }
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();   // replaces deprecated cudaThreadSynchronize

    // When status == cudaSuccess, *h_bytes now holds the kernel's result;
    // consume it here, before the buffer is released below.

    if (h_count != NULL)
        cudaFreeHost(h_count);
    if (h_bytes != NULL)
        cudaFreeHost(h_bytes);
}
// Walks a packet of back-to-back messages and reports how many bytes were consumed.
//
// Parameters:
//   data           - first byte of the packet. The first byte of each message is
//                    its type tag ('A', 'B' or 'C').
//   bytesProcessed - counter that is increased by the total size of the messages
//                    that were recognised.
//   numOfMessages  - pointer to the number of messages in the packet.
//
// Message lengths differ, so the offset of message i is only known after
// messages 0..i-1 have been sized; every thread therefore walks the packet
// serially. Parsing stops at the first unrecognised tag, because the length of
// that message (and so the position of everything after it) is unknown.
//
// Only thread 0 of block 0 updates *bytesProcessed, once, after the walk. This
// avoids all threads racing on the same address.
//
// NOTE(review): every launched thread currently reaches every case below. To make
// one thread own one message type, guard the per-type processing with something
// like `threadIdx.x == tag - 'A'` — confirm the intended thread-to-type mapping.
__global__ void kernel(char *data, int *bytesProcessed, int *numOfMessages)
{
    // Read the count once instead of dereferencing the pointer on every iteration.
    const int messageCount = *numOfMessages;

    int consumed = 0;   // bytes parsed so far; kept in a local, written out once
    for (int i = 0; i < messageCount; i++)
    {
        int length = 0;
        switch (data[consumed])
        {
        case 'A':
            //do some processing
            length = static_cast<int>(sizeof(MessageA));
            break;
        case 'B':
            //do some processing
            length = static_cast<int>(sizeof(MessageB));
            break;
        case 'C':
            //do some processing
            // NOTE(review): 'C' advances by sizeof(MessageB), same as 'B'. This looks
            // like a copy-paste slip (sizeof(MessageC)?) — confirm against the
            // message definitions.
            length = static_cast<int>(sizeof(MessageB));
            break;
        default:
            // Unknown tag: leave length at 0 so the walk stops below.
            break;
        }

        if (length == 0)
            break;

        consumed += length;
    }

    if (blockIdx.x == 0 && threadIdx.x == 0)
        *bytesProcessed += consumed;
}
I am trying to write the kernel so that all messages of the same type are processed by the same thread.
Thanks in advance for your help!!