HI I have the following kernel that computes two variables TW and Tag
based on input buffer
I have also included the c func that does the same job
I see that the results from the GPU are completely erratic
Is there something that I’m completely unaware when
implementing bit operations on gpu
Can someone please let me know
Or is there any other fast method to do the same with correct results
Thanx in advance
Get_Tag_TW<<< dimGrid, dimBlock >>>(d_buffer,d_TW,d_Tag);
/// kernel for GPU
__global__
void Get_Tag_TW(unsigned char *Buffer, int *TW,unsigned char *Tag)
{
__shared__ unsigned char shrd_buffer[BLOCK_SIZE];
__shared__ int shrd_TW[BLOCK_SIZE];
__shared__ unsigned char shrd_Tag[BLOCK_SIZE];
unsigned int index_in = blockIdx.x * blockDim.x + threadIdx.x;
if(index_in < NBYTES)
{
shrd_Tag[threadIdx.x]= 0x00;
shrd_Tag[threadIdx.x] = shrd_Tag[threadIdx.x] | (( Buffer[8*threadIdx.x+3] >> 3)& 0x08);
shrd_Tag[threadIdx.x] = shrd_Tag[threadIdx.x] | ((Buffer[8*threadIdx.x+7] >>4)& 0x04);
shrd_Tag[threadIdx.x] = shrd_Tag[threadIdx.x] | ((Buffer[8*threadIdx.x+5])& 0x03);
shrd_TW[threadIdx.x] = Buffer[8*threadIdx.x+5];
shrd_TW[threadIdx.x] = (shrd_TW[threadIdx.x]<< 8 )|Buffer[8*threadIdx.x+4];
shrd_TW[threadIdx.x] = (shrd_TW[threadIdx.x]<< 8 )|Buffer[8*threadIdx.x+1];
shrd_TW[threadIdx.x] = (shrd_TW[threadIdx.x]<< 8 )|Buffer[8*threadIdx.x];
__syncthreads();
}
TW[index_in] = shrd_TW[threadIdx.x];
Tag[index_in]= shrd_Tag[threadIdx.x];
}
/// C func for same operation on CPU
void readBuffer(unsigned char* Tag,int* TW,unsigned char* addressBuffer )
{
int i=0;
for (i=0;i<NBYTES;i++)
{
Tag[i]= 0x00;
Tag[i] = Tag[i] | (( addressBuffer[8*i+3] >> 3)& 0x08);
Tag[i] = Tag[i] | ((addressBuffer[8*i+7] >>4)& 0x04);
Tag[i] = Tag[i] | ((addressBuffer[8*i+5])& 0x03);
TW[i] = addressBuffer[8*i+5];
TW[i] = (TW[i]<< 8 )|addressBuffer[8*i+4];
TW[i] = (TW[i]<< 8 )|addressBuffer[8*i+1];
TW[i] = (TW[i]<< 8 )|addressBuffer[8*i+0];
}
}