index counts error

I am trying to fill a bitmap with the index during my CUDA learning.
The following code is the code:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define LOG_BLOCK 8
#define BLOCKDIM 256

typedef unsigned char uchar_t;
unsigned char g_bitCountUchar =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

global
void fillBitmap(int n, uchar_t* bitmap)
{
const unsigned int index = (blockIdx.x << LOG_BLOCK) + threadIdx.x;

__syncthreads();
if (index < n)
{
	int byteNum = index / (sizeof(uchar_t) * 8);
	int bitNum = index % (sizeof(uchar_t) * 8);
	bitmap[byteNum] |= (1<<bitNum);
}

}

int main()
{
int n = 1000;
int bitmapSize = (n / (sizeof(uchar_t) * 8) + (size_t)(n % (sizeof(uchar_t)*8) != 0));
int num_blocks = 1 + ((n-1) >> LOG_BLOCK);

uchar_t *d_bitmap;
uchar_t *bitmap = (uchar_t*) malloc(bitmapSize);

cudaMalloc((void**) &d_bitmap, bitmapSize);
cudaMemset(d_bitmap, 0, bitmapSize);

fillBitmap<<<num_blocks, BLOCKDIM, 0>>>(n, d_bitmap);	
cudaThreadSynchronize();
cudaMemcpy(bitmap, d_bitmap, bitmapSize, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();

int count = 0;
for (int i = 0; i < bitmapSize; i++)
{
	count += g_bitCountUchar[bitmap[i]];
}
printf("%d \n",count);

}

The g_bitCountUchar is the number of "1"s from 0 to 255. And g_bitCountUchar is proved to be right.

The expected result of the code is 1000,but it turns out to be 125 :confused:

I don’t where the problem is ? I am using GTX260,CUDA2.2.

Thanks.