I change any way, still same problem. Now I let each thread access 8 bytes and generate 3 hash value,
n = (fileSize/8)*3;
global
void constructKeyValue(const unsigned n, const char *data,
unsigned *d_keys, unsigned *d_vals){
unsigned thread_index = threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x;
if(thread_index >= (n/3) ) return;
d_keys[thread_index] = *(unsigned int*) &data[thread_index*8];
d_vals[thread_index] = (unsigned int)&data[thread_index*8];
printf("\n %d, %d, %d ", thread_index, d_keys[thread_index], d_vals[thread_index] );
__syncthreads();
d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2];
d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];
printf("\n %d, %d, %d ", thread_index, d_keys[thread_index+1], d_vals[thread_index+1] );
__syncthreads();
d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4];
d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];
printf("\n %d, %d, %d ", thread_index, d_keys[thread_index+2], d_vals[thread_index+2] );
__syncthreads();
}
int main(){
constructKeyValue <<<gridDim, kBlockSize>>> (n, data,
d_keys,
d_vals);
CUT_CHECK_ERROR("Failed to construct KeyValue!");
// bool res = Build(n, d_keys, d_vals);
//return res;
CUT_CHECK_ERROR("Failed before build.");
unsigned *d_sorted_keys = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));
CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));
unsigned *d_sorted_vals = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));
CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));
CUT_CHECK_ERROR(“Failed to allocate.”);
cudaThreadSynchronize();
unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);
unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);
CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));
return 0;
}