Global memory access problem

Hi All,

I wanna hash every 2 bytes in a string,
For example:
“abcd12345678” , i wanna to generate hash value for “abcd”, “cd12”, “1234”, “3456”, “5678”

My code as below:

global
void constructKeyValue(const unsigned n, const char *data,
unsigned *d_keys, unsigned *d_vals){
unsigned thread_index = threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x;

 if (thread_index >= (n+1)/2  ) return;

d_keys[2*thread_index] = *(unsigned int*) &data[thread_index*4]; 
d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];

}

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);
CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

The problem happens on last line code, " unknown error." , anyone can find what is the problem with the CUDA memory allocation?

I guess the problem is somewhere else. Did your kernel complete successfully. Post your entire code.

I tested, this way has confliction, problem may happens when two threads access interleaved address.

But don’t understand why d_keys[2thread_index] = d_keys[2thread_index+1], these two are access by same thread.

And why this line has problem?

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

I change any way, still same problem. Now I let each thread access 8 bytes and generate 3 hash value,

n = (fileSize/8)*3;

global

void constructKeyValue(const unsigned n, const char *data,

				   unsigned  *d_keys, unsigned  *d_vals){

unsigned thread_index = threadIdx.x +

						  blockIdx.x * blockDim.x +

						  blockIdx.y * blockDim.x * gridDim.x;



 if(thread_index >= (n/3) ) return;

 d_keys[thread_index] = *(unsigned int*) &data[thread_index*8]; 

 d_vals[thread_index] = (unsigned int)&data[thread_index*8];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index],  d_vals[thread_index]   );

 __syncthreads();

 

 d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2]; 

 d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index+1],  d_vals[thread_index+1]   );

 __syncthreads();

 d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4]; 

 d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];

 printf("\n  %d, %d, %d  ", thread_index, d_keys[thread_index+2],  d_vals[thread_index+2]   );

 __syncthreads();

}

int main(){

constructKeyValue <<<gridDim, kBlockSize>>> (n, data,

                                         d_keys,

                                         d_vals);

 CUT_CHECK_ERROR("Failed to construct KeyValue!");

 // bool res = Build(n, d_keys, d_vals);

 //return res;

 CUT_CHECK_ERROR("Failed before build.");

unsigned *d_sorted_keys = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

unsigned *d_sorted_vals = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

CUT_CHECK_ERROR(“Failed to allocate.”);

cudaThreadSynchronize();

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

return 0;

}

I just worry about misaligned address of “&data[thread_index*4+2]”

d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];

I always fetch characters to shared memory first, then do processing later, for example

__global__ void constructKeyValue(const unsigned n, const int *data,

    unsigned *d_keys, unsigned *d_vals)

{

    __shared__ int smem[TB] ; // TB = number of threads per block, TB = blockDim.x 

    int tid = threadIdx.x ; // local thread id

    int bid = blockIdx.y * gridDim.x + blockIdx.x ; // block id

    int gtid = bid * blockDim.x + threadIdx.x ; // global thread id

smem[tid] = data[bid * TB/2 + tid]; // make sure size of data is large enough or bind data to texture memory

    __syncthreads();

// tid = 2*gid + laneid

    int gid = tid >> 1 ; 

    int laneid = tid & 1 ;

    int x[2] ;

    x[0] = smem[gid];

    x[1] = smem[gid+1];

    int num = x[0];

    if ( laneid ){

        num = (x[0] << 16) + ((x[1]>>16) & 0x0000FFFF); 

    }

    d_keys[gid] = num ;

}

I tested “abcd123489abcd1234abcd12”, result is as below, first column is thread id, 2rd is key, 3rd is value. I don’t understand why thread 0 first 4 characters(char 0-4) value equals value of chars 2-6 , any solution for the issue, thanks a lot.

0, 1684234849, 87031808

1, 1650538808, 87031816

2, 1650537523, 87031824

0, 1684234849, 87031810

1, 1650538808, 87031818

2, 1650537523, 87031826

0, 875770417, 87031812

1, 842097763, 87031820

2, 842097763, 87031828

__global__ 

void constructKeyValue(const unsigned n, const char *data,

					   unsigned  *d_keys, unsigned  *d_vals){

	unsigned thread_index = threadIdx.x +

							  blockIdx.x * blockDim.x +

							  blockIdx.y * blockDim.x * gridDim.x;

	

	 if(thread_index >= (n/3) ) return;

	 d_keys[thread_index] = *(unsigned int*) &data[thread_index*8]; 

	 d_vals[thread_index] = (unsigned int)&data[thread_index*8];

	 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index],  d_vals[thread_index]   );

	 __syncthreads();

	 

	 d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2]; 

	 d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];

	 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index+1],  d_vals[thread_index+1]   );

	 __syncthreads();

	 d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4]; 

	 d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];

	 printf("\n  %d, %d, %d  ", thread_index, d_keys[thread_index+2],  d_vals[thread_index+2]   );

	 __syncthreads();

}

main(){

  //...

	constructKeyValue <<<gridDim, kBlockSize>>> (n, data,d_keys,d_vals);

	CUT_CHECK_ERROR("Failed to construct KeyValue!");

	 

	CUT_CHECK_ERROR("Failed before build.");

	

	

	

	unsigned *d_sorted_keys = NULL;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));

	CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

	

	unsigned *d_sorted_vals = NULL;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));

	CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

	

	CUT_CHECK_ERROR("Failed to allocate.");

	

	cudaThreadSynchronize();

	unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

	unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);

	CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

	

return 0;

}

I tested “abcd123489abcd1234abcd12”, result is as below, first column is thread id, 2rd column is key, 3rd column is value. I don’t understand why thread 0 first 4 characters(char 0-4, “abcd”) value equals value of chars 2-6(“cd12”) , any solution for the issue? thanks a lot.

I tried shared memory, same issue

0, 1684234849, 87031808

1, 1650538808, 87031816

2, 1650537523, 87031824

0, 1684234849, 87031810

1, 1650538808, 87031818

2, 1650537523, 87031826

0, 875770417, 87031812

1, 842097763, 87031820

2, 842097763, 87031828

As LSChien already pointed out, results of misaligned reads are undefined in CUDA.
Current hardware just masks off the lowest address bits, which is why both the aligned and the misaligned memory access return the same result.

The __byte_perm() intrinsic allows you to shuffle bytes conveniently, and it maps to a single instruction on compute capability 2.x.

Thank you so much :)

Many thanks :)