Global memory access problem

xiaoxi · July 13, 2011, 2:37am

Hi All,

I wanna hash every 2 bytes in a string,
For example:
“abcd12345678” , i wanna to generate hash value for “abcd”, “cd12”, “1234”, “3456”, “5678”

My code as below:

global
void constructKeyValue(const unsigned n, const char *data,
unsigned *d_keys, unsigned *d_vals){
unsigned thread_index = threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x;

 if (thread_index >= (n+1)/2  ) return;

d_keys[2*thread_index] = *(unsigned int*) &data[thread_index*4]; 
d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];

}

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);
CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

The problem happens on last line code, " unknown error." , anyone can find what is the problem with the CUDA memory allocation?

veda87 · July 13, 2011, 3:32am

I guess the problem is somewhere else. Did your kernel complete successfully. Post your entire code.

xiaoxi · July 13, 2011, 3:51am

Hi All,

I wanna hash every 2 bytes in a string,

For example:

“abcd12345678” , i wanna to generate hash value for “abcd”, “cd12”, “1234”, “3456”, “5678”

My code as below:

global

void constructKeyValue(const unsigned n, const char *data,
				   unsigned  *d_keys, unsigned  *d_vals){

unsigned thread_index = threadIdx.x +

						  blockIdx.x * blockDim.x +

						  blockIdx.y * blockDim.x * gridDim.x;

 	 

 if (thread_index >= (n+1)/2  ) return;

d_keys[2*thread_index] = *(unsigned int*) &data[thread_index*4]; 

d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];
}

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

The problem happens on last line code, " unknown error." , anyone can find what is the problem with the CUDA memory allocation?

I tested, this way has confliction, problem may happens when two threads access interleaved address.

But don’t understand why d_keys[2thread_index] = d_keys[2thread_index+1], these two are access by same thread.

And why this line has problem?

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

xiaoxi · July 13, 2011, 3:57am

I change any way, still same problem. Now I let each thread access 8 bytes and generate 3 hash value,

n = (fileSize/8)*3;

global

void constructKeyValue(const unsigned n, const char *data,

				   unsigned  *d_keys, unsigned  *d_vals){

unsigned thread_index = threadIdx.x +

						  blockIdx.x * blockDim.x +

						  blockIdx.y * blockDim.x * gridDim.x;



 if(thread_index >= (n/3) ) return;

 d_keys[thread_index] = *(unsigned int*) &data[thread_index*8]; 

 d_vals[thread_index] = (unsigned int)&data[thread_index*8];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index],  d_vals[thread_index]   );

 __syncthreads();

 

 d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2]; 

 d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index+1],  d_vals[thread_index+1]   );

 __syncthreads();

 d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4]; 

 d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];

 printf("\n  %d, %d, %d  ", thread_index, d_keys[thread_index+2],  d_vals[thread_index+2]   );

 __syncthreads();

}

int main(){

constructKeyValue <<<gridDim, kBlockSize>>> (n, data,

                                         d_keys,

                                         d_vals);

 CUT_CHECK_ERROR("Failed to construct KeyValue!");

 // bool res = Build(n, d_keys, d_vals);

 //return res;

 CUT_CHECK_ERROR("Failed before build.");

unsigned *d_sorted_keys = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

unsigned *d_sorted_vals = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

CUT_CHECK_ERROR(“Failed to allocate.”);

cudaThreadSynchronize();

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

return 0;

}

LSChien · July 13, 2011, 4:07am

I just worry about misaligned address of “&data[thread_index*4+2]”

d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];

I always fetch characters to shared memory first, then do processing later, for example

__global__ void constructKeyValue(const unsigned n, const int *data,

    unsigned *d_keys, unsigned *d_vals)

{

    __shared__ int smem[TB] ; // TB = number of threads per block, TB = blockDim.x 

    int tid = threadIdx.x ; // local thread id

    int bid = blockIdx.y * gridDim.x + blockIdx.x ; // block id

    int gtid = bid * blockDim.x + threadIdx.x ; // global thread id

smem[tid] = data[bid * TB/2 + tid]; // make sure size of data is large enough or bind data to texture memory

    __syncthreads();

// tid = 2*gid + laneid

    int gid = tid >> 1 ; 

    int laneid = tid & 1 ;

    int x[2] ;

    x[0] = smem[gid];

    x[1] = smem[gid+1];

    int num = x[0];

    if ( laneid ){

        num = (x[0] << 16) + ((x[1]>>16) & 0x0000FFFF); 

    }

    d_keys[gid] = num ;

}

xiaoxi · July 13, 2011, 4:07am

I change any way, still same problem. Now I let each thread access 8 bytes and generate 3 hash value,

n = (fileSize/8)*3;

global

void constructKeyValue(const unsigned n, const char *data,
				   unsigned  *d_keys, unsigned  *d_vals){

unsigned thread_index = threadIdx.x +

						  blockIdx.x * blockDim.x +

						  blockIdx.y * blockDim.x * gridDim.x;



 if(thread_index >= (n/3) ) return;

 d_keys[thread_index] = *(unsigned int*) &data[thread_index*8]; 

 d_vals[thread_index] = (unsigned int)&data[thread_index*8];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index],  d_vals[thread_index]   );

 __syncthreads();

 

 d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2]; 

 d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];

 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index+1],  d_vals[thread_index+1]   );

 __syncthreads();

 d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4]; 

 d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];

 printf("\n  %d, %d, %d  ", thread_index, d_keys[thread_index+2],  d_vals[thread_index+2]   );

 __syncthreads();
}

int main(){

constructKeyValue <<<gridDim, kBlockSize>>> (n, data,
                                         d_keys,

                                         d_vals);

 CUT_CHECK_ERROR("Failed to construct KeyValue!");

 // bool res = Build(n, d_keys, d_vals);

 //return res;

 CUT_CHECK_ERROR("Failed before build.");
unsigned *d_sorted_keys = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

unsigned *d_sorted_vals = NULL;

CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));

CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

CUT_CHECK_ERROR(“Failed to allocate.”);

cudaThreadSynchronize();

unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);

CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

return 0;

}

I tested “abcd123489abcd1234abcd12”, result is as below, first column is thread id, 2rd is key, 3rd is value. I don’t understand why thread 0 first 4 characters(char 0-4) value equals value of chars 2-6 , any solution for the issue, thanks a lot.

0, 1684234849, 87031808

1, 1650538808, 87031816

2, 1650537523, 87031824

0, 1684234849, 87031810

1, 1650538808, 87031818

2, 1650537523, 87031826

0, 875770417, 87031812

1, 842097763, 87031820

2, 842097763, 87031828

xiaoxi · July 13, 2011, 4:24am

__global__ 

void constructKeyValue(const unsigned n, const char *data,

					   unsigned  *d_keys, unsigned  *d_vals){

	unsigned thread_index = threadIdx.x +

							  blockIdx.x * blockDim.x +

							  blockIdx.y * blockDim.x * gridDim.x;

	

	 if(thread_index >= (n/3) ) return;

	 d_keys[thread_index] = *(unsigned int*) &data[thread_index*8]; 

	 d_vals[thread_index] = (unsigned int)&data[thread_index*8];

	 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index],  d_vals[thread_index]   );

	 __syncthreads();

	 

	 d_keys[thread_index+1] = *(unsigned int*) &data[thread_index*8+2]; 

	 d_vals[thread_index+1] = (unsigned int)&data[thread_index*8+2];

	 printf("\n %d, %d, %d  ", thread_index, d_keys[thread_index+1],  d_vals[thread_index+1]   );

	 __syncthreads();

	 d_keys[thread_index+2] = *(unsigned int*) &data[thread_index*8+4]; 

	 d_vals[thread_index+2] = (unsigned int)&data[thread_index*8+4];

	 printf("\n  %d, %d, %d  ", thread_index, d_keys[thread_index+2],  d_vals[thread_index+2]   );

	 __syncthreads();

}

main(){

  //...

	constructKeyValue <<<gridDim, kBlockSize>>> (n, data,d_keys,d_vals);

	CUT_CHECK_ERROR("Failed to construct KeyValue!");

	 

	CUT_CHECK_ERROR("Failed before build.");

	

	

	

	unsigned *d_sorted_keys = NULL;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_keys, sizeof(unsigned) * n));

	CUDA_SAFE_CALL(cudaMemcpy(d_sorted_keys, d_keys, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

	

	unsigned *d_sorted_vals = NULL;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_sorted_vals, sizeof(unsigned) * n));

	CUDA_SAFE_CALL(cudaMemcpy(d_sorted_vals, d_vals, sizeof(unsigned) * n, cudaMemcpyDeviceToDevice));

	

	CUT_CHECK_ERROR("Failed to allocate.");

	

	cudaThreadSynchronize();

	unsigned *h_keys = (unsigned *) malloc(sizeof(unsigned)*n);

	unsigned *h_vals = (unsigned *) malloc(sizeof(unsigned)*n);

	CUDA_SAFE_CALL(cudaMemcpy(h_keys, d_keys , sizeof(unsigned)*n, cudaMemcpyDeviceToHost));

	

return 0;

}

I tested “abcd123489abcd1234abcd12”, result is as below, first column is thread id, 2rd column is key, 3rd column is value. I don’t understand why thread 0 first 4 characters(char 0-4ï¼Œ “abcd”) value equals value of chars 2-6(“cd12”) , any solution for the issue? thanks a lot.

I tried shared memory, same issue

0, 1684234849, 87031808

1, 1650538808, 87031816

2, 1650537523, 87031824

0, 1684234849, 87031810

1, 1650538808, 87031818

2, 1650537523, 87031826

0, 875770417, 87031812

1, 842097763, 87031820

2, 842097763, 87031828

tera · July 13, 2011, 4:31am

As LSChien already pointed out, results of misaligned reads are undefined in CUDA.
Current hardware just masks off the lowest address bits, which is why both the aligned and the misaligned memory access return the same result.

The __byte_perm() intrinsic allows you to shuffle bytes conveniently, and it maps to a single instruction on compute capability 2.x.

xiaoxi · July 13, 2011, 4:38am

I just worry about misaligned address of “&data[thread_index*4+2]”

d_keys[2*thread_index+1] = *(unsigned int*) &data[thread_index*4+2];

I always fetch characters to shared memory first, then do processing later, for example

__global__ void constructKeyValue(const unsigned n, const int *data,

    unsigned *d_keys, unsigned *d_vals)

{

    __shared__ int smem[TB] ; // TB = number of threads per block, TB = blockDim.x 

    int tid = threadIdx.x ; // local thread id

    int bid = blockIdx.y * gridDim.x + blockIdx.x ; // block id

    int gtid = bid * blockDim.x + threadIdx.x ; // global thread id

smem[tid] = data[bid * TB/2 + tid]; // make sure size of data is large enough or bind data to texture memory

    __syncthreads();

// tid = 2*gid + laneid

    int gid = tid >> 1 ; 

    int laneid = tid & 1 ;

    int x[2] ;

    x[0] = smem[gid];

    x[1] = smem[gid+1];

    int num = x[0];

    if ( laneid ){

        num = (x[0] << 16) + ((x[1]>>16) & 0x0000FFFF); 

    }

    d_keys[gid] = num ;

}

Thank you so much :)

xiaoxi · July 13, 2011, 4:40am

Many thanks :)

Topic		Replies	Views
cudamalloc struct problems - unspecified launch failure CUDA Programming and Performance	16	3841	April 8, 2011
Device memory access bug? CUDA Programming and Performance	29	2367	June 16, 2011
CUDA Runtime API error 74: misaligned address CUDA Programming and Performance	11	11308	November 10, 2016
"unaligned memory accesses not supported" CUDA Programming and Performance	1	2876	June 26, 2007
strange error in summation memory problems CUDA Programming and Performance	14	16791	July 21, 2010
Error: "unaligned memory accesses not supported"? CUDA Programming and Performance	6	11212	April 2, 2009
Bad performance or bad coding? CUDA Programming and Performance	21	976	October 12, 2021
Block synchronization issue on Fermi Global memory doesn't seem to by in sync across thread bloc CUDA Programming and Performance	18	3373	April 7, 2011
Copy hash table from GPU to CPU CUDA Programming and Performance	4	989	April 4, 2019
Very strange share memory bank conflicts CUDA Programming and Performance cuda	4	590	November 2, 2021

Global memory access problem

Related topics