Memory coalescing and synchronizing problem

Hi, everyone

I wrote several versions of a device function trying to get the best runtime performance I can. Function get_1itemset_Count5 works and its output is correct, but its memory accesses are non-coalesced: fewer than 29 integers are read from global memory into shared memory each time, and the count is not fixed, so there is an alignment problem on every load. I therefore wrote function get_1itemset_Count6 to make the accesses coalesced by reading 256 integers at a time. It compiles, but the output is totally wrong. Tracing the error step by step, I found that each block reads all 256 integers correctly when I comment out __syncthreads(); otherwise only 68 of the 256 integers are read (and only 19 integers with blockDim(64, 1) when __syncthreads() is commented out). The key numbers (68, 19) are not multiples of 32, and the problem occurs at the very beginning of the device function, which doesn't make sense to me — I'm confused, and I hope someone can give me some advice.

Thanks in advance for your help.

-Gimurk

CUDA configuration and codes as following:

DEVICE: Geforce 8800 GT

SYSTEM: Fedora 7

CUDA driver: NVIDIA Driver for Linux with CUDA Support (169.09)

CUDA toolkit: CUDA Toolkit version 1.1 for Fedora 7

CUDA SDK: CUDA SDK version 1.1 for Linux

[codebox]// count the candidate frequent 1-itemset by scaning database

// Count the candidate frequent 1-itemsets by scanning the database.
// Each thread counts the occurrences of one item id across all transactions.
// Transactions are streamed one at a time into dynamic shared memory.
//
// Layout assumption (from the original code — confirm against the host side):
//   sdata[1]            = size of the current transaction
//   sdata[2..tran_sz+1] = the items of the current transaction
//   sdata[tran_sz+3]    = presumably the size of the NEXT transaction — verify
//
// Launch requirements:
//   - dynamic shared memory >= (DBASE_MAX_TRAN_LEN + 4) * sizeof(int) BYTES
//     (the 3rd <<<>>> argument is a byte count, not an int count)
//   - sdata_sz is the number of ints to load for the first transaction
__global__ void get_1itemset_Count5( int stream_start, int itemset_num, int trans_num, int sdata_sz, int *dk_buffer, ITEMCNT *dk_freq_itemset_cnt)
{
	extern __shared__ int sdata[];             // sdata[DBASE_MAX_TRAN_LEN + 4]

	int item_idx = stream_start + blockIdx.x * blockDim.x + threadIdx.x;
	int item_cnt = 0, address = 0, tran_sz;
	int i, j;

	// NOTE: __syncthreads() must be reached by ALL threads of the block.
	// The original code put the whole body (barriers included) inside
	// "if(item_idx < itemset_num)", so tail threads of the last block
	// skipped the barriers — undefined behavior. Only the counting and
	// the final stores are guarded now; every thread takes part in the
	// shared-memory loads and barriers.
	bool active = (item_idx < itemset_num);

	// Read the first transaction into shared memory.
	if(threadIdx.x < sdata_sz)
		sdata[threadIdx.x] = dk_buffer[threadIdx.x];
	__syncthreads();

	tran_sz = sdata[1];                        // sdata[1]: size of current transaction

	// Count the current transaction, then stream in the next one.
	for(i = 0; i < trans_num - 1; i++) {
		if(active) {
			for(j = 2; j < tran_sz + 2; j++) { // all branches evaluated (SIMT)
				if(sdata[j] == item_idx) item_cnt++;
			}
		}

		address += tran_sz + 2;

		// Barrier BEFORE the overwrite: without it, fast threads start
		// writing the next transaction into sdata while slower threads
		// are still counting the current one (write-after-read race).
		__syncthreads();

		// sdata[tran_sz + 3] is presumably the next transaction's size — TODO confirm.
		if(threadIdx.x < sdata[tran_sz + 3] + 4)
			sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
		__syncthreads();

		tran_sz = sdata[1];
	}

	// Count the last transaction and write out this thread's result.
	if(active) {
		for(j = 2; j < tran_sz + 2; j++) {
			if(sdata[j] == item_idx) item_cnt++;
		}
		dk_freq_itemset_cnt[item_idx].itemset_cnt = item_cnt;
		dk_freq_itemset_cnt[item_idx].itemset_id = item_idx;
	}
}

[codebox]// count the candidate frequent 1-itemset by scaning database

// Count the candidate frequent 1-itemsets by scanning the database.
// Coalesced variant: each block streams blockDim.x consecutive ints at a
// time from dk_buffer into shared memory, so every warp load is contiguous.
//
// Launch requirements:
//   - blockDim.x must be larger than 16 (the re-alignment step below
//     steps back 64 bytes = 16 ints)
//   - dynamic shared memory must be blockDim.x * sizeof(int) BYTES.
//     The 3rd <<<>>> argument is a BYTE count: launching with a bare
//     "256" allocates 256 bytes = only 64 ints, so for blockDim.x = 256
//     the store sdata[threadIdx.x] ran off the end of the allocation —
//     this is the source of the corrupted reads, not __syncthreads().
//     (The barrier here is legal: the surrounding branch tests
//     blockIdx.x, which is uniform across the block.)
__global__ void get_1itemset_Count6( int stream_start, int itemset_num, int trans_num, int *dk_buffer, ITEMCNT *dk_freq_itemset_cnt)
{ // blockDim.x must be larger than 16
	extern __shared__ int sdata[];               // sdata[blockDim.x]

	int item_idx = stream_start + blockIdx.x * blockDim.x + threadIdx.x;
	int item_cnt = 0, address = 0, tran_sz, sdata_address = 0;
	int i, j;                                    // was "int i j;" — missing comma, a compile error

	if(blockIdx.x < gridDim.x - 1) {
		// First load: one coalesced int per thread.
		sdata[threadIdx.x] = dk_buffer[threadIdx.x];
		__syncthreads();        // safe: all threads of the block reach this
		//tran_sz = sdata[1];                        // sdata[1]: store the size of current transaction

		//count and read next transaction block if necessary
		//for(i = 0; i < trans_num; i++) {
		//	for(j = 0; j < tran_sz; j++)             // all of the two branches will be executed
		//		if(sdata[sdata_address + 2 + j] == item_idx) item_cnt++;
		//	sdata_address += (tran_sz + 2);
		//	if(sdata_address+1 > blockDim.x-1 || sdata_address + sdata[sdata_address+1] + 1 > blockDim.x - 1) {
		//		address = address + blockDim.x - 16;   // step 64 bytes backward
		//		sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
		//		__syncthreads();                 // last load will access data beyond dk_buffer, will be a error?
		//		sdata_address = sdata_address%16;
		//	}
		//	tran_sz = sdata[sdata_address + 1];
		//}

		// Debug output: echo what this thread loaded so the host can
		// verify the shared-memory contents.
		dk_freq_itemset_cnt[item_idx].itemset_cnt = sdata[threadIdx.x]; // item_cnt;
		dk_freq_itemset_cnt[item_idx].itemset_id  = item_idx;
	}
	//else {    // the last block has fewer useful threads than blockDim.x
		// first load
	//	sdata[threadIdx.x] = dk_buffer[threadIdx.x];
	//	__syncthreads();
	//	tran_sz = sdata[1];                        //sdata[1]: store the size of current transaction
		// count and read next transaction block if necessary
	//	for(i = 0; i < trans_num - 1; i++) {
	//		if(item_idx < itemset_num) {
	//			for(j = 0; j < tran_sz; j++) //all of the two branches will be executed
	//				if(sdata[sdata_address + 2 + j] == item_idx) item_cnt++;
	//		}
	//		sdata_address += (tran_sz + 2);
	//		if(sdata_address+1 > blockDim.x-1 || sdata_address + sdata[sdata_address+1] + 1 > blockDim.x - 1) {
	//			address = address + blockDim.x - 16;  // step 64 bytes backward
	//			sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
	//			__syncthreads();                      // last load will access data beyond dk_buffer, will be a error?
	//			sdata_address = sdata_address%16;
	//		}
	//		tran_sz = sdata[sdata_address + 1];
	//	}
	//	if(item_idx < itemset_num) {
	//		dk_freq_itemset_cnt[item_idx].itemset_cnt = item_cnt;
	//		dk_freq_itemset_cnt[item_idx].itemset_id = item_idx;
	//	}
	//}
}

[codebox]main:

// NOTE: the 3rd execution-configuration argument is the dynamic shared-memory
// size in BYTES, not in number of ints. The original call passed a bare 256,
// which allocates only 64 ints and made sdata[threadIdx.x] overflow the
// allocation — the source of the corrupted loads. The same bytes-vs-ints fix
// is applied to the commented-out Count3/Count5 launches below.
//get_1itemset_Count3<<<gridDim, blockDim, (DBASE_MAX_TRAN_LEN+100)*sizeof(int)>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, DBASE_MAX_TRAN_LEN+4, d_buffer, d_itemsets_cnt_struct); //DBASE_MAX_TRAN_LEN+4

//get_1itemset_Count5<<<gridDim, blockDim, (DBASE_MAX_TRAN_LEN+100)*sizeof(int)>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, DBASE_MAX_TRAN_LEN+4, d_buffer, d_itemsets_cnt_struct);   //DBASE_MAX_TRAN_LEN+4

get_1itemset_Count6<<<gridDim, blockDim, blockDim.x * sizeof(int)>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, d_buffer, d_itemsets_cnt_struct);

[/codebox]

The only difference in main is which kernel call is commented out.

I know where the problem is now: it is the dynamic shared memory size in the execution configuration, which must be specified in bytes (not as a number of integers) — I had misunderstood that.