Hi, everyone
I wrote several versions of a device function trying to get the best runtime performance I can. Function get_1itemset_Count5 works and its output is correct, but its memory access is non-coalesced: fewer than 29 integers are read from global memory into shared memory on each load, the number of integers varies from load to load, and there is an alignment problem every time. So I wrote function get_1itemset_Count6 to make the access coalesced by reading 256 integers at a time. It compiles, but the output is totally wrong. I traced the error step by step and found that every block reads its 256 integers correctly when I comment out __syncthreads(); with __syncthreads() in place, only 68 of the 256 integers are read correctly (and only 19 with blockDim(64, 1)). The key numbers (68, 19) are not multiples of 32, and the problem occurs right at the beginning of the device function, which makes no sense to me, so I am confused. I hope someone can give me some advice.
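To be clear about what I mean by coalesced: this is the basic pattern I am trying to follow, where on each pass thread k of a half-warp reads word k of an aligned segment (just a minimal sketch, not my real kernel; g_in, g_out and buffer_len are placeholder names):
[codebox]// minimal sketch of the tiled, coalesced read pattern (placeholder names)
__global__ void coalesced_tile_copy(int *g_in, int *g_out, int buffer_len)
{
    extern __shared__ int tile[]; // launched with blockDim.x * sizeof(int) bytes

    // the loop bounds are uniform within a block, so __syncthreads() is safe here
    for(int base = blockIdx.x * blockDim.x; base < buffer_len; base += gridDim.x * blockDim.x) {
        int idx = base + threadIdx.x;
        if(idx < buffer_len)
            tile[threadIdx.x] = g_in[idx]; // contiguous, aligned reads per half-warp
        __syncthreads(); // the whole tile is now in shared memory

        // ... scan tile[] here ...

        __syncthreads(); // don't reload tile[] while other threads may still read it
        if(idx < buffer_len)
            g_out[idx] = tile[threadIdx.x]; // write back so the result can be checked
    }
}[/codebox]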
Thanks in advance for your help.
-Gimurk
CUDA configuration and code are as follows:
DEVICE: Geforce 8800 GT
SYSTEM: Fedora 7
CUDA driver: NVIDIA Driver for Linux with CUDA Support (169.09)
CUDA toolkit: CUDA Toolkit version 1.1 for Fedora 7
CUDA SDK: CUDA SDK version 1.1 for Linux
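ITEMCNT is just a small record holding an item id and its support count (paraphrased here; the exact types and field order in my header may differ):
[codebox]// paraphrased; the actual definition in my header may differ slightly
typedef struct {
    int itemset_id;  // the candidate 1-itemset (item id)
    int itemset_cnt; // its support count
} ITEMCNT;[/codebox]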
[codebox]// count the candidate frequent 1-itemsets by scanning the database
__global__ void get_1itemset_Count5(int stream_start, int itemset_num, int trans_num, int sdata_sz, int *dk_buffer, ITEMCNT *dk_freq_itemset_cnt)
{
    extern __shared__ int sdata[]; // sdata[DBASE_MAX_TRAN_LEN + 4]
    int item_idx = stream_start + blockIdx.x * blockDim.x + threadIdx.x;
    int item_cnt = 0, address = 0, tran_sz;
    int i, j;

    if(item_idx < itemset_num) {
        // read the first transaction
        if(threadIdx.x < sdata_sz)
            sdata[threadIdx.x] = dk_buffer[threadIdx.x];
        __syncthreads();
        tran_sz = sdata[1]; // sdata[1] stores the size of the current transaction

        // count, then read the next transaction
        for(i = 0; i < trans_num - 1; i++) {
            for(j = 2; j < tran_sz + 2; j++) { // both sides of the branch will be evaluated
                if(sdata[j] == item_idx) item_cnt++;
            }
            address += tran_sz + 2;
            if(threadIdx.x < sdata[tran_sz + 3] + 4)
                sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
            __syncthreads();
            tran_sz = sdata[1];
        }

        // count the last transaction
        for(j = 2; j < tran_sz + 2; j++) { // both sides of the branch will be evaluated
            if(sdata[j] == item_idx) item_cnt++;
        }
        dk_freq_itemset_cnt[item_idx].itemset_cnt = item_cnt;
        dk_freq_itemset_cnt[item_idx].itemset_id = item_idx;
    }
}[/codebox]
[codebox]// count the candidate frequent 1-itemsets by scanning the database
__global__ void get_1itemset_Count6(int stream_start, int itemset_num, int trans_num, int *dk_buffer, ITEMCNT *dk_freq_itemset_cnt)
{   // blockDim.x must be larger than 16
    extern __shared__ int sdata[]; // sdata[blockDim.x]
    int item_idx = stream_start + blockIdx.x * blockDim.x + threadIdx.x;
    int item_cnt = 0, address = 0, tran_sz, sdata_address = 0;
    int i, j;

    if(blockIdx.x < gridDim.x - 1) {
        // first load
        sdata[threadIdx.x] = dk_buffer[threadIdx.x];
        __syncthreads(); // causes the memory access error. why???????
        //tran_sz = sdata[1]; // sdata[1] stores the size of the current transaction

        // count, and read the next transaction block if necessary
        //for(i = 0; i < trans_num; i++) {
        //    for(j = 0; j < tran_sz; j++) // both sides of the branch will be executed
        //        if(sdata[sdata_address + 2 + j] == item_idx) item_cnt++;
        //    sdata_address += (tran_sz + 2);
        //    if(sdata_address + 1 > blockDim.x - 1 || sdata_address + sdata[sdata_address + 1] + 1 > blockDim.x - 1) {
        //        address = address + blockDim.x - 16; // overlap the previous tile by 16 ints (64 bytes)
        //        sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
        //        __syncthreads(); // the last load may read past the end of dk_buffer; will that be an error?
        //        sdata_address = sdata_address % 16;
        //    }
        //    tran_sz = sdata[sdata_address + 1];
        //}
        dk_freq_itemset_cnt[item_idx].itemset_cnt = sdata[threadIdx.x]; // item_cnt;
        dk_freq_itemset_cnt[item_idx].itemset_id = item_idx;
    }
    //else { // the last block covers fewer valid items than blockDim.x
    //    // first load
    //    sdata[threadIdx.x] = dk_buffer[threadIdx.x];
    //    __syncthreads();
    //    tran_sz = sdata[1]; // sdata[1] stores the size of the current transaction
    //    // count, and read the next transaction block if necessary
    //    for(i = 0; i < trans_num - 1; i++) {
    //        if(item_idx < itemset_num) {
    //            for(j = 0; j < tran_sz; j++) // both sides of the branch will be executed
    //                if(sdata[sdata_address + 2 + j] == item_idx) item_cnt++;
    //        }
    //        sdata_address += (tran_sz + 2);
    //        if(sdata_address + 1 > blockDim.x - 1 || sdata_address + sdata[sdata_address + 1] + 1 > blockDim.x - 1) {
    //            address = address + blockDim.x - 16; // overlap the previous tile by 16 ints (64 bytes)
    //            sdata[threadIdx.x] = dk_buffer[address + threadIdx.x];
    //            __syncthreads(); // the last load may read past the end of dk_buffer; will that be an error?
    //            sdata_address = sdata_address % 16;
    //        }
    //        tran_sz = sdata[sdata_address + 1];
    //    }
    //    if(item_idx < itemset_num) {
    //        dk_freq_itemset_cnt[item_idx].itemset_cnt = item_cnt;
    //        dk_freq_itemset_cnt[item_idx].itemset_id = item_idx;
    //    }
    //}
}[/codebox]
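When I strip get_1itemset_Count6 down, the part that misbehaves is nothing more than this pattern (a reduced sketch of what I am effectively testing; d_in and d_out are placeholder names):
[codebox]// reduced version of the failing first load: read one int per thread,
// barrier, then write it back out so the host can check it (placeholder names)
__global__ void load_sync_writeback(int *d_in, int *d_out)
{
    extern __shared__ int sdata[];   // needs blockDim.x * sizeof(int) bytes at launch
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = d_in[idx];  // coalesced: thread k reads word k
    __syncthreads();                 // after this the whole tile should be valid
    d_out[idx] = sdata[threadIdx.x];
}
// e.g. load_sync_writeback<<<gridDim, blockDim, blockDim.x * sizeof(int)>>>(d_in, d_out);[/codebox]
One thing I am double-checking on my side: the third <<<...>>> launch parameter is the dynamic shared memory size in bytes, not in ints, so a tile of blockDim.x ints needs blockDim.x * sizeof(int) bytes.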
[codebox]main:
//get_1itemset_Count3<<<gridDim, blockDim, DBASE_MAX_TRAN_LEN+100>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, DBASE_MAX_TRAN_LEN+4, d_buffer, d_itemsets_cnt_struct); //DBASE_MAX_TRAN_LEN+4
//get_1itemset_Count5<<<gridDim, blockDim, DBASE_MAX_TRAN_LEN+100>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, DBASE_MAX_TRAN_LEN+4, d_buffer, d_itemsets_cnt_struct); //DBASE_MAX_TRAN_LEN+4
get_1itemset_Count6<<<gridDim, blockDim, 256>>>( 0, DBASE_MAXITEM, DBASE_NUM_TRANS, d_buffer, d_itemsets_cnt_struct);
[/codebox]
The only difference in main between the two runs is which kernel call is commented out.
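For completeness, gridDim and blockDim are set up along these lines (illustrative values only; the real numbers depend on DBASE_MAXITEM):
[codebox]dim3 blockDim(64, 1); // the blockDim(64, 1) case mentioned above
dim3 gridDim((DBASE_MAXITEM + blockDim.x - 1) / blockDim.x, 1); // one thread per candidate item[/codebox]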