Thanks everyone, here is my kernel,
When I run this as a single kernel(by default) with <<<8,256>>>, execution time : 8.40ms
when I run 2 kernels with two streams with <<<8,256>>>, execution time : 8.56ms
when I run 4 kernels with 4 streams with <<<8,256>>>, execution time : 9.10ms
Why is this not running concurrently? My GPU has compute capability 2.1, and I have tested concurrent kernel execution with simpler kernels; those do run concurrently.
The “status” buffer is in shared memory; sh1_tbl, temper_tbl, … and all the other tables are in constant memory.
Please have a look into this and let me know your views, Thank you so much…
/*
 * MTGP32 pseudorandom number generation kernel: one MTGP generator per block.
 *
 * Grid/block layout: THREAD_NUM threads per block cooperate on a shared
 * "status" (state) array of LARGE_SIZE (= 3 * THREAD_NUM) words. The
 * parameter tables (pos_tbl, sh1_tbl, sh2_tbl, param_tbl, temper_tbl, mask)
 * are stated to live in __constant__ memory (declared elsewhere in the file).
 *
 * d_status: per-generator state in global memory; read at entry and written
 *           back at exit so the next launch continues the sequence.
 * d_data:   output buffer; each block emits 'size' tempered 32-bit values.
 * size:     outputs per block; the loop advances in LARGE_SIZE steps, so
 *           size should be a multiple of LARGE_SIZE.
 * abid:     block-index offset so several launches (e.g. one per stream)
 *           can each drive a distinct slice of the generator tables.
 *
 * Fixes vs. the posted paste (forum formatting mangled the code):
 *  - "global" restored to "__global__" (underscores were eaten).
 *  - __shared__ declaration of 'status' restored (L6 of the post says it is
 *    in shared memory, but the declaration was lost in the paste).
 *  - "d_data = r ^ ..." assigned the result to the POINTER instead of
 *    storing an element; the subscripts are restored below.
 */
__global__ void MTkernel(mtgp32_kernel_status_t* d_status, uint32_t* d_data,
                         int size, int abid) {
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    // Global generator index: local block id plus the per-launch offset.
    const int actualBid = abid + bid;
    int pos = pos_tbl[actualBid];
    uint32_t r;

    // Block-shared copy of this generator's state.
    __shared__ uint32_t status[LARGE_SIZE];

    // copy status data from global memory to shared memory.
    status_read(status, d_status, actualBid, tid);

    // Main loop: each iteration produces LARGE_SIZE outputs in three
    // THREAD_NUM-wide recursion steps, with a barrier between steps because
    // every step reads state words written by other threads in the previous
    // step.
    for (int i = 0; i < size; i += LARGE_SIZE) {
        uint32_t Y;

        // --- step 1: recurrence updates status[0 .. THREAD_NUM-1] ---
        uint32_t X = (status[LARGE_SIZE - N + tid] & mask)
                   ^ status[LARGE_SIZE - N + tid + 1];
        X ^= X << sh1_tbl[actualBid];
        Y = X ^ (status[LARGE_SIZE - N + tid + pos] >> sh2_tbl[actualBid]);
        r = Y ^ param_tbl[actualBid][Y & 0x0f];
        status[tid] = r;
        // Temper and emit one output per thread.
        uint32_t T = status[LARGE_SIZE - N + tid + pos - 1];
        T ^= T >> 16;
        T ^= T >> 8;
        // NOTE(review): index assumes each launch's d_data points at that
        // launch's own output slice; if a single buffer is shared across the
        // streams, use actualBid instead of bid — confirm against host code.
        d_data[size * bid + i + tid] = r ^ temper_tbl[actualBid][T & 0x0f];
        __syncthreads();

        // --- step 2: updates status[THREAD_NUM .. 2*THREAD_NUM-1];
        //     reads wrap around the circular state buffer, hence % LARGE_SIZE.
        X = (status[(4 * THREAD_NUM - N + tid) % LARGE_SIZE] & mask)
          ^ status[(4 * THREAD_NUM - N + tid + 1) % LARGE_SIZE];
        X ^= X << sh1_tbl[actualBid];
        Y = X ^ (status[(4 * THREAD_NUM - N + tid + pos) % LARGE_SIZE]
                 >> sh2_tbl[actualBid]);
        r = Y ^ param_tbl[actualBid][Y & 0x0f];
        status[tid + THREAD_NUM] = r;
        T = status[(4 * THREAD_NUM - N + tid + pos - 1) % LARGE_SIZE];
        T ^= T >> 16;
        T ^= T >> 8;
        d_data[size * bid + THREAD_NUM + i + tid] =
            r ^ temper_tbl[actualBid][T & 0x0f];
        __syncthreads();

        // --- step 3: updates status[2*THREAD_NUM .. 3*THREAD_NUM-1] ---
        X = (status[2 * THREAD_NUM - N + tid] & mask)
          ^ status[2 * THREAD_NUM - N + tid + 1];
        X ^= X << sh1_tbl[actualBid];
        Y = X ^ (status[2 * THREAD_NUM - N + tid + pos] >> sh2_tbl[actualBid]);
        r = Y ^ param_tbl[actualBid][Y & 0x0f];
        status[tid + 2 * THREAD_NUM] = r;
        T = status[tid + pos - 1 + 2 * THREAD_NUM - N];
        T ^= T >> 16;
        T ^= T >> 8;
        d_data[size * bid + 2 * THREAD_NUM + i + tid] =
            r ^ temper_tbl[actualBid][T & 0x0f];
        __syncthreads();
    }
    // write back status for next call
    status_write(d_status, status, actualBid, tid);
}