I have run some experiments comparing synchronous and asynchronous computation using dummy operations. Does anyone have an idea why they show unexpected behavior?
SJ-AJ: should not be negative.
SJ: synchronous
AJ: asynchronous
I found the following results:
Case 1: only the first thread of the first block does the computation
/*
 * Case 1 dummy workload: only thread 0 of block 0 performs arithmetic.
 *
 * For each of MAX_ITER outer iterations, that single thread runs an n x n
 * loop nest over in[] and stores the running accumulator to out[0..n-1].
 * Every thread of every block then calls __gpu_sync once per iteration.
 *
 * Parameters:
 *   goalVal - not read by this kernel body.
 *   Ain     - forwarded unchanged to __gpu_sync.
 *   Aout    - forwarded unchanged to __gpu_sync.
 *   in      - input values; elements 0..n-1 are read.
 *   out     - output values; elements 0..n-1 are written.
 *   n       - trip count of both the outer (j) and inner (k) work loops.
 *
 * NOTE(review): __gpu_sync and MAX_ITER are defined outside this snippet;
 * __gpu_sync is presumably an inter-block barrier - confirm against its
 * definition.
 */
__global__ void dummy(int goalVal, int *Ain, int *Aout, float *in, float *out, int n)
{
    for (int iter = 0; iter < MAX_ITER; iter++) {
        // Reset once per outer iteration only; the accumulator deliberately
        // carries over from one j to the next, as in the original code.
        float result = 0.0f;

        if (blockIdx.x == 0 && threadIdx.x == 0) {
            for (int j = 0; j < n; j++) {
                // The inner index was renamed from `i` so that it no longer
                // shadows the outer iteration counter.
                for (int k = 0; k < n; k++) {
                    result += in[k];
                    result *= in[k];
                    result -= in[k];
                }
                out[j] = result;
            }
        }

        // Reached by all threads, outside the divergent branch above.
        __gpu_sync(iter + 1, iter + 1, Ain, Aout);
    }
}
Results:
#Blocks = 128
N SJ AJ SJ-AJ
2 0.296896 0.25088 0.046016
4 0.52256 0.430016 0.092544
8 1.456 1.35696 0.09904
16 4.967968 4.90512 0.062848
32 18.72784 18.77392 -0.04608
64 73.10294 73.31844 -0.21549
128 284.7325 288.0857 -3.35312
256 1132.916 1151.862 -18.9462
512 4534.985 4584.027 -49.0415
1024 18015.97 18304.48 -288.512
Case 2: all threads do the computation
/*
 * Case 2 dummy workload: every thread performs arithmetic.
 *
 * For each of MAX_ITER outer iterations, a thread handles the output indices
 * j = threadIdx.x, threadIdx.x + blockDim.x, ... while j < n. For each such j
 * it runs a length-n loop over in[] and stores the running accumulator to
 * out[j]. blockIdx is not used, so every block executes the same iterations
 * and stores to the same out[j] locations. Every thread then calls
 * __gpu_sync once per outer iteration.
 *
 * Parameters:
 *   goalVal - not read by this kernel body.
 *   Ain     - forwarded unchanged to __gpu_sync.
 *   Aout    - forwarded unchanged to __gpu_sync.
 *   in      - input values; elements 0..n-1 are read.
 *   out     - output values; elements 0..n-1 are written.
 *   n       - bound of both the per-thread j loop and the inner k loop.
 *
 * NOTE(review): __gpu_sync and MAX_ITER are defined outside this snippet;
 * __gpu_sync is presumably an inter-block barrier - confirm against its
 * definition.
 */
__global__ void dummy(int goalVal, int *Ain, int *Aout, float *in, float *out, int n)
{
    const int tx = threadIdx.x;

    for (int iter = 0; iter < MAX_ITER; iter++) {
        // Reset once per outer iteration only; the accumulator deliberately
        // carries over between the j values handled by this thread.
        float result = 0.0f;

        for (int j = tx; j < n; j += blockDim.x) {
            // The inner index was renamed from `i` so that it no longer
            // shadows the outer iteration counter.
            for (int k = 0; k < n; k++) {
                result += in[k];
                result *= in[k];
                result -= in[k];
            }
            out[j] = result;
        }

        __gpu_sync(iter + 1, iter + 1, Ain, Aout);
    }
}
Results:
#Blocks = 128
N SJ AJ SJ-AJ
2 0.264864 0.183168 0.081696
4 0.309792 0.17072 0.139072
8 0.375968 0.23776 0.138208
16 0.51632 0.380864 0.135456
32 0.806944 0.687392 0.119552
64 2.309824 1.301312 1.008512
128 6.785312 2.610272 4.17504
256 14.73555 9.168864 5.566688
512 40.94973 35.81248 5.137245
1024 148.5331 142.7693 5.763779