In short: i have a function (including cudaDeviceSynchronize) and its performance is affected by whether i reset the device before the function is called or not. instead of reset stopping the cpu thread for 2 sec with usleep() also works.
Are there internal hidden operations in the cuBLAS/GPU even after cudaDeviceSynchronize() ???
Note: I am not sure whether it is cuBLAS related or not and I use SDK 5.0.
In detail:
I have 2 streams, working parallel on dgemm operations. According to the broadcasted seminar i issue the operations in the correct order: “for each stream only one operation” . Check the attached profile, kernel overlapping is reached.
What dgemm operations to be done is in a list called “records”. I have several list, so i iterate over them something like this:
for(int idx=0;...)
{
... get records ...
if(idx==9) usleep(2000*1000);
/// if(idx==9)cudaDeviceReset();
float elapsedTime;unsigned long long int iFlop;
test_cuda_stream_nocom_mod(2,a_max,records,elapsedTime,iFlop);
}
And as you seee, i made eg a sleep before the 9th function call.
And here is the function:
int test_cuda_stream_nocom_mod(unsigned int MAC,
unsigned int iMaxDim,
std::vector<std::vector<int> >& records,
float& elapsedTime,
unsigned long long int& iFlop)
{
....
cudaDeviceSynchronize();
double t1 = omp_get_wtime();
for(int z=0;z<20; ++z) {MAC=MAC_orig;double tje = omp_get_wtime();
for(unsigned int i=0;i<records.size(); i+=MAC)
{
MAC=std::min( unsigned (records.size()-i),MAC);
for(unsigned int streamid=0;streamid<MAC; streamid++)
{
m[streamid] = records[i+streamid][0];
n[streamid] = records[i+streamid][1];
p[streamid] = records[i+streamid][2];
q[streamid] = records[i+streamid][3];
s1[streamid] = records[i+streamid][4];
s2[streamid] = records[i+streamid][5];
betaoff[streamid] = records[i+streamid][8];
tran1[streamid] = CUBLAS_OP_N;
/// at first op, it is tricky, row->col major transfer
if(records[i+streamid][6]==111) tran1[streamid] = CUBLAS_OP_T;
tran2[streamid] = CUBLAS_OP_N;
if(records[i+streamid][7]==112) tran2[streamid] = CUBLAS_OP_T;
iFlop += m[streamid]*n[streamid]*q[streamid]*2;
iFlop += m[streamid]*q[streamid]*p[streamid]*2;
}
for(unsigned int streamid=0;streamid<MAC; streamid++)
{
cublasSetStream(handle,(streams[streamid]));
cublasDgemm(handle,tran1[streamid], CUBLAS_OP_T,
m[streamid], q[streamid], n[streamid], d_scalar, d_A, s1[streamid],
d_X, q[streamid], d_scalar+1, d_T+MAC*streamid, m[streamid]);
}
for(unsigned int streamid=0;streamid<MAC; streamid++)
{
cublasSetStream(handle,(streams[streamid]));
cublasDgemm(handle,tran2[streamid], CUBLAS_OP_T,
p[streamid], m[streamid], q[streamid], d_scalar, d_B, s2[streamid],
d_T+MAC*streamid , m[streamid], d_scalar+betaoff[streamid],
d_Y+MAC*streamid, p[streamid]);
}
}
cudaDeviceSynchronize();
std::cout<<"time_final="<<(omp_get_wtime()-tje)*1000<<"\t";
}
}
So clearly there is a cudaDeviceSynchronize() before i start to measure time inside the function, so ouside world should not affect the measured time! Right?
Here are the result:
with sleep()
time_final=4.02432 time_final=3.98754 time_final=3.96998
time_final=3.98191 time_final=3.98908 time_final=4.04792
time_final=4.04767 time_final=4.05209 time_final=4.03403
time_final=4.0131 time_final=4.04794 time_final=4.00167
time_final=4.0529 time_final=4.04246 time_final=4.04557
time_final=4.04071 time_final=4.04703 time_final=4.04958
time_final=4.04384 time_final=4.04707 time_final=80.6137
Without sleep()
time_final=4.32178 time_final=4.31352 time_final=4.28114
time_final=4.2951 time_final=4.28662 time_final=4.3073
time_final=4.33142 time_final=4.30037 time_final=4.28163
time_final=4.29431 time_final=4.27964 time_final=4.2852
time_final=4.31007 time_final=4.26812 time_final=4.25016
time_final=4.30884 time_final=4.31049 time_final=4.16662
time_final=4.30808 time_final=4.27037 time_final=85.8138
What’s happening?
Thanks