I am using cuBLAS to do some computation. The vector sizes I tried are in the range 1000-3000, and the matrix sizes are 1000^2-3000^2. The main program runs in a loop; when the difference between two vectors (which are updated on each iteration) falls below some tolerance, the loop exits. I am using a GT 9400 now, and for vector size 1024 the elapsed time measured by the following code is 0.0160 s, while the same computation in MATLAB takes 0.031 s. I would have hoped the GPU gave more of an improvement.
The program starts by computing some matrices:
AB=Cb
DEFG=Cf
//Where A, B, D, E, F, G depend only on the size of the vector. After I have obtained Cb and Cf, A through G are destroyed.
Since Cf and Cb will be used in the loop many times, I considered declaring them as `__constant__ float*`, but it turns out that the arguments to the cuBLAS functions cannot be constant-memory variables, so I had to declare them as plain `float*`.
C*m=Yold;
//The following is the loop, with the kernel functions defined at the end. All variables whose names start with d_ are device variables, as required by cuBLAS. This also reduces the time spent transferring data back to the CPU.
cudaThreadSynchronize();
start_time=clock();
for(iterations=1;iterations<maxIteration;iterations++){
FunctionEvaluation<<<Block_Num,Thread_Num,0>>>(d_Yold,d_f);
// This function evaluate the function of each element of the vector, d_f(i)=f(d_Yold(i))
cublasSgemv('n', (N+1), (N+1), alpha, d_Cf, (N+1), d_f, 1, beta, d_B, 1); //Cf*f=B
ICcorrected<<<1,1,0>>>(d_B,d_f);///corrected for initial conditions, get d_f, corrected coefficients
// I need to add a constant to the first element of B (B[0]). Using one kernel doing this, I do not need to return back the data to CPU. But this sounds not an optimal way.
cublasSgemv('n', (N+1), (N+1), alpha, d_Cb, (N+1), d_f, 1, beta, d_Ynew, 1); //Cb*f=Ynew
yDifference<<<Block_Num,Thread_Num,0>>>(d_Ynew,d_Yold,d_e);
//calculate the difference between two vectors
h_e2=cublasSdot (N+1, d_e, 1,d_e, 1);
//calculate the norm of the vector. If less than the tolerance, exist the for loop. h_e2 is a variable on CPU as cublasSdot defined. Will this degenerate the performance? I would like some way that the kernel code is running the loop, only exist for some conditions. But I do not know how to do this…
if(abs(h_e2)<deltaLimit){
break;//exist
}
else{
EvaluateArray<<<1,1,0>>>(d_B,d_f); //all the elments of B are the same as the elments in f in the same order.
EvaluateArray<<<1,1,0>>>(d_Yold,d_Ynew) ); //[b]all the elments of Yold are the same as the elments in Ynew in the same order. I tried using one kernel function instead of these two functions but it turned out requiring more time. There should be a good way to improve this part but I am not sure..[/b]
}
}
status = cublasGetVector(N+1, sizeof(h_Ynew[0]), d_Ynew, 1, h_Ynew, 1);
cudaThreadSynchronize();
end_time=clock();
timeelapsed=double(end_time- start_time)/CLK_TCK;
//I think this is the time used for the overall loop,
global static void FunctionEvaluation(float y, float f)
{
const int tid=threadIdx.x;
const int bid=blockIdx.x;
float time;
time=cos((bidThread_Num +tid)PI/N);
f[bidThread_Num +tid]=H/2cos(H/2time+H/2+epsilony[bid*Thread_Num +tid]);
}
global static void ICcorrected(float *Bold, float Bnew)
{
int i;
for(i=0;i<=N;i++)
Bnew[i]=Bold[i];
Bnew[0]=Bold[0]+2y0;///
}
global static void yDifference(float *Y1, float *Y2, float *e)
{
const int tid=threadIdx.x;
const int bid=blockIdx.x;
e[bid*Thread_Num +tid]=Y1[bid*Thread_Num +tid]-Y2[bid*Thread_Num +tid];
}
global static void EvaluateArray(float *Y1, float *Y2)
{
int i;
for(i=0;i<=N;i++){
Y1[i]=Y2[i]; }
}