disappointing CUDA BLAS performance

I have a simple program to compare the performance of cuBLAS with my own BLAS routines. One example is “cublasDsyr”. My test vector size is 512. My kernel is a simple unoptimized one, with 512 blocks of 512 threads (which means only N*(N+1)/2 of the threads will be active). But even with this kernel the timing for 1000 iterations is 60 ms, vs. 81 ms for “cublasDsyr”. With a little optimization (using 256 threads) my kernel takes 50 ms. I cannot believe that a naive algorithm like mine performs better than cuBLAS. So what am I missing?

Thanks for any help.

main()

{</P> <P>int i = 1000;</P> <P>while(i--){

#if CUBLAS

	cublasDsyr('u',512, 1, Vec, 1, Matrix, 512);

#else

	MyDsyr<<<512, 512>>>(Vec, Matrix, 1);</P> <P>#endif 

}

cudaThreadSynchronize();

}</P> <P>

__global__ void MyDsyr(const double *vec,  double *matrix, const double alfa)

{

__shared__ double vv;	//broadcast?</P> <P>//upper half calculation

int bx= blockIdx.x* N + threadIdx.x;</P> <P> if(threadIdx.x == 0){

  vv = vec[blockIdx.x]*alfa;

 }

 __syncthreads();</P> <P> if(threadIdx.x <= blockIdx.x){

  matrix[ bx] += vv*vec[threadIdx.x];

 }

 __syncthreads();

}

[/font][/font]

Sorry for the messed-up code in my original question. I still don’t know how to use this forum’s formatting tool, so I’ll just paste the code directly into this post until I figure it out.

main()
{

int i = 1000;

while(i–){
#if CUBLAS
cublasDsyr(‘u’,512, 1, Vec, 1, Matrix, 512);
#else
MyDsyr<<<512, 512>>>(Vec, Matrix, 1);

#endif
}
cudaThreadSynchronize();
}

global void MyDsyr(const double *vec, double *matrix, const double alfa)
{
shared double vv; //broadcast?

//upper half calculation
int bx= blockIdx.x* N + threadIdx.x;

if(threadIdx.x == 0){
vv = vec[blockIdx.x]*alfa;
}
__syncthreads();

if(threadIdx.x <= blockIdx.x){
matrix[ bx] += vv*vec[threadIdx.x];
}
__syncthreads();
}