cuBLAS regression with CUDA 3.2: cublasCdotc 17% slower?

Hi all,
Since upgrading to CUDA 3.2 (and the corresponding drivers), I have been seeing a
performance loss with cublasCdotc.
I'm using cuBLAS on a C1060.

Times with CUDA 3.1 (in milliseconds, as reported by cudaEventElapsedTime):

2^10 ==> 0.278656
2^11 ==> 0.183392
2^12 ==> 0.17536
2^13 ==> 0.175712
2^14 ==> 0.178944
2^15 ==> 0.182112
2^16 ==> 0.18688
2^17 ==> 0.197568
2^18 ==> 0.221216
2^19 ==> 0.26976
2^20 ==> 0.363808

Times with CUDA 3.2 (in milliseconds, as reported by cudaEventElapsedTime):

2^10 ==> 0.353632
2^11 ==> 0.196704
2^12 ==> 0.193696
2^13 ==> 0.192224
2^14 ==> 0.192672
2^15 ==> 0.19712
2^16 ==> 0.201376
2^17 ==> 0.230816
2^18 ==> 0.238784
2^19 ==> 0.28592
2^20 ==> 0.427104

As you can see, for vectors of 2^20 complex elements the operation is about 17% slower (0.427104 vs. 0.363808).

The test program is included below; it was compiled with:

g++ dotprod.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcublas

Regards
Gaetano Mendola

===========

#include <cstdlib>
#include <iostream>

#include <cublas.h>
#include <cuda_runtime_api.h>

int main() {

const size_t myPower = 20;
const size_t myVectorSize = 1 << myPower;

const cuComplex* x;
cudaMalloc((void**)&x, sizeof(cuComplex)*myVectorSize);

const cuComplex* y;
cudaMalloc((void**)&y, sizeof(cuComplex)*myVectorSize);

for (int i = 10; i <= myPower; ++i) {
cudaEvent_t start, stop;

cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
const cuComplex myResult = cublasCdotc(1<<i,
                                     x,
                                     1,
                                     y,
                                     1);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

float elapsedTime;

cudaEventElapsedTime(&elapsedTime, start, stop);

std::cout << "2^" << i << " ==> " << elapsedTime << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);

}

}