Weird problem with cublasZscal

I tried to normalize a vector with the cublasZscal function. However, the result is quite odd.

It seems that cublasZscal has a problem.

I defined one vector, d_a, with N = 65536 elements, and scaled it by a complex constant inside a loop:

[codebox]

#include <math.h>
#include <stdio.h>

#include <cublas.h>

/* Element type of every vector in this program; the code below reads and
 * writes its two double components through .x and .y. */
typedef double2 Complex;

/*
 * Reproducer for a suspected cublasZscal problem (legacy cuBLAS API).
 *
 * For each of 20 iterations the program:
 *   1. fills a host vector h_a with N = 256 * 256 complex values and computes
 *      the expected scaled vector h_cpu on the CPU,
 *   2. uploads h_a to the device buffer d_a,
 *   3. scales d_a in place by the complex constant f with cublasZscal,
 *   4. downloads the result back into h_a and writes one line to the report
 *      file for every element that differs from the CPU reference by more
 *      than the absolute tolerance 0.01 in either component.
 *
 * Returns 0 when the run completes (whether or not mismatches were found),
 * and 1 if the report file cannot be opened or any CUDA / cuBLAS call fails.
 */
int main(int argc, char **argv)
{
    const int    N          = 256 * 256;
    const int    iterations = 20;
    const double tolerance  = 0.01;

    /* Everything is declared and initialised up front so that the
     * `goto cleanup` error path never jumps over an initialisation. */
    int          i, m;
    int          rc          = 1;
    int          cublas_up   = 0;
    long         mismatches  = 0;
    cudaError_t  cuda_err    = cudaSuccess;
    cublasStatus status      = CUBLAS_STATUS_SUCCESS;
    Complex      f;
    Complex     *h_a         = NULL;
    Complex     *h_cpu       = NULL;
    Complex     *d_a         = NULL;
    FILE        *fp          = NULL;

    (void) argc;
    (void) argv;

    /* The posted code wrote to `fp` without ever declaring or opening it.
     * NOTE(review): the report file name below is a placeholder - adjust it
     * to wherever the mismatch report should go. */
    fp = fopen("cublasZscal_mismatches.txt", "w");
    if (fp == NULL)
    {
        perror("fopen");
        return 1;
    }

    cuda_err = cudaMallocHost((void **) &h_a, (size_t) N * sizeof(Complex));
    if (cuda_err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocHost(h_a) failed: %s\n", cudaGetErrorString(cuda_err));
        goto cleanup;
    }

    cuda_err = cudaMallocHost((void **) &h_cpu, (size_t) N * sizeof(Complex));
    if (cuda_err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocHost(h_cpu) failed: %s\n", cudaGetErrorString(cuda_err));
        goto cleanup;
    }

    cuda_err = cudaMalloc((void **) &d_a, (size_t) N * sizeof(Complex));
    if (cuda_err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc(d_a) failed: %s\n", cudaGetErrorString(cuda_err));
        goto cleanup;
    }

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasInit failed (status %d)\n", (int) status);
        goto cleanup;
    }
    cublas_up = 1;

    /* Double literals: the data is double precision, so a float literal
     * would only inject float rounding error into the test values. */
    f.x = 0.3;
    f.y = 0.0;

    for (m = 0; m < iterations; m++)
    {
        for (i = 0; i < N; i++)
        {
            h_a[i].x = (double) i;
            h_a[i].y = 0.2 * i;

            /* Full complex product a * f, so the reference stays correct
             * even if f is later given a non-zero imaginary part. */
            h_cpu[i].x = h_a[i].x * f.x - h_a[i].y * f.y;
            h_cpu[i].y = h_a[i].x * f.y + h_a[i].y * f.x;
        }

        status = cublasSetVector(N, sizeof(Complex), h_a, 1, d_a, 1);
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "loop %d: cublasSetVector failed (status %d)\n", m, (int) status);
            goto cleanup;
        }

        //for(int k = 0 ; k < 256 ; k++)
        //      cublasZscal(256, f, &(d_a[256*k]), 1);

        /* Legacy cublasZscal returns void; its status is fetched separately. */
        cublasZscal(N, f, d_a, 1);
        status = cublasGetError();
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "loop %d: cublasZscal failed (status %d)\n", m, (int) status);
            goto cleanup;
        }

        /* Surface any asynchronous execution error before reading back. */
        cuda_err = cudaDeviceSynchronize();
        if (cuda_err != cudaSuccess)
        {
            fprintf(stderr, "loop %d: device error after cublasZscal: %s\n",
                    m, cudaGetErrorString(cuda_err));
            goto cleanup;
        }

        status = cublasGetVector(N, sizeof(Complex), d_a, 1, h_a, 1);
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "loop %d: cublasGetVector failed (status %d)\n", m, (int) status);
            goto cleanup;
        }

        for (i = 0; i < N; i++)
        {
            /* Check both components; the posted code only compared .x, so a
             * wrong imaginary part would have gone unreported. */
            if (fabs(h_a[i].x - h_cpu[i].x) > tolerance ||
                fabs(h_a[i].y - h_cpu[i].y) > tolerance)
            {
                fprintf(fp, "loop : %d, idx : %d, (%lf, %lf) should be (%lf, %lf)\n", m, i, h_a[i].x, h_a[i].y, h_cpu[i].x, h_cpu[i].y);
                mismatches++;
            }
        }
    }

    printf("%ld mismatching element(s) over %d iteration(s)\n", mismatches, iterations);
    rc = 0;

cleanup:
    /* Release only what was actually acquired; reached on success and on
     * every failure path above. */
    if (cublas_up)
        cublasShutdown();

    if (h_a != NULL)
        cudaFreeHost(h_a);

    if (h_cpu != NULL)
        cudaFreeHost(h_cpu);

    if (d_a != NULL)
        cudaFree(d_a);

    fclose(fp);

    return rc;
}

[/codebox]

****** Update: after reinstalling the driver, the problem was solved.