I tried to normalize a vector with the cublasZscal function, but the results are wrong.
It looks as if cublasZscal itself has a problem.
I define one vector, d_a, with N = 65536 elements,
and scale it by a complex constant inside a loop:
[codebox]#include <stdio.h>
#include <math.h>
#include <cublas.h>
/* Double-precision complex element; .x and .y are the two double components.
 * Values of this type are passed directly to cublasZscal below. */
typedef double2 Complex;
int main(int argc, char *argv)
{
int i, m;
int N = 256 * 256;
cublasStatus status;
Complex f;
Complex *h_a, *h_cpu;
Complex *d_a;
cudaMallocHost((void **) &h_a, N*sizeof(Complex));
cudaMallocHost((void **) &h_cpu, N*sizeof(Complex));
cudaMalloc((void **)&d_a, N* sizeof(Complex));
cublasInit();
f.x = 0.3f;
f.y = 0.0f;
for(m = 0 ; m < 20 ; m++)
{
for(i = 0 ; i < N ; i++)
{
h_a[i].x = i;
h_a[i].y = 0.2f * i;
h_cpu[i].x = (h_a[i].x) * (f.x);
h_cpu[i].y = (h_a[i].y) * (f.x);
}
cublasSetVector(N, sizeof(Complex), h_a, 1, d_a, 1);
cudaThreadSynchronize();
//for(int k = 0 ; k < 256 ; k++)
// cublasZscal(256, f, &(d_a[256*k]), 1);
cublasZscal(N, f, d_a, 1);
cudaThreadSynchronize();
cublasGetVector(N, sizeof(Complex), d_a, 1, h_a, 1);
cudaThreadSynchronize();
for(i = 0 ; i < N ; i++)
{
if( fabs(h_a[i].x - h_cpu[i].x) > 0.01)
fprintf(fp, "loop : %d, idx : %d, (%lf, %lf) should be (%lf, %lf)\n", m, i, h_a[i].x, h_a[i].y, h_cpu[i].x, h_cpu[i].y);
}
}
cublasShutdown();
cudaFreeHost(h_a);
cudaFreeHost(h_cpu);
cudaFree(d_a);
fclose(fp);
}
[/codebox]
****** After reinstalling the driver, the problem was solved.