An error occurs when I use cuDoubleComplex with cuSOLVER

Hi,
I am using cuSOLVER to compute the eigenvalues of a Hermitian matrix. On the CPU (host) side I store the data as std::complex<double>, and on the GPU (device) side I use cuDoubleComplex. When I run it, it sometimes fails with a segmentation fault and sometimes with a cudaMemcpy error. How can I solve this? Here is the calling code:

// Test input: h and s are both the 3x3 identity matrix (real part 1 on the
// diagonal entries 0, 4 and 8, everything else zero).
std::complex<double> h[9] = {
    {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}
};
std::complex<double> s[9] = {
    {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}
};

// Output buffers: ev is passed as V (3x3 complex result matrix) and en is
// passed as W (real results; sized 9 here).
std::complex<double> ev[9];
double en[9];

// Arguments are (N, M, A, B, W, V).
dc.Dngvd_complex(3, 3, h, s, en, ev);

The cuSOLVER wrapper function is shown below:
void Diag_Cusolver_gvd::Dngvd_complex(int N, int M, std::complex *A, std::complex *B, double *W, std::complex *V){

// copy A, B to the GPU
    assert(N == M);
    if (M != m) {
        this->finalize();
        this->init_complex(M);
    }
    for (int i=0;i<M*N;i++)
        std::cout<<"the h matrix is "<<A[i]<<std::endl;
    checkCudaErrors( cudaMemcpy(d_A2, A, sizeof(cuDoubleComplex) * lda * m, cudaMemcpyHostToDevice) );
    checkCudaErrors( cudaMemcpy(d_B2, B, sizeof(cuDoubleComplex) * lda * m, cudaMemcpyHostToDevice) );

// Query working space of Zhegvd
// The helper functions below can calculate the sizes needed for pre-allocated buffer.
// The S and D data types are real valued single and double precision, respectively.
// The C and Z data types are complex valued single and double precision, respectively.
    checkCudaErrors( 
        cusolverDnZhegvd_bufferSize(
            cusolverH,
            itype,
            jobz,
            uplo,
            m,
            d_A2,
            lda,
            d_B2,
            lda,
            d_W,
            &lwork)
    );      
    checkCudaErrors( cudaMalloc((void**)&d_work2, sizeof(cuDoubleComplex)*lwork) );

// compute spectrum of (A,B)
    checkCudaErrors(
        cusolverDnZhegvd(
            cusolverH,
            itype,
            jobz,
            uplo,
            m,
            d_A2,
            lda,
            d_B2,
            lda,
            d_W,
            d_work2,
            lwork,
            devInfo)
    );
    checkCudaErrors( cudaDeviceSynchronize() );
    
// copy (W, V) to the cpu root
    checkCudaErrors( cudaMemcpy(W, d_W, sizeof(double)*m, cudaMemcpyDeviceToHost) );
    checkCudaErrors( cudaMemcpy(V, d_A2, sizeof(std::complex<double>)*lda*m, cudaMemcpyDeviceToHost) );
    checkCudaErrors( cudaMemcpy(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost) );
    assert(0 == info_gpu);

// free the buffer
    if (d_work2 ) checkCudaErrors( cudaFree(d_work2) );

}

The error output is shown below:

Where are you allocating d_A2 and A?

A is passed into the function as an argument, while d_A2 is allocated here: