Hello dear CUDA community,
I am quite new to CUDA programming, but I am making progress. However, when I tried to use the cuBLAS function cublasCgerc_v2 in my program, the results were more or less total rubbish.
Assume a matrix A with zeros only, a vector x = [1 … 8] and a vector y = [1 … 7]. Everything is complex valued, but with imaginary part 0.
Then, A should be updated with
A = A + x*y**H
Updating the matrix A with cublasCgerc_v2 should give me a matrix:
1 2 3 4 5 6 7
2 4 6 8 10 12 14
3 6 9 12 15 18 21
4 8 12 16 20 24 28
5 10 15 20 25 30 35
6 12 18 24 30 36 42
7 14 21 28 35 42 49
8 16 24 32 40 48 56
However, I am not able to get there.
An excerpt of my code is:
size_t M = 8;
size_t N = 7;
CudaAECComplex alpha;
alpha.x = 1.0;
alpha.y = 0.0;
CudaAECComplex *x,*y,*A;
CudaAECComplex *dx,*dy,*dA;
x = new CudaAECComplex[M];
y = new CudaAECComplex[N];
A = new CudaAECComplex[M*N];
cudaMalloc((void**)&dx, M*sizeof(CudaAECComplex));
cudaMalloc((void**)&dy, N*sizeof(CudaAECComplex));
cudaMalloc((void**)&dA, M*N*sizeof(CudaAECComplex));
for(unsigned int i=0;i<M;++i) {
x[i].x = i+1;
x[i].y = 0;
}
for(unsigned int i=0;i<N;++i) {
y[i].x = i+1;
y[i].y = 0;
}
for(unsigned int i=0;i<M*N;++i) {
A[i].x = 0;
A[i].y = 0;
}
cudaMemcpy(dx, (void**)x, M*sizeof(CudaAECComplex), cudaMemcpyHostToDevice);
cudaMemcpy(dy, (void**)y, N*sizeof(CudaAECComplex), cudaMemcpyHostToDevice);
cudaMemcpy(dA, (void**)A, M*N*sizeof(CudaAECComplex), cudaMemcpyHostToDevice);
size_t status = cublasCgerc_v2(m_cublasHandle, M, N, &alpha, dx, 1, dy, 1, dA, M);
std::cout << "Status: " << status << std::endl;
cudaMemcpy((void**)A, dA, M*N*sizeof(CudaAECComplex), cudaMemcpyDeviceToHost);
for(unsigned int i=0;i<M;++i) {
for(unsigned int j=0;j<N;++j) {
std::cout << "("<< A[i+j*N].x << "," << A[i+j*N].y << ") ";
}
std::cout << std::endl;
}
cudaFree(dx);
cudaFree(dy);
cudaFree(dA);
delete[] x;
delete[] y;
delete[] A;
I guess there is a very dumb error somewhere in there, but I was not able to locate it. Any help is deeply appreciated.
Best regards, floxtrott