Greetings,
I am learning CUDA and I have developed some codes successfully but now working with complex numbers I am having some problem while using cudaMemcpy instruction. In the following code I try to multiply two matrices, A and B, they are created in the host but when checking its value in the kernel it is zero. Would you please tell me why this is happening?.
Thanks in advance.
My code:
//#include <complex> // not necessary for this code
#include <iostream>
#include <cmath>
//#include "cuda.h" // not necessary when compiling with nvcc
#include "math.h"
#include "cuComplex.h"
#define N 2
using namespace std;
/*#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}*/
__global__ void func(cuDoubleComplex *A, cuDoubleComplex *B, cuDoubleComplex *C) {
int COL = blockIdx.x*blockDim.x + threadIdx.x; /* blockIdx.x; */
int ROW = blockIdx.y*blockDim.y + threadIdx.y; /* blockIdx.x; */
cuDoubleComplex Tmp = make_cuDoubleComplex(0, 0);
cuDoubleComplex Mult = make_cuDoubleComplex(0, 0);
if (ROW < N && COL < N) {
for (int i = 0; i < N; i++) {
Mult = cuCmul(A[ROW * N + i], B[i * N + COL]);
printf("\n Row %d, Col %d", ROW, COL);
printf("\n B = %d", B[i*N+COL].y);
printf("\n N %d", N);
printf("\n Mult %d, %d", Mult.x, Mult.y);
/*cout << "\nMult " << i << "= " << cuCreal(Mult) << ","<< cuCimag (Mult)*/;
Tmp = cuCadd(Tmp, Mult);
printf("\n Tmp %d, %d", Tmp.x, Tmp.y);
}
}
C[ROW * N + COL] = Tmp; /*Por la forma como se linealiza la matriz*/
}
int main(void) {
cuDoubleComplex *dev_A, *dev_B, *dev_C;
cuDoubleComplex A[N*N], B[N*N], C[N*N];
A[0] = make_cuDoubleComplex(1, 2);
A[1] = make_cuDoubleComplex(3, 0);
A[2] = make_cuDoubleComplex(0, 2);
A[3] = make_cuDoubleComplex(0, 6);
B[0] = make_cuDoubleComplex(0, 0);
B[1] = make_cuDoubleComplex(0, 2);
B[2] = make_cuDoubleComplex(5, 3);
B[3] = make_cuDoubleComplex(2, -8);
for (int i = 0; i < N*N; i++) {
cout << "\nA cout = " << cuCreal(A[i]) << ", " << cuCimag(A[i]);
}
for (int i = 0; i < N*N; i++) {
cout << "\nB cout = " << cuCreal(B[i]) << ", " << cuCimag(B[i]);
}
// allocate the memory on the GPU
cudaMalloc(&dev_A, (N*N) * sizeof(cuDoubleComplex));
cudaMalloc(&dev_B, (N*N) * sizeof(cuDoubleComplex));
cudaMalloc(&dev_C, (N*N) * sizeof(cuDoubleComplex));
cudaMemcpy(dev_A, A, (N*N) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
cudaMemcpy(dev_B, B, (N*N) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
dim3 blocksPerGrid(1, 1, 1);
dim3 threadsPerBlock(N, N, 1);
func << <blocksPerGrid, threadsPerBlock >> >(dev_A, dev_B, dev_C);
cudaThreadSynchronize();
for (int i = 0; i < N*N; i++) {
cout << "\nCALC cout = " << C[i].x << ", " << C[i].y;
}
/* free device buffer */
cudaFree(dev_A);
cudaFree(dev_B);
cudaFree(dev_C);
return 0;
}