CudaMemcpy is not working in my code.

Greetings,
I am learning CUDA and I have developed some codes successfully but now working with complex numbers I am having some problem while using cudaMemcpy instruction. In the following code I try to multiply two matrices, A and B, they are created in the host but when checking its value in the kernel it is zero. Would you please tell me why this is happening?.

Thanks in advance.

My code:

//#include <complex>  // not necessary for this code
#include <iostream>
#include <cmath>
//#include "cuda.h"  // not necessary when compiling with nvcc
#include "math.h"
#include "cuComplex.h"

#define N   2

using namespace std;

/*#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
	if (code != cudaSuccess)
	{
		fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
		if (abort) exit(code);
	}
}*/

__global__ void func(cuDoubleComplex *A, cuDoubleComplex *B, cuDoubleComplex *C) {

	int COL = blockIdx.x*blockDim.x + threadIdx.x; /* blockIdx.x; */
	int ROW = blockIdx.y*blockDim.y + threadIdx.y; /* blockIdx.x; */
	cuDoubleComplex Tmp = make_cuDoubleComplex(0, 0);
	cuDoubleComplex Mult = make_cuDoubleComplex(0, 0);
	if (ROW < N && COL < N) {
		for (int i = 0; i < N; i++) {
		    Mult = cuCmul(A[ROW * N + i], B[i * N + COL]);
			printf("\n Row %d, Col %d", ROW, COL);
			printf("\n B = %d", B[i*N+COL].y);
			printf("\n N %d", N);
			printf("\n Mult %d, %d", Mult.x, Mult.y);
			/*cout << "\nMult  " << i << "= " << cuCreal(Mult) << ","<< cuCimag (Mult)*/;
			Tmp = cuCadd(Tmp, Mult);
			printf("\n Tmp %d, %d", Tmp.x, Tmp.y);
	
		}
	}
	
	C[ROW * N + COL] = Tmp; /*Por la forma como se linealiza la matriz*/
}
int main(void) {


	cuDoubleComplex *dev_A, *dev_B, *dev_C;
	cuDoubleComplex A[N*N], B[N*N], C[N*N];
	

	A[0] = make_cuDoubleComplex(1, 2);
	A[1] = make_cuDoubleComplex(3, 0);
	A[2] = make_cuDoubleComplex(0, 2);

	A[3] = make_cuDoubleComplex(0, 6);
	B[0] = make_cuDoubleComplex(0, 0);
	B[1] = make_cuDoubleComplex(0, 2);
	B[2] = make_cuDoubleComplex(5, 3);
	B[3] = make_cuDoubleComplex(2, -8);
	for (int i = 0; i < N*N; i++) {
		cout << "\nA cout = " << cuCreal(A[i]) << ", " << cuCimag(A[i]);
			}
	for (int i = 0; i < N*N; i++) {
		cout << "\nB cout = " << cuCreal(B[i]) << ", " << cuCimag(B[i]);
	}

	// allocate the memory on the GPU
	cudaMalloc(&dev_A, (N*N) * sizeof(cuDoubleComplex));
	cudaMalloc(&dev_B, (N*N) * sizeof(cuDoubleComplex));
	cudaMalloc(&dev_C, (N*N) * sizeof(cuDoubleComplex));
	
	cudaMemcpy(dev_A, A, (N*N) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_B, B, (N*N) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
	
	dim3 blocksPerGrid(1, 1, 1);
	dim3 threadsPerBlock(N, N, 1);

	func << <blocksPerGrid, threadsPerBlock >> >(dev_A, dev_B, dev_C);
	cudaThreadSynchronize();
		
	for (int i = 0; i < N*N; i++) {
		cout << "\nCALC cout = " << C[i].x << ", " << C[i].y;
	}

	/* free device buffer */
	cudaFree(dev_A);
	cudaFree(dev_B);
	cudaFree(dev_C);


	return 0;
}

%d is an incorrect printf format specifier when printing out a double quantity, you could use %f instead
That doesn’t have anything to do with CUDA, but is a requirement of C and C++

also, not relevant to your question, but you have no cudaMemcpy operation after your kernel call, so the printout of C won’t work.

Thank you very much Robert, you solved the problem. I wasn’t obtaining any value for B because of the %d print format.

Thanks again!!!