It is not clear what the question is. CUDA is a subset of C++, so

(1) You can create the same kind of data structures on the device that you create on the host, including an array of pointers to arrays of elements of type `T`.

(2) You can copy data structures such as an array of pointers to arrays of elements of type `T` between host and device just as you normally would in C++: by performing a deep copy.

Some people like to represent 2D matrices as an array of pointers to 1D vectors of `T` (not something I would advise, for performance reasons). Here is an example of how such a matrix can be moved between host and device. Your use case seems closely related.

```
#include <cstdio>
#include <cstdlib>
#include <complex>
#include "cuComplex.h"
/* Matrix dimensions used by both host and device code: N row pointers,
 * each pointing to a separately allocated row of M elements. */
#define N (2)
#define M (3)
/* Host-side element type. The kernel reads the same bytes as cuComplex, so
 * this example relies on both types having identical size and layout
 * (two consecutive floats: real part, then imaginary part). */
typedef std::complex<float> T;
/*
 * Debug kernel: prints an N x M complex matrix stored in device memory as an
 * array of N row pointers, each pointing to M cuComplex elements.
 *
 * mat - device pointer to the array of device row pointers.
 *
 * Every thread that executes this kernel walks the whole matrix and prints
 * it, so the example launches it with a single thread (<<<1,1>>>).
 * Device-side printf is slow and intended for debugging only.
 */
__global__ void print_device_matrix (cuComplex** mat)
{
    printf ("matrix on device:\n");
    int row = 0;
    while (row < N) {
        /* fetch the row pointer once instead of re-indexing mat[row] */
        const cuComplex *cur = mat[row];
        for (int col = 0; col != M; ++col) {
            const cuComplex z = cur[col];
            printf ("(%f, %f) ", cuCrealf (z), cuCimagf (z));
        }
        printf ("\n");
        ++row;
    }
}
/* Abort with a diagnostic if a CUDA runtime call does not return cudaSuccess.
 * Unchecked failures are sticky and make every later call fail mysteriously. */
#ifndef CHECK_CUDA_CALL
#define CHECK_CUDA_CALL(call)                                              \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf (stderr, "CUDA error at %s:%d: %s\n", __FILE__,        \
                     __LINE__, cudaGetErrorString (status_));              \
            exit (EXIT_FAILURE);                                           \
        }                                                                  \
    } while (0)
#endif

/* malloc() wrapper that terminates the program instead of returning NULL. */
static void *checked_host_alloc (size_t bytes)
{
    void *p = malloc (bytes);
    if (p == NULL) {
        fprintf (stderr, "host allocation of %zu bytes failed\n", bytes);
        exit (EXIT_FAILURE);
    }
    return p;
}

/*
 * Demonstrates a deep copy of a pointer-to-rows "matrix" from host to device:
 *   1. build and print an N x M matrix of T on the host,
 *   2. allocate one device buffer per row plus a device array of row pointers,
 *   3. copy the row-pointer table and then each row's data to the device,
 *   4. print the matrix from a kernel,
 *   5. release host and device storage.
 *
 * Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE as soon as any
 * host allocation, CUDA runtime call, or the kernel launch fails.
 */
int main (void)
{
    /* allocate host "matrix" */
    T **mat = (T**)checked_host_alloc (N * sizeof (mat[0]));
    for (int i = 0; i < N; i++) {
        mat[i] = (T *)checked_host_alloc (M * sizeof (mat[0][0]));
    }

    /* fill in host "matrix" */
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            mat[i][j] = T (float(i)+1, float(j)+1);
        }
    }

    /* print host "matrix" */
    printf ("matrix on host:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < M; j++) {
            printf ("(%f, %f) ", real(mat[i][j]), imag(mat[i][j]));
        }
        printf ("\n");
    }

    /* allocate device "matrix": tmp is a HOST array holding DEVICE row
       pointers; it must never be dereferenced twice on the host */
    T **tmp = (T**)checked_host_alloc (N * sizeof (tmp[0]));
    for (int i = 0; i < N; i++) {
        CHECK_CUDA_CALL (cudaMalloc ((void **)&tmp[i], M * sizeof (tmp[0][0])));
    }
    cuComplex **matD = 0;
    CHECK_CUDA_CALL (cudaMalloc ((void **)&matD, N * sizeof (matD[0])));

    /* copy "matrix" from host to device: first the table of row pointers,
       then the contents of each row (the deep copy) */
    CHECK_CUDA_CALL (cudaMemcpy (matD, tmp, N * sizeof (matD[0]),
                                 cudaMemcpyHostToDevice));
    for (int i = 0; i < N; i++) {
        CHECK_CUDA_CALL (cudaMemcpy (tmp[i], mat[i], M * sizeof (matD[0][0]),
                                     cudaMemcpyHostToDevice));
    }
    free (tmp);

    /* print device "matrix" */
    print_device_matrix<<<1,1>>> (matD);
    /* a launch returns no status: pick up launch-configuration errors here */
    CHECK_CUDA_CALL (cudaGetLastError ());
    /* wait for the kernel so execution faults surface and printf is flushed */
    CHECK_CUDA_CALL (cudaDeviceSynchronize ());

    /* free host "matrix" */
    for (int i = 0; i < N; i++) {
        free (mat[i]);
    }
    free (mat);

    /* free device "matrix": the row pointers live in device memory, so they
       are copied back to the host before each row can be released */
    tmp = (T**)checked_host_alloc (N * sizeof (tmp[0]));
    CHECK_CUDA_CALL (cudaMemcpy (tmp, matD, N * sizeof (matD[0]),
                                 cudaMemcpyDeviceToHost));
    for (int i = 0; i < N; i++) {
        CHECK_CUDA_CALL (cudaFree (tmp[i]));
    }
    free (tmp);
    CHECK_CUDA_CALL (cudaFree (matD));
    return EXIT_SUCCESS;
}
```