cudaMemcpy size for structures

I have been having trouble specifying the right sizes for structures when calling cudaMemcpy(). Can anyone help me out with the following code?

#include <stdio.h>
#include <time.h>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <algorithm>

// Descriptor for one matrix: its dimensions plus a pointer to its element
// buffer. The struct does not own or size the buffer; callers allocate it.
struct Matrix{
    int width;       // set by the doThings kernel
    int height;      // set by the doThings kernel
    // Element buffer. In this listing it is assigned by cudaMalloc and
    // dereferenced inside the kernel, so the copy of the struct handed to the
    // kernel must hold a device-accessible pointer here.
    float* elements;
};

// Kernel: each thread picks one Matrix by its global 1-D index, sets its
// width to 5 and height to 4, writes 42 into the first element of its buffer
// and prints that element from the device.
//
// Launch layout: 1-D grid of 1-D blocks; idx = blockDim.x * blockIdx.x + threadIdx.x.
// Preconditions: `matrices` and every matrices[idx].elements are dereferenced
// in device code, so both must be device-accessible pointers.
// NOTE(review): there is no bounds check, so the launch must not supply more
// threads than there are entries in `matrices`.
__global__ void doThings(Matrix* matrices)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x; // global thread index == matrix index
    matrices[idx].width = 5;
    matrices[idx].height = 4;
    matrices[idx].elements[0] = 42;
    printf("matrices[%d].elements[0] : %.2f\n", idx, matrices[idx].elements[0]);
}

// Entry point of the question's program: allocates numMat Matrix records on
// the host, gives each record a cudaMalloc'd element buffer, copies the
// records to the device, launches doThings and copies the results back for
// printing.
//
// Only the closing braces that were missing from the listing are restored.
// The statements themselves are the asker's originals, so the problems being
// asked about are still present; they are flagged with NOTE(review) comments.
int main (void) {
    // NOTE(review): rows and cols are never assigned in this listing, yet
    // rows*cols sizes the cudaMalloc below.
    int rows, cols, numMat = 2; // These are actually determined at run-time
    Matrix* data = (Matrix*)malloc(numMat * sizeof(Matrix));
    Matrix* h_data = (Matrix*)malloc(numMat * sizeof(Matrix));
    // h_data starts as a byte copy of data (whose contents are not initialised
    // here); its elements pointers are overwritten by cudaMalloc below.
    memcpy(h_data, data, numMat * sizeof(Matrix));

    // NOTE(review): the bound is the literal 5, but data and h_data hold only
    // numMat (2) entries.
    for (int i=0; i<5; i++){

        cudaMalloc(&(h_data[i].elements), rows*cols*sizeof(float));
        // NOTE(review): the byte count is the size of the Matrix array, not the
        // rows*cols*sizeof(float) allocated just above. cudaMemcpy takes
        // (dst, src, count, kind); here the cudaMalloc'd pointer is passed as
        // the source although the kind is cudaMemcpyHostToDevice.
        cudaMemcpy(data[i].elements, h_data[i].elements, numMat*sizeof(Matrix), cudaMemcpyHostToDevice);

    }// matrix data is now on the gpu, now copy the "meta" data to gpu

    Matrix* d_data;
    cudaMalloc(&d_data, numMat*sizeof(Matrix));
    cudaMemcpy(d_data, h_data, numMat*sizeof(Matrix),  cudaMemcpyHostToDevice);
    // ... Do other things ...

    dim3 block (4);
    dim3 grid (1);

    // NOTE(review): `data` was obtained from malloc; the device copy of the
    // records is d_data. The launch uses 4 threads for numMat = 2 records.
    doThings <<< grid, block >>> (data);

    // NOTE(review): same literal bound and same byte count as the upload loop.
    for (int i=0; i<5; i++){
        cudaMemcpy(h_data[i].elements, data[i].elements, numMat*sizeof(Matrix), cudaMemcpyDeviceToHost);
    }

    cudaMemcpy(h_data, d_data, numMat*sizeof(Matrix),  cudaMemcpyDeviceToHost);

    for (int i=0; i<5; i++){
        // NOTE(review): %.2f is given h_data[i].elements, which is a float*,
        // not a float.
        printf("h_data[%d].elements[0] = %.2f\n", i, h_data[i].elements);
        printf("h_data[%d].width = %d\n", i, h_data[i].width);
        printf("h_data[%d].height = %d\n", i, h_data[i].height);
    }

    return (0);
}


Your main function should look more like this:

    // Host-side setup (body of main): `data` holds the host view of each
    // matrix, `h_data` is a host copy of the same records whose `elements`
    // pointers are replaced by device allocations.
    int rows, cols; // These are actually determined at run-time
    const int numMat = 2;
    const size_t allocSize = numMat * sizeof(Matrix);
    Matrix* data = (Matrix*)malloc( allocSize );
    Matrix* h_data = (Matrix*)malloc( allocSize );
    memcpy(h_data, data, allocSize );

    // rows and cols need to be set before this line. The first factor is
    // widened so the whole product is computed in size_t rather than int.
    const size_t elementSize = static_cast<size_t>(rows) * cols * sizeof(float);

    for( int i=0; i< numMat; i++ )
    {
        cudaMalloc(&(h_data[i].elements), elementSize );
        // cudaMemcpy takes (dst, src, count, kind): for a host-to-device copy
        // the freshly allocated device buffer is the destination and the host
        // buffer data[i].elements is the source. data[i].elements must already
        // point at elementSize bytes of host memory.
        cudaMemcpy(h_data[i].elements, data[i].elements, elementSize, cudaMemcpyHostToDevice );
    }// matrix data is now on the gpu, now copy the "meta" data to gpu

The key thing is to avoid using literal values for sizes and counts, especially when they are used multiple times. The variable numMat was used correctly, but then you used 5 for some reason. The idea is that you should be able to change sizes or counts in only one place and everything still works.