I have been having trouble assigning the right structure sizes when cudaMemcpy() method is called. Can anyone help me out with the following code?
#include <stdio.h>
#include <time.h>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <algorithm>
struct Matrix{
int width;
int height;
float* elements;
};
__global__ void doThings(Matrix* matrices)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x; // 0, 1, 2
matrices[idx].width = 5;
matrices[idx].height = 4;
matrices[idx].elements[0] = 42;
printf("matrices[%d].elements[0] : %.2f\n", idx, matrices[idx].elements[0]);
__syncthreads();
}
int main (void) {
int rows, cols, numMat = 2; // These are actually determined at run-time
Matrix* data = (Matrix*)malloc(numMat * sizeof(Matrix));
Matrix* h_data = (Matrix*)malloc(numMat * sizeof(Matrix));
memcpy(h_data, data, numMat * sizeof(Matrix));
for (int i=0; i<5; i++){
cudaMalloc(&(h_data[i].elements), rows*cols*sizeof(float));
cudaMemcpy(data[i].elements, h_data[i].elements, numMat*sizeof(Matrix), cudaMemcpyHostToDevice);
}// matrix data is now on the gpu, now copy the "meta" data to gpu
Matrix* d_data;
cudaMalloc(&d_data, numMat*sizeof(Matrix));
cudaMemcpy(d_data, h_data, numMat*sizeof(Matrix), cudaMemcpyHostToDevice);
// ... Do other things ...
dim3 block (4);
dim3 grid (1);
doThings <<< grid, block >>> (data);
for (int i=0; i<5; i++){
cudaMemcpy(h_data[i].elements, data[i].elements, numMat*sizeof(Matrix), cudaMemcpyDeviceToHost);
}
cudaMemcpy(h_data, d_data, numMat*sizeof(Matrix), cudaMemcpyDeviceToHost);
for (int i=0; i<5; i++){
printf("h_data[%d].elements[0] = %.2f\n", i, h_data[i].elements);
printf("h_data[%d].width = %d\n", i, h_data[i].width);
printf("h_data[%d].height = %d\n", i, h_data[i].height);
}
cudaFree(d_data);
return (0);
}