I have some code with UMA and it is not really working as I expected. The code below corresponds to the .cuh and .cu:
Array_Ops.cuh
extern "C"
{
__declspec(dllexport) size_t array_1D_size = 0, array_1D_length = 0;
__declspec(dllexport) float *array_1D_1, *array_1D_2;
__declspec(dllexport) void allocate_arrays_1D(size_t MB);
__declspec(dllexport) void deallocate_arrays_1D(void);
__declspec(dllexport) void print_arrays_1D(void);
}
Array_Ops.cu
#include "Array_Ops.cuh"
#include <sstream> // For the array printing
#include <fstream> // For the array printing
#include <string> // For the array printing
using std::stringstream;
using std::ofstream;
using std::string;
__declspec(dllexport) void allocate_arrays_1D(size_t MB)
{
array_1D_length = (MB * 1024 * 1024) / sizeof(size_t);
array_1D_size = array_1D_length * sizeof(size_t);
cudaMallocManaged(&array_1D_1, array_1D_size);
cudaMallocManaged(&array_1D_2, array_1D_size);
cudaMemset(array_1D_1, 0, array_1D_size);
cudaMemset(array_1D_2, 0, array_1D_size);
}
__declspec(dllexport) void deallocate_arrays_1D(void)
{
cudaFree(array_1D_1);
cudaFree(array_1D_2);
array_1D_size = array_1D_length = 0;
}
__declspec(dllexport) void print_arrays_1D(void)
{
stringstream ss_in, ss_out;
ss_in << "Input_Array_GPU_1D_" << array_1D_length << ".txt";
ss_out << "Output_Array_GPU_1D_" << array_1D_length << ".txt";
string file_name = ss_in.str();
ofstream array_1_out(file_name, ofstream::out);
for (size_t i = 0; i < array_1D_length; i++)
array_1_out << array_1D_1[i] << std::endl;
array_1_out.close();
file_name = ss_out.str();
ofstream array_2_out(file_name, ofstream::out);
for (size_t i = 0; i < array_1D_length; i++)
array_2_out << array_1D_2[i] << std::endl;
array_2_out.close();
}
The header declares a few variables and 3 functions, to allocate, deallocate and print the arrays. I do some operations on these arrays and need to print their content to disk for quality control.
In the implementation I allocate the arrays with cudaMallocManaged, and while I operate these arrays from within the device, the application works fine, but then I need to print stuff to disk and use the print_arrays_1D() method.
Since it is running just host code, I was expecting the UMA to transparently move arrays 1 and 2 from device memory to host memory, but the program will crash at the loop starting at line 38 in the .cu file. That is, when the program tries to read the array which, until that point, is in device memory.
I thought of trying to allocate a temporary array with new, do a cudaMemcpy from the array in device to this temp array and see what happens, but then it negates the point of using UMA. As for the error handling in CUDA calls, they are omitted just for simplicity and easier reading.
Do you guys see any problem in this code?
If you need further clarification, just let me know.