I’m trying to implement a sparse N-dimensional vector array data structure. It works on the host, but I’m not sure how I should copy everything to the device. The code below mostly just fills the data structure with some data to test the kernel; in the future, the data will be loaded dynamically from a file.
// One entry of a sparse vector: the position of the entry paired with the
// value stored at that position.
struct svm_node
{
    int   index; // position of this entry within its row
    float value; // value stored at that position
};
// Row and column counts for the variable-length (jagged) two-dimensional array.
const unsigned int rowCount = 1000; // Number of rows
unsigned int columnCount[rowCount]; // Number of columns/items in each row
svm_node** h_x_nodes;               // Host table of row pointers; row i holds columnCount[i] nodes
unsigned int memsizeRows = sizeof(svm_node*) * rowCount; // Size in bytes of the row-pointer table

// Allocate host memory for the row pointers in h_x_nodes.
h_x_nodes = static_cast<svm_node**>(malloc(memsizeRows));

// Create a list of column counts to test with; in reality this will be
// created by parsing a file.
// Valid indices are 0 .. rowCount - 1, so the loop bound must be `<`:
// using `<=` would write one element past the end of columnCount.
for (unsigned int i = 0; i < rowCount; ++i)
{
    columnCount[i] = 10;
}

// Fill h_x_nodes with some data.
// Same bound as above: `<=` would also write past the end of the
// row-pointer table allocated with room for exactly rowCount pointers.
for (unsigned int i = 0; i < rowCount; ++i)
{
    // Allocate host memory for the number of columns/items in row i.
    // Every row is its own allocation, so rows are not contiguous with
    // each other in host memory.
    h_x_nodes[i] = static_cast<svm_node*>(malloc(sizeof(svm_node) * columnCount[i]));

    for (unsigned int j = 0; j < columnCount[i]; ++j)
    {
        // Test pattern: entry j of every row gets index j and value j + 0.5.
        h_x_nodes[i][j].index = j;
        h_x_nodes[i][j].value = 0.5f + (float)j;
    }
}
// Pick the CUDA device: when no "device" flag is present on the command line,
// use the device with the highest Gflops/s; otherwise let cutilDeviceInit
// handle the command-line specified device.
if( !cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
{
    cudaSetDevice( cutGetMaxGflopsDeviceId() );
}
else
{
    cutilDeviceInit(argc, argv);
}

// Create a cutil timer and start it immediately.
unsigned int timer = 0;
cutilCheckError( cutCreateTimer( &timer ) );
cutilCheckError( cutStartTimer( timer ) );
Do you have any idea how to copy h_x_nodes to the device efficiently?