Hi everyone,
I just try multiple Kx, where K - sparse matrix[256256, 256256], x - vector[256256].
At the begining I used types:
- cusp::array2d<float, cusp::host_memory>
- cusp::array1d<float, cusp::host_memory>
and cusp::multiply (cusp/multiply.h) for multiplication.
However, I have found that it will generate error if the size of matrix more then [20000, 20000].
In my case I have matrix [65536, 65536]. Actually, the length of non-zero elements in this matrix is 65536.
Also, I have tried to use cusp::coo_matrix_view, where I put x_index, y_index, and values.
It does not generate error, however the following operation:
cusp::multiply(A, x, b);
calculate nothing.
Could anyone give any tip how to work with big sparse matrix in cuda?
Any sample will be usefull.
My current test code:
int count = 256*256;
int host_I[count]; // COO row indices
int host_J[count]; // COO column indices
float host_V[count]; // COO values
// x and b arrays in host memory
float host_x[count];
float host_b[count];
// allocate device memory for CSR format
int * device_I; cudaMalloc(&device_I, count * sizeof(int));
int * device_J; cudaMalloc(&device_J, count * sizeof(int));
float * device_V; cudaMalloc(&device_V, count * sizeof(float));
// allocate device memory for x and y arrays
float * device_x; cudaMalloc(&device_x, count * sizeof(float));
float * device_b; cudaMalloc(&device_b, count * sizeof(float));
// copy raw data from host to device
cudaMemcpy(device_I, host_I, count * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(device_J, host_J, count * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(device_V, host_V, count * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_x, host_x, count * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_b, host_b, count * sizeof(float), cudaMemcpyHostToDevice);
// matrices and vectors now reside on the device
// NOTE raw pointers must be wrapped with thrust::device_ptr!
thrust::device_ptr wrapped_device_I(device_I);
thrust::device_ptr wrapped_device_J(device_J);
thrust::device_ptr wrapped_device_V(device_V);
thrust::device_ptr wrapped_device_x(device_x);
thrust::device_ptr wrapped_device_b(device_b);
// use array1d_view to wrap the individual arrays
typedef typename cusp::array1d_view< thrust::device_ptr > DeviceIndexArrayView;
typedef typename cusp::array1d_view< thrust::device_ptr > DeviceValueArrayView;
DeviceIndexArrayView row_indices (wrapped_device_I, wrapped_device_I + count);
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + count);
DeviceValueArrayView values (wrapped_device_V, wrapped_device_V + count);
DeviceValueArrayView x (wrapped_device_x, wrapped_device_x + count);
DeviceValueArrayView b (wrapped_device_b, wrapped_device_b + count);
// combine the three array1d_views into a coo_matrix_view
typedef cusp::coo_matrix_view<DeviceIndexArrayView,
DeviceIndexArrayView,
DeviceValueArrayView> DeviceView;
// construct a coo_matrix_view from the array1d_views
DeviceView A(count, count, count, row_indices, column_indices, values);
// compute b = A * x
cusp::multiply(A, x, b);
// copy back to host
cudaMemcpy(host_b, device_b, count * sizeof(float), cudaMemcpyDeviceToHost);
// print the result
for(int i = 0; i < count; i++)
{
printf(“%f\n”, host_b[i]);
}
// free device arrays
cudaFree(device_I);
cudaFree(device_J);
cudaFree(device_V);
cudaFree(device_x);
cudaFree(device_b);