# CUSP large matrix-vector multiplication

Hi everyone,

I am just trying to multiply Kx, where K is a sparse matrix [256*256, 256*256] and x is a vector of length 256*256.
At the beginning I used the types:

1. cusp::array2d<float, cusp::host_memory>
2. cusp::array1d<float, cusp::host_memory>
and cusp::multiply (cusp/multiply.h) for multiplication.
However, I have found that it generates an error if the size of the matrix is more than [20000, 20000].
In my case I have a matrix [65536, 65536]. Actually, the number of non-zero elements in this matrix is 65536.

Also, I have tried to use cusp::coo_matrix_view, where I put x_index, y_index, and values.
It does not generate error, however the following operation:
cusp::multiply(A, x, b);
calculates nothing.

Could anyone give a tip on how to work with large sparse matrices in CUDA?
Any sample would be useful.

My current test code:

// Build K as a count x count identity matrix in COO format and compute
// b = K * x on the device with CUSP, so the expected result is b == x.
//
// Fixes over the original snippet:
//  1. the host arrays were copied to the device without ever being
//     initialized, so cusp::multiply operated on garbage input;
//  2. thrust::device_ptr and cusp::array1d_view require explicit template
//     arguments -- without them this does not compile;
//  3. `typename` is only legal inside a template, so the typedefs were
//     ill-formed.
const int count = 256*256; // matrix dimension and number of non-zeros;
                           // const so the fixed-size arrays below are legal C++
                           // NOTE: ~1.25 MB of stack arrays follow; switch to
                           // heap allocation for larger sizes.
int host_I[count]; // COO row indices
int host_J[count]; // COO column indices
float host_V[count]; // COO values

// x and b arrays in host memory
float host_x[count];
float host_b[count];

// Initialize K = identity, x = ramp, b = 0.  cusp::multiply requires the
// COO entries to be sorted by row index, which this fill order satisfies.
for (int i = 0; i < count; i++)
{
    host_I[i] = i;
    host_J[i] = i;
    host_V[i] = 1.0f;
    host_x[i] = (float)i;
    host_b[i] = 0.0f;
}

// allocate device memory for the COO arrays
int * device_I; cudaMalloc(&device_I, count * sizeof(int));
int * device_J; cudaMalloc(&device_J, count * sizeof(int));
float * device_V; cudaMalloc(&device_V, count * sizeof(float));

// allocate device memory for x and b arrays
float * device_x; cudaMalloc(&device_x, count * sizeof(float));
float * device_b; cudaMalloc(&device_b, count * sizeof(float));

// copy raw data from host to device
cudaMemcpy(device_I, host_I, count * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(device_J, host_J, count * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(device_V, host_V, count * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_x, host_x, count * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(device_b, host_b, count * sizeof(float), cudaMemcpyHostToDevice);

// matrices and vectors now reside on the device

// raw device pointers must be wrapped with thrust::device_ptr<T>
// (the template argument was missing in the original)
thrust::device_ptr<int>   wrapped_device_I(device_I);
thrust::device_ptr<int>   wrapped_device_J(device_J);
thrust::device_ptr<float> wrapped_device_V(device_V);
thrust::device_ptr<float> wrapped_device_x(device_x);
thrust::device_ptr<float> wrapped_device_b(device_b);

// use array1d_view to wrap the individual arrays
// (no `typename` here -- that keyword is only valid inside a template)
typedef cusp::array1d_view< thrust::device_ptr<int> >   DeviceIndexArrayView;
typedef cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;

DeviceIndexArrayView row_indices   (wrapped_device_I, wrapped_device_I + count);
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + count);
DeviceValueArrayView values        (wrapped_device_V, wrapped_device_V + count);
DeviceValueArrayView x             (wrapped_device_x, wrapped_device_x + count);
DeviceValueArrayView b             (wrapped_device_b, wrapped_device_b + count);

// combine the three array1d_views into a coo_matrix_view
typedef cusp::coo_matrix_view<DeviceIndexArrayView,
                              DeviceIndexArrayView,
                              DeviceValueArrayView> DeviceView;

// construct a coo_matrix_view from the array1d_views:
// count rows, count columns, count non-zero entries
DeviceView A(count, count, count, row_indices, column_indices, values);

// compute b = A * x on the device
cusp::multiply(A, x, b);

// copy the result back to host (cudaMemcpy is blocking, so no explicit
// synchronization is needed before reading host_b)
cudaMemcpy(host_b, device_b, count * sizeof(float), cudaMemcpyDeviceToHost);

// print the result; with K = identity this should reproduce host_x
for(int i = 0; i < count; i++)
{
printf("%f\n", host_b[i]);
}

// free device arrays
cudaFree(device_I);
cudaFree(device_J);
cudaFree(device_V);
cudaFree(device_x);
cudaFree(device_b);

Why not cuSPARSE? (It comes with the CUDA SDK, so you already have it, and it is less messy than the above procedure.)

http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv

I use that all the time with no issues, but the matrix needs to be in CSR format. There are also utilities to convert from coo format to csr.

(use code blocks next time)

Try this:

``````#include <cusp/coo_matrix.h>
#include <cusp/verify.h>
#include <cusp/array1d.h>
#include <cusp/multiply.h>
#include <cusp/print.h>
#define DIM 65536

// Self-contained CUSP sanity check: build a DIM x DIM tridiagonal matrix
// (all three diagonals set to 1) in COO format, multiply it by a vector of
// ones on the host and on the device, and verify both results.
// Expected: y[i] == 3 for interior rows, y[0] == y[DIM-1] == 2.
int main(){
    // A tridiagonal matrix holds 3*DIM - 2 non-zeros (the two corner rows
    // have only 2 entries each).
    cusp::coo_matrix<int,float, cusp::host_memory> A(DIM, DIM, (3*DIM)-2);

    // Fill the three diagonals row by row; guards on the first/last row keep
    // the COO entries sorted by row with no out-of-range columns.
    int idx = 0;
    for (int row = 0; row < DIM; row++){
        if (row > 0){
            A.row_indices[idx] = row; A.column_indices[idx] = row-1; A.values[idx] = 1;
            idx++;
        }
        A.row_indices[idx] = row; A.column_indices[idx] = row; A.values[idx] = 1;
        idx++;
        if (row < DIM-1){
            A.row_indices[idx] = row; A.column_indices[idx] = row+1; A.values[idx] = 1;
            idx++;
        }
    }
    if (!cusp::is_valid_matrix(A)) {printf("Invalid COO\n"); return 1;}

    // x = all ones, y = all zeros
    cusp::array1d<float, cusp::host_memory> x(DIM), y(DIM);
    for (int i = 0; i < DIM; i++){
        x[i] = 1;
        y[i] = 0;}

    // test host multiply
    cusp::multiply(A, x, y);
    //cusp::print(y);
    for (int i = 1; i < DIM-1; i++)
        if (y[i] != 3.0f) {printf("invalid value at %d, was: %f, should be %f\n", i, y[i], 3.0f); return 1;}
    if (y[0] != 2.0f) {printf("invalid value at %d, was: %f, should be %f\n", 0, y[0], 2.0f); return 1;}
    if (y[DIM-1] != 2.0f) {printf("invalid value at %d, was: %f, should be %f\n", DIM-1, y[DIM-1], 2.0f); return 1;}

    // test device multiply: copy A and x to the GPU, multiply there, then
    // bring the result back and compare element-wise with the host run.
    cusp::coo_matrix<int,float, cusp::device_memory> d_A = A;
    cusp::array1d<float, cusp::device_memory> d_x = x;
    cusp::array1d<float, cusp::device_memory> d_y(DIM);
    cusp::multiply(d_A, d_x, d_y);
    cusp::array1d<float, cusp::host_memory> gy = d_y;
    for (int i = 0; i < DIM; i++)
        if (y[i] != gy[i]) {printf("host/device mismatch at: %d, should be: %f, was: %f\n", i, y[i], gy[i]); return 1;}

    printf("Success!\n");
    return 0;
}
``````

I’m being a little sloppy in that I’m testing float values for equality for verification of results, but since I know the only 2 values I care about are 2 and 3, it should be OK, as these can be stored exactly in a float. If you set DIM to a small value and uncomment the cusp::print line you can see the results directly.

And of course cuSparse should be fine too.