This is the host and device code that I am using:
HOST CODE (matvecmult.cu):
======
// HOST CODE 17/03/2008
// Sparse Matrix-Vector multiplication.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil_inline.h>
#include "matvecmult_kernel.cu"
// Multiplicacion matriz por vector A * x = res
// Sparse matrix-vector multiplication on the GPU: res = A * x.
// A is stored in CSR format:
//   gval_    - the nnzero non-zero values,
//   gcolind_ - the nnzero column indices,
//   growptr_ - nrow+1 row offsets (row i spans [growptr_[i], growptr_[i+1])).
// gx_ has ncol entries; res_ receives nrow entries.
// Note: the _d suffix identifies a device pointer.
void Mult(const double* gval_, const int* gcolind_, const int* growptr_, const int nrow, const int ncol,
const int nnzero, const double* gx_, double* res_)
{
    double* gval_d;
    int* gcolind_d;
    int* growptr_d;
    int* ndim_d;
    double* gx_d;
    double* res_d;

    // Pack the problem dimensions so the kernel can bounds-check rows.
    int ndim_[3];
    ndim_[0] = nrow; ndim_[1] = ncol; ndim_[2] = nnzero;

    size_t size_val    = nnzero * sizeof(double);   // non-zero values
    size_t size_col    = nnzero * sizeof(int);      // column indices
    size_t size_rowptr = (nrow + 1) * sizeof(int);  // CSR row pointer has nrow+1 entries
                                                    // (kernel reads growptr_[row+1])
    size_t size_x      = ncol * sizeof(double);     // input vector x
    size_t size_res    = nrow * sizeof(double);     // output vector res
    size_t size_ndim   = 3 * sizeof(int);

    // Device allocations (checked; cutilSafeCall comes from cutil_inline.h).
    cutilSafeCall(cudaMalloc((void**)&gval_d, size_val));
    cutilSafeCall(cudaMalloc((void**)&gcolind_d, size_col));
    cutilSafeCall(cudaMalloc((void**)&growptr_d, size_rowptr));
    cutilSafeCall(cudaMalloc((void**)&ndim_d, size_ndim));
    cutilSafeCall(cudaMalloc((void**)&gx_d, size_x));
    cutilSafeCall(cudaMalloc((void**)&res_d, size_res));

    // Copy the inputs host -> device. res_d is write-only in the kernel,
    // so there is no need to copy the uninitialized output buffer up.
    cutilSafeCall(cudaMemcpy(gval_d, gval_, size_val, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(gcolind_d, gcolind_, size_col, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(growptr_d, growptr_, size_rowptr, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(ndim_d, ndim_, size_ndim, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(gx_d, gx_, size_x, cudaMemcpyHostToDevice));

    // One thread per matrix row: ceil(nrow / BLOCK_SIZE) blocks so the
    // grid covers every row even when nrow is not a multiple of BLOCK_SIZE
    // (the kernel's bounds check discards the surplus threads).
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid((nrow + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Launch the device computation and surface any launch-config error.
    Mult_d<<< dimGrid, dimBlock >>>(gval_d, gcolind_d, growptr_d, ndim_d, gx_d, res_d);
    cutilCheckMsg("Mult_d kernel launch failed");

    // Read the result back (blocking copy also synchronizes with the kernel).
    cutilSafeCall(cudaMemcpy(res_, res_d, size_res, cudaMemcpyDeviceToHost));

    // Free ALL device memory (the original leaked gx_d and res_d).
    cutilSafeCall(cudaFree(gval_d));
    cutilSafeCall(cudaFree(gcolind_d));
    cutilSafeCall(cudaFree(growptr_d));
    cutilSafeCall(cudaFree(ndim_d));
    cutilSafeCall(cudaFree(gx_d));
    cutilSafeCall(cudaFree(res_d));
}
==
DEVICE CODE
==
// DEVICE CODE
// Sparse Matrix-Vector multiplication.
#ifndef MATVECMULT_KERNEL_H
#define MATVECMULT_KERNEL_H
#include <stdio.h>
#include "matvecmult.h"
#endif
// Sparse (CSR) matrix-vector product kernel: res = A * x.
// One thread per matrix row; launch with >= ndim_[0] total threads
// (grid of ceil(nrow/blockDim.x) blocks). Extra threads exit at the guard.
//   gval_    - non-zero values            (ndim_[2] entries)
//   gcolind_ - column index per non-zero  (ndim_[2] entries)
//   growptr_ - row offsets                (ndim_[0]+1 entries)
//   ndim_    - {nrow, ncol, nnzero}
//   gx_      - input vector               (ndim_[1] entries)
//   res_     - output vector              (ndim_[0] entries, write-only here)
__global__ void Mult_d(double* gval_, int* gcolind_, int* growptr_, int* ndim_, double* gx_, double* res_){
    // Global row index: combine block and thread indices so the kernel is
    // correct for any grid size (threadIdx.x alone breaks past one block).
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against the grid tail when nrow is not a multiple of blockDim.x.
    if (row < ndim_[0]) {
        double daux = 0.0;
        // Accumulate this row's non-zeros: k in [growptr_[row], growptr_[row+1]).
        for (int k = growptr_[row]; k < growptr_[row + 1]; ++k) {
            daux += gval_[k] * gx_[gcolind_[k]];
        }
        res_[row] = daux;
    }
}