Here is the code. I think the input data size matters.
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
/* Using updated (v2) interfaces to cublas */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
// Utilities and system includes
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <helper_cuda.h> // helper function CUDA error checking and initialization
int main(int argc, char **argv)
{
if (argc<3)
{
std::cout << "mat_file, rhs_file" << std::endl;
exit(0);
}
int iarg=1;
char* mat_filename = argv[iarg++];
char* rhs_filename = argv[iarg++];
printf("mat file:%s\n", mat_filename);
printf("rhs file:%s\n", rhs_filename);
int *nnz_rows;
int M = 0, N = 0, nnz = 0, *I = NULL, *J = NULL;
unsigned long long long_nnz=0;
float *val = NULL;
float *x;
float *rhs;
int *d_col, *d_row;
float *d_val, *d_x, *d_v;
float *d_rhs, *d_u;
float alpha, beta;
int header[3];
FILE* file1;
FILE* rhs_file;
// This will pick the best possible CUDA capable device
cudaDeviceProp deviceProp;
int devID = findCudaDevice(argc, (const char **)argv);
if (devID < 0)
{
printf("exiting...\n");
exit(EXIT_SUCCESS);
}
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// Statistics about the GPU device
printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
int version = (deviceProp.major * 0x10 + deviceProp.minor);
if (version < 0x11)
{
printf("spmv: requires a minimum CUDA compute 1.1 capability\n");
exit(EXIT_SUCCESS);
}
file1 = fopen(mat_filename, "rb");
rhs_file = fopen(rhs_filename, "rb");
fread(header, sizeof(int), 3, file1);
fread(&long_nnz, sizeof(unsigned long long), 1, file1);
M = header[1]; N = header[2];
printf("M=%d, N=%d\n", M, N);
printf("long_nnz=%d\n", long_nnz);
nnz = long_nnz;
/* Generate a random tridiagonal symmetric matrix in CSR format */
//M = N = 1048576;
//nnz = (N-2)*3 + 4;
//I = (int *)malloc(sizeof(int)*(M+1));
I = (int *)malloc(sizeof(int)*(M+1));
nnz_rows = (int *)malloc(sizeof(int)*(M));
J = (int *)malloc(sizeof(int)*nnz);
val = (float *)malloc(sizeof(float)*nnz);
x = (float *)malloc(sizeof(float)*N);
rhs = (float *)malloc(sizeof(float)*M);
fread(header, sizeof(int), 2, rhs_file);
printf("numrow=%d\n", header[1]);
fread(rhs, sizeof(float), M, rhs_file);
fclose(rhs_file);
for (int i = 0; i < N; i++)
{
x[i] = 0.0f;
}
fread(nnz_rows, sizeof(int), M, file1);
fread(J, sizeof(int), (nnz), file1);
fread(val, sizeof(float), (nnz), file1);
fclose(file1);
memset(I,0,sizeof(int)*(M+1));
for(int i=0;i<M;++i)
{
I[i+1] = I[i]+nnz_rows[i];
}
/* Get handle to the CUBLAS context */
cublasHandle_t cublasHandle = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&cublasHandle);
checkCudaErrors(cublasStatus);
/* Get handle to the CUSPARSE context */
cusparseHandle_t cusparseHandle = 0;
cusparseStatus_t cusparseStatus;
cusparseStatus = cusparseCreate(&cusparseHandle);
checkCudaErrors(cusparseStatus);
cusparseMatDescr_t descr = 0;
cusparseStatus = cusparseCreateMatDescr(&descr);
checkCudaErrors(cusparseStatus);
checkCudaErrors(cublasStatus);
cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
checkCudaErrors(cudaMalloc((void **)&d_col , nnz*sizeof(int)));
checkCudaErrors(cudaMalloc((void **)&d_row , (M+1)*sizeof(int)));
checkCudaErrors(cudaMalloc((void **)&d_val , nnz*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_rhs , M*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_u , M*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_v , N*sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&d_x , N*sizeof(float)));
checkCudaErrors(cudaMemcpy(d_col , J , nnz*sizeof(int) , cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_row , I , (M+1)*sizeof(int) , cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_val , val , nnz*sizeof(float) , cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_x , x , N*sizeof(float) , cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_rhs , rhs , M*sizeof(float) , cudaMemcpyHostToDevice));
free(I);
free(J);
free(val);
free(nnz_rows);
alpha = 1.0f;
beta = 0.0f;
float a, b;
//cublasStatus = cublasScopy(cublasHandle, M, d_rhs, 1, d_u, 1);
checkCudaErrors(cudaMemcpy(d_u , d_rhs, M*sizeof(float) , cudaMemcpyDeviceToDevice));
cudaDeviceSynchronize();
checkCudaErrors(cublasStatus);
cublasStatus = cublasSnrm2(cublasHandle, M, d_u, 1, &b);
cudaDeviceSynchronize();
checkCudaErrors(cublasStatus);
//printf("|b|=%12.12f\n", b);
beta = b!=0.0f?1.0f/b:1.0f;
//printf("1.0/|b|=%12.12f\n", beta);
cublasStatus = cublasSscal(cublasHandle, M, &beta, d_u, 1);
cudaDeviceSynchronize();
checkCudaErrors(cublasStatus);
cusparseStatus = cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
M, N, nnz, &alpha, descr, d_val, d_row, d_col, d_u, &beta, d_v);
//cusparseStatus = cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
// M, N, nnz, &alpha, descr, d_val, d_row, d_col, d_v, &beta, d_u);
cudaDeviceSynchronize();
checkCudaErrors(cusparseStatus);
cublasStatus = cublasSnrm2(cublasHandle, N, d_v, 1, &a);
cudaDeviceSynchronize();
//printf("0 a = %12.12f \n", a);
checkCudaErrors(cublasStatus);
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaFree(d_col);
cudaFree(d_row);
cudaFree(d_val);
cudaFree(d_x);
cudaFree(d_u);
cudaFree(d_v);
cudaFree(d_rhs);
free(x);
free(rhs);
exit(1);
}