CUSP and cuSPARSE SpMV are slower than the CPU (SciPy)

I am using the CUSP library for the SpMV operation in CSR, COO, ELL, HYB, and DIA formats, but it gives slower end-to-end processing time than the CPU using the SciPy library. I read a .mtx file (from the SuiteSparse Matrix Collection) into device memory and multiply it by a dense vector of ones. When I run nvidia-smi, it always reports that only about 175 MB (sometimes 155 MB) of GPU memory is in use, even for .mtx files of 1 GB or more. The same thing happens with cuSPARSE.
The code I am using to perform SpMV on the GPU with the CUSP library is:

//read mtx on device in COO format
// Read the Matrix Market file directly into a COO matrix in device memory.
// NOTE(review): read_matrix_market_file parses on the host and then copies to
// the device, so end-to-end time for a 1 GB file is dominated by disk I/O and
// the host->device transfer, not by the SpMV kernel itself.
cusp::coo_matrix<int, float, cusp::device_memory> coo_device;
cusp::io::read_matrix_market_file(coo_device, mtx_file);

// Allocate output Y (initialized to 0) and input X (all ones) on the device.
cusp::array1d<float, cusp::device_memory> Y_device(coo_device.num_rows, 0);
cusp::array1d<float, cusp::device_memory> X_device(coo_device.num_cols, 1);

//-----------------COO format---------------------
if (strcmp("coo", format) == 0) {
  // Warm-up call so first-launch/JIT overhead is excluded from the timing.
  cusp::multiply(coo_device, X_device, Y_device);
  // BUG FIX: kernel launches are asynchronous, so we must synchronize BEFORE
  // starting the timer — otherwise the warm-up kernel's tail is charged to
  // the timed loop. Also, cudaThreadSynchronize() is deprecated; use
  // cudaDeviceSynchronize() instead.
  cudaDeviceSynchronize();
  timer t;
  for (int i = 0; i < num_trials; i++)
    cusp::multiply(coo_device, X_device, Y_device);
  cudaDeviceSynchronize();  // drain all launches before reading the timer
  time_ = t.seconds_elapsed() / num_trials;
}

else if (strcmp("csr", format) == 0) {
  // Convert COO -> CSR on the device. The conversion cost is deliberately
  // excluded from the timed loop below.
  cusp::csr_matrix<int, float, cusp::device_memory> csr_device;
  try {
    csr_device = coo_device;
  } catch (const cusp::format_conversion_exception&) {  // catch by const ref, not by value
    std::cout << "\tUnable to convert to CSR format" << std::endl;
    return -1;
  }

  // Warm-up call so first-launch overhead is excluded from the timing.
  cusp::multiply(csr_device, X_device, Y_device);
  // BUG FIX: synchronize BEFORE starting the timer — launches are async and
  // the warm-up kernel may still be running when `timer t` is constructed.
  // cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
  cudaDeviceSynchronize();
  timer t;
  for (int i = 0; i < num_trials; i++)
    cusp::multiply(csr_device, X_device, Y_device);
  cudaDeviceSynchronize();  // drain all launches before reading the timer
  time_ = t.seconds_elapsed() / num_trials;
}  // BUG FIX: the closing brace of this "csr" branch was missing in the paste

For the cuSPARSE library I am using the following code:

// Host-side vectors: x = all ones (n elements), y = all zeros (m elements).
xHostPtr = (double*)malloc(n * sizeof(xHostPtr[0]));
yHostPtr = (double*)malloc(m * sizeof(yHostPtr[0]));
if ((!xHostPtr) || (!yHostPtr)) {
    // BUG FIX: the original used typographic quotes (“ ”) around the string,
    // which is not valid C/C++ and fails to compile.
    CLEANUP("Host malloc failed (vectors)");
    return -1;
}
for (int i = 0; i < m; i++) {
    yHostPtr[i] = 0.0;
}
for (int i = 0; i < n; i++) {
    xHostPtr[i] = 1.0;
}
/* Allocate GPU memory and copy the CSR matrix and the vectors into it.
   BUG FIXES vs. the pasted original:
   - the block comment above was unterminated ("/ ... /");
   - the first cudaMalloc cast was mangled into "(void" + "*)" across two
     lines; it must be (void**);
   - the CLEANUP strings used typographic quotes (“ ”), which do not compile. */
cudaStat1 = cudaMalloc((void**)&csrRowPtr, (m + 1) * sizeof(csrRowPtr[0]));
cudaStat2 = cudaMalloc((void**)&ColIndex, nnz * sizeof(ColIndex[0]));
cudaStat3 = cudaMalloc((void**)&Values, nnz * sizeof(Values[0]));
cudaStat4 = cudaMalloc((void**)&y, m * sizeof(y[0]));
cudaStat5 = cudaMalloc((void**)&x, n * sizeof(x[0]));
if ((cudaStat1 != cudaSuccess) || (cudaStat2 != cudaSuccess) || (cudaStat3 != cudaSuccess) ||
    (cudaStat4 != cudaSuccess) || (cudaStat5 != cudaSuccess)) {
    CLEANUP("Device malloc failed");
    return -1;
}
// Copy row pointers, column indices, values, and both dense vectors H2D.
cudaStat1 = cudaMemcpy(csrRowPtr, csrRowHostPtr, (size_t)((m + 1) * sizeof(csrRowPtr[0])),
                       cudaMemcpyHostToDevice);
cudaStat2 = cudaMemcpy(ColIndex, ColIndexHostPtr, (size_t)(nnz * sizeof(ColIndex[0])),
                       cudaMemcpyHostToDevice);
cudaStat3 =
    cudaMemcpy(Values, ValuesHostPtr, (size_t)(nnz * sizeof(Values[0])), cudaMemcpyHostToDevice);
cudaStat4 = cudaMemcpy(y, yHostPtr, (size_t)(m * sizeof(y[0])), cudaMemcpyHostToDevice);
cudaStat5 = cudaMemcpy(x, xHostPtr, (size_t)(n * sizeof(x[0])), cudaMemcpyHostToDevice);
if ((cudaStat1 != cudaSuccess) || (cudaStat2 != cudaSuccess) || (cudaStat3 != cudaSuccess) ||
    (cudaStat4 != cudaSuccess) || (cudaStat5 != cudaSuccess)) {
    CLEANUP("Memcpy from Host to Device failed");
    return -1;
}

// SpMV: y = alpha * A * x + beta * y, with A an m x n CSR matrix (nnz nonzeros).
// NOTE(review): this call launches asynchronously — `status` should be checked
// and cudaDeviceSynchronize() called before stopping any timer or reading y,
// otherwise the measurement is meaningless.
// NOTE(review): cusparseDcsrmv is deprecated and removed in CUDA 11+; the
// generic cusparseSpMV API replaces it — confirm against your toolkit version.
status = cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, nnz, &alpha, descr,
Values, csrRowPtr, ColIndex, &x[0], &beta, &y[0]);