Hello
I’m using CUSPARSE on Windows XP 32-bit, with driver 285.86 and Toolkit 4.1 RC2 (build 21).
According to the documentation, it should be possible to pass a pointer to device memory to some functions, either for passing in arguments, or for obtaining the result of a computation.
Specifically, I tried to obtain the result of cusparseSdoti in device memory, but the function crashes. When using host memory, it works properly.
I created a minimal test program to reproduce the error. It crashes in the call to cusparseSdoti. When commenting out the line
useDevicePointer = true;
it works as expected.
#include <cusparse_v2.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
int main(int argc, char **argv)
{
int n = 1000;
int nnz = 100;
// Create a handle
cusparseHandle_t cusparseHandle = 0;
cusparseCreate(&cusparseHandle);
// Allocate and fill host memory for sparse and dense vector
float *h_xVal = new float[nnz];
int *h_xInd = new int[nnz];
for (int i=0; i<nnz; i++)
{
h_xVal[i] = 1.0f;
h_xInd[i] = i * 10;
}
float *h_yVal = new float[n];
for (int i=0; i<n; i++)
{
h_yVal[i] = 1.0f;
}
// Allocate device memory and copy host data to device
float *d_xVal;
cudaMalloc((void**)&d_xVal, nnz*sizeof(float));
cudaMemcpy(d_xVal, h_xVal, nnz*sizeof(float), cudaMemcpyHostToDevice);
int *d_xInd;
cudaMalloc((void**)&d_xInd, nnz*sizeof(int));
cudaMemcpy(d_xInd, h_xInd, nnz*sizeof(int), cudaMemcpyHostToDevice);
float *d_yVal;
cudaMalloc((void**)&d_yVal, n*sizeof(float));
cudaMemcpy(d_yVal, h_yVal, n*sizeof(float), cudaMemcpyHostToDevice);
float *h_result = new float[1];
bool useDevicePointer = false;
useDevicePointer = true;
if (useDevicePointer)
{
// Create a device pointer for the sdoti result
float *d_result;
cudaMalloc((void**)&d_result, 1*sizeof(float));
// Execute sdoti, writing the result to a device pointer
printf("Calling sdoti\n");
cusparseSdoti(cusparseHandle, nnz, d_xVal, d_xInd, d_yVal, d_result, CUSPARSE_INDEX_BASE_ZERO);
printf("Calling sdoti DONE\n");
cudaDeviceSynchronize();
// Copy result from device to host
cudaMemcpy(h_result, d_result, 1*sizeof(float), cudaMemcpyDeviceToHost);
}
else
{
// Execute sdoti, writing the result to a host pointer
printf("Calling sdoti\n");
cusparseSdoti(cusparseHandle, nnz, d_xVal, d_xInd, d_yVal, h_result, CUSPARSE_INDEX_BASE_ZERO);
printf("Calling sdoti DONE\n");
cudaDeviceSynchronize();
}
printf("Result %f\n", h_result[0]);
}
Did I do something wrong? Is this a known issue? (I found nothing about that in the release notes…).
EDIT: BTW, I have a GeForce 8800, in case this is related to the Compute Capability.