cublasSasum fails if result is on device

Hello everybody, below is a minimal test case that demonstrates the problem. Am I missing something when the result pointer passed to cublasSasum refers to device memory?

#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"

/*
 * Abort the program if the CUDA runtime has a pending error.
 *
 * Calls cudaGetLastError(), which returns the last runtime error and clears
 * it. When that error is not cudaSuccess, prints
 * "Cuda error: <msg>: <description>" to stderr and terminates the process
 * with a failure exit status so that scripts and shells can detect the error.
 *
 * msg: caller-supplied label identifying the call site being checked.
 *
 * Only the CUDA runtime error state is inspected here; status codes returned
 * by other libraries (for example a cublasStatus_t) are not visible to this
 * helper and have to be checked by the caller.
 */
inline void wrap_cudaGetLastError (const char *msg) 
/*< check GPU errors >*/
{
    const cudaError_t err = cudaGetLastError ();
    if (err == cudaSuccess) {
	return;   /* nothing pending */
    }

    /* Diagnostics belong on stderr, and an error must not exit with status 0. */
    fprintf (stderr, "Cuda error: %s: %s\n", msg, cudaGetErrorString (err));
    exit (EXIT_FAILURE);
}

/*
 * Minimal reproduction: runs cublasSasum twice over the same 5-element device
 * vector, first storing the result in a host variable, then in a device
 * buffer that is copied back to the host before printing.
 *
 * NOTE(review): the return values of the cudaMalloc/cudaMemcpy calls and the
 * cublasStatus_t returned by cublasCreate/cublasSasum are all discarded;
 * wrap_cudaGetLastError only queries the CUDA runtime's last error.
 */
int main(){
	// Host input vector; the sum of absolute values is 0+1+2+3+4 = 10.
	float L[5] = {0,1,2,3,4};
	float *d_L;
	int nx = 5;

	// Allocate the device copy of L and upload it.
	cudaMalloc(&d_L, sizeof(float)*nx);
	cudaMemcpy(d_L, L, nx*sizeof(float), cudaMemcpyHostToDevice);

	cublasHandle_t cublas_handle;
	cublasCreate(&cublas_handle);

	// cublasSasum with res on host
	float res = 0;
	cublasSasum(cublas_handle, nx, d_L, 1, &res);
	printf("res=%f \n", res);
	wrap_cudaGetLastError("cuda_cal_opL cublasSasum host");

	// cublasSasum with res on device
	float res2 = 0;
	float *d_res;

	cudaMalloc(&d_res, sizeof(float));
	// NOTE(review): the handle's pointer mode is never changed before this
	// call. cuBLAS handles default to CUBLAS_POINTER_MODE_HOST, so d_res is
	// treated as a host address although it points to device memory; this is
	// the call that crashes. Calling
	//   cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);
	// before it makes the device-result variant work.
	cublasSasum(cublas_handle, nx, d_L, 1, d_res);
	wrap_cudaGetLastError("cuda_cal_opL cublasSasum dev");
	// Bring the device-side result back to the host for printing.
	cudaMemcpy(&res2, d_res, sizeof(float), cudaMemcpyDeviceToHost);

	printf("d_res=%f\n", res2);
	

	// NOTE(review): only d_res is released here; d_L is never passed to cudaFree.
	cudaFree(d_res);
	cublasDestroy(cublas_handle);

	return 0;
}

When compiled and run, this program prints:

res=10.000000 
Segmentation fault (core dumped)

For reference, nvcc --version reports release 10.2, V10.2.89.

Regards,

Victor

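A cuBLAS handle starts out in host pointer mode (CUBLAS_POINTER_MODE_HOST), so scalar results such as the one returned by cublasSasum are written through the result pointer as if it were a host address. Passing a device pointer in that mode causes the segmentation fault. You need to set the pointer mode to device before the second call to cublasSasum: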

cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);

With that call added before the second cublasSasum, the program prints:

res=10.000000
d_res=10.000000