Hello everybody, here follows a test case showcasing this point. Am I missing something when using *result on device?
#include <stdio.h>
#include "cublas_v2.h"
inline void wrap_cudaGetLastError (const char *msg)
/*< check GPU errors >*/
{
cudaError_t err = cudaGetLastError ();
if (cudaSuccess != err) {
fprintf (stdout, "Cuda error: %s: %s\n", msg, cudaGetErrorString (err));
exit(0);
}
}
int main(){
float L[5] = {0,1,2,3,4};
float *d_L;
int nx = 5;
cudaMalloc(&d_L, sizeof(float)*nx);
cudaMemcpy(d_L, L, nx*sizeof(float), cudaMemcpyHostToDevice);
cublasHandle_t cublas_handle;
cublasCreate(&cublas_handle);
// cublasSasum with res on host
float res = 0;
cublasSasum(cublas_handle, nx, d_L, 1, &res);
printf("res=%f \n", res);
wrap_cudaGetLastError("cuda_cal_opL cublasSasum host");
// cublasSasum with res on device
float res2 = 0;
float *d_res;
cudaMalloc(&d_res, sizeof(float));
cublasSasum(cublas_handle, nx, d_L, 1, d_res);
wrap_cudaGetLastError("cuda_cal_opL cublasSasum dev");
cudaMemcpy(&res2, d_res, sizeof(float), cudaMemcpyDeviceToHost);
printf("d_res=%f\n", res2);
cudaFree(d_res);
cublasDestroy(cublas_handle);
return 0;
}
which when compiled and run, prints out:
res=10.000000
Segmentation fault (core dumped)
Also, nvcc --version
here is release 10.2, V10.2.89
.
Regards,
Victor