Reading [1], I see that reusing a cublasHandle_t is good practice, but if I need to make multiple cuBLAS calls per thread, is reusing the handle still good practice?
If I create a handle outside of the kernel, how can I pass it to the kernel and use it there?
//~ nvcc -rdc=true -arch=sm_35 -o t123 t123.cu -lcublas -lcublas_device -lcudadevrt
//~ sudo optirun --no-xorg ./t123
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <cublas_v2.h>
/*
 * Scales the first _size elements of x in place by 2.0 using the device-side
 * cuBLAS API (the build line links cublas_device and cudadevrt).
 *
 * handle  Accepted only to keep the kernel's signature stable; it is NOT used.
 *         A handle created by cublasCreate() in host code could not be used
 *         from device code here (the original call with it did not work), so
 *         the kernel creates its own handle instead.
 * x       Pointer handed to cublasDscal as the vector to scale; it is
 *         dereferenced on the GPU, so it must be device-accessible.
 *
 * Every thread that executes this kernel creates its own handle and scales
 * the whole vector, so it is only meaningful for a single-thread launch.
 *
 * NOTE(review): _size is not defined in this block; it is assumed to be a
 * compile-time element count defined elsewhere in the file - confirm.
 */
__global__ void kernel2(cublasHandle_t handle, double *x){
    (void)handle; // host-created handle: intentionally unused in device code

    double alpha = 2.0;
    double *ptrAlpha = &alpha;

    // Device-side handle: created, used and destroyed inside the kernel.
    cublasHandle_t handle2;
    cublasStatus_t stat = cublasCreate(&handle2);
    if(stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failed\n");
        return;
    }

    stat = cublasDscal(handle2, _size, ptrAlpha, x, 1);
    if(stat != CUBLAS_STATUS_SUCCESS){
        // Report instead of silently ignoring a failed scale.
        printf("cublasDscal failed\n");
    }

    cublasDestroy(handle2);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
    if(err != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

/*
 * Fills a host vector with 0, 1, ..., _size-1, copies it to the GPU, runs
 * kernel2 on it with a single thread, copies the result back and prints the
 * last element.
 *
 * Returns EXIT_SUCCESS when every step succeeded, EXIT_FAILURE otherwise.
 *
 * NOTE(review): _size is not defined in this block; it is assumed to be a
 * positive compile-time element count defined elsewhere - confirm
 * (vetorOut[_size-1] is out of bounds if it is 0).
 */
int main(int argc, char **argv){
    (void)argc;
    (void)argv;

    // Host-side handle. It is still passed to kernel2 because the kernel's
    // signature takes one.
    // NOTE(review): a handle created on the host is not expected to be usable
    // from device code - confirm kernel2 does not rely on it.
    cublasHandle_t handle;
    cublasStatus_t stat = cublasCreate(&handle);
    if(stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    const size_t bytes = _size*sizeof(double);

    // Plain new[] throws on failure instead of returning NULL, so no
    // null-check is needed after these allocations.
    double *vetor = new double[_size];
    double *vetorOut = new double[_size];
    for(int i=0; i<_size; i++){
        vetor[i] = (double)i;
    }

    double *ptrvetor = NULL;
    bool ok = cudaOk(cudaMalloc((void**) &ptrvetor, bytes), "cudaMalloc");
    ok = ok && cudaOk(cudaMemcpy(ptrvetor, vetor, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy (host to device)");

    if(ok){
        kernel2<<<1, 1>>>(handle, ptrvetor);
        // Launch errors surface through cudaGetLastError(); faults raised
        // while the kernel runs surface at the following synchronisation.
        ok = cudaOk(cudaGetLastError(), "kernel2 launch")
          && cudaOk(cudaDeviceSynchronize(), "kernel2 execution");
    }

    ok = ok && cudaOk(cudaMemcpy(vetorOut, ptrvetor, bytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");

    if(ok){
        // Only print when vetorOut really holds data copied from the device.
        printf("%f\n", vetorOut[_size-1]);
    }

    // Cleanup runs on every path; cudaFree(NULL) is a no-op.
    cudaFree(ptrvetor);
    delete [] vetor;
    delete [] vetorOut;
    cublasDestroy(handle);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}