It’s possible to use the cuBLAS dgmm function to do a vector elementwise multiply: with CUBLAS_SIDE_LEFT, dgmm computes C = diag(x) * A, so treating one vector as an m x 1 matrix A and the other as the scaling vector x gives c[i] = x[i] * a[i]. However, the kernel you have written is already essentially optimal: an elementwise multiply is memory-bandwidth-bound, so any correct implementation is limited by the same reads and writes. Here is an example of using dgmm:
$ cat t2268.cu
#include <cublas_v2.h>
#include <iostream>

int main(){
  const int ds = 32;
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, sizeof(d_a[0])*ds);
  cudaMalloc(&d_b, sizeof(d_b[0])*ds);
  cudaMalloc(&d_c, sizeof(d_c[0])*ds);
  float *h = new float[ds];
  for (int i = 0; i < ds; i++) h[i] = i+1;   // a = 1, 2, 3, ..., 32
  cudaMemcpy(d_a, h, sizeof(d_a[0])*ds, cudaMemcpyHostToDevice);
  for (int i = 0; i < ds; i++) h[i] = 2;     // b = 2, 2, 2, ...
  cudaMemcpy(d_b, h, sizeof(d_b[0])*ds, cudaMemcpyHostToDevice);
  cublasHandle_t hd;
  cublasStatus_t stat = cublasCreate(&hd);
  // treat a as an m x 1 matrix A and b as the scaling vector x:
  // with CUBLAS_SIDE_LEFT, dgmm computes C = diag(x) * A, i.e. c[i] = b[i]*a[i]
  cublasSideMode_t mode = CUBLAS_SIDE_LEFT;
  int m = ds;     // rows of A and C
  int n = 1;      // columns of A and C: a single vector
  int lda = ds;   // leading dimension of A
  int incx = 1;   // stride of the scaling vector
  int ldc = ds;   // leading dimension of C
  stat = cublasSdgmm(hd, mode, m, n, d_a, lda, d_b, incx, d_c, ldc);
  std::cout << (int)stat << std::endl;       // 0 == CUBLAS_STATUS_SUCCESS
  cudaError_t err = cudaMemcpy(h, d_c, sizeof(d_c[0])*ds, cudaMemcpyDeviceToHost);
  std::cout << cudaGetErrorString(err) << std::endl;
  for (int i = 0; i < ds; i++) std::cout << h[i] << std::endl;
  cublasDestroy(hd);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
  delete[] h;
}
$ nvcc -o t2268 t2268.cu -lcublas
$ ./t2268
0
no error
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
$
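For reference, the kind of hand-written kernel being compared against is typically just a few lines. This is a minimal sketch (the kernel name vecmul and the launch configuration are illustrative, not taken from your code):

// minimal sketch of a plain elementwise-multiply kernel
__global__ void vecmul(const float *a, const float *b, float *c, int n){
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i]*b[i];  // one read of each input, one write per element
}
// illustrative launch for the same ds = 32 problem:
// vecmul<<<(ds+255)/256, 256>>>(d_a, d_b, d_c, ds);

Both the dgmm call and a kernel like this have to move the same three arrays through memory, which is why the simple kernel is already effectively as fast as this operation can go.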
If you have CUDA Fortran-specific questions, you are likely to get better help on one of the HPC compilers forums.