cuBLAS Vector Multiply

It’s possible to use the cuBLAS dgmm function to do a vector elementwise multiply. However, the kernel you have written is already about as efficient as this operation can be. Here is an example of using dgmm:

$ cat t2268.cu
#include <cublas_v2.h>
#include <iostream>

// Demonstrates an elementwise vector multiply (c = a .* b) using cublasSdgmm.
// dgmm with CUBLAS_SIDE_LEFT computes C = diag(x) * A; by treating d_a as an
// m x 1 column matrix and d_b as the diagonal vector x, the result C is the
// elementwise product of the two vectors.
int main(){

  const int ds = 32;  // vector length

  // Device buffers: the two input vectors and the output.
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, sizeof(d_a[0])*ds);
  cudaMalloc(&d_b, sizeof(d_b[0])*ds);
  cudaMalloc(&d_c, sizeof(d_c[0])*ds);

  // Host staging buffer, reused for both inputs and the final result.
  float *h = new float[ds];
  for (int i = 0; i < ds; i++) h[i] = i+1;    // a = 1, 2, ..., 32
  cudaMemcpy(d_a, h, sizeof(d_a[0])*ds, cudaMemcpyHostToDevice);
  for (int i = 0; i < ds; i++) h[i] = 2;      // b = 2, 2, ..., 2
  cudaMemcpy(d_b, h, sizeof(d_b[0])*ds, cudaMemcpyHostToDevice);

  cublasHandle_t hd;
  cublasStatus_t stat = cublasCreate(&hd);

  // Shape the dgmm call: A is an m x 1 matrix (the vector d_a), x is the
  // length-m diagonal vector d_b, and C (m x 1) receives diag(x) * A.
  cublasSideMode_t mode = CUBLAS_SIDE_LEFT;
  int m = ds;
  int n = 1;       // a single column: the "matrix" is just the vector
  int lda = ds;    // leading dimension of the m x 1 matrix A
  int incx = 1;    // stride through the diagonal vector x
  int ldc = ds;    // leading dimension of the m x 1 result C
  stat = cublasSdgmm(hd, mode, m, n, d_a, lda, d_b, incx, d_c, ldc);
  std::cout << (int)stat << std::endl;  // 0 == CUBLAS_STATUS_SUCCESS

  // Blocking copy back; also surfaces any asynchronous execution error.
  cudaError_t err = cudaMemcpy(h, d_c, sizeof(d_c[0])*ds, cudaMemcpyDeviceToHost);
  std::cout << cudaGetErrorString(err) << std::endl;
  for (int i = 0; i < ds; i++) std::cout << h[i] << std::endl;

  // Release resources (handle, device buffers, host buffer) before exit.
  cublasDestroy(hd);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete [] h;
  return 0;
}
$ nvcc -o t2268 t2268.cu -lcublas
$ ./t2268
0
no error
2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
$

If you have CUDA Fortran-specific questions, you are likely to get better help on one of the NVIDIA HPC compilers forums.