Hello, Mat. I have two very large vectors and I need to multiply them element by element (point-wise multiplication). I have tried some parallel solutions, but the results are not ideal: each pass still takes several tens of thousands of microseconds (about 52 ms in the output below). Is there a faster solution?
./dot
NN=120000000
CUPTI ERROR: cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL) returned: CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, at ../../src-cupti/prof_cuda_cupti.c:338.
t_12=51750
t_23=51890
CC=-17.000000 16.000000
The code I am running is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include<openacc.h>
#include "device_launch_parameters.h"
/* Number of complex elements in each vector. Parenthesized so the macro
 * expands safely inside larger expressions such as `x / NN` or `x % NN`. */
#define NN (200*200*3000)
// Timing helpers
/*
 * gettime(a)   - store the current wall-clock time in *a.
 * usec(t1,t2)  - elapsed time from t1 to t2 in microseconds (long long).
 * timestruct   - the platform's timestamp type used by both macros.
 */
#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
/* _timeb carries seconds plus milliseconds; total milliseconds * 1000 yields
 * microseconds (the previous factor of 100 under-reported by 10x). */
#define usec(t1,t2) ((((long long)((t2).time-(t1).time))*1000LL+((t2).millitm-(t1).millitm))*1000LL)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
/* timeval carries seconds plus microseconds; widen before scaling so the
 * seconds term cannot overflow a 32-bit long. */
#define usec(t1,t2) (((long long)((t2).tv_sec-(t1).tv_sec))*1000000LL+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif
/*
 * Complex product of two double-precision complex numbers.
 *
 * For a1 = (x1 + i*y1) and a2 = (x2 + i*y2) returns
 * (x1*x2 - y1*y2) + i*(x1*y2 + y1*x2).
 *
 * The operands are taken by const reference so that const values and
 * temporaries (e.g. the result of another product) can be multiplied as
 * well; existing calls with non-const lvalues bind exactly as before.
 *
 * Declared as a sequential OpenACC routine so it can be called from inside
 * accelerator compute regions.
 */
#pragma acc routine seq
cuDoubleComplex operator*(const cuDoubleComplex &a1, const cuDoubleComplex &a2)
{
    cuDoubleComplex res;
    res.x = a1.x * a2.x - a1.y * a2.y; // real part
    res.y = a1.x * a2.y + a1.y * a2.x; // imaginary part
    return res;
}
int main()
{
cudaError_t cudaStatus;
cublasStatus_t cublasStatus;
cublasHandle_t handle;
cuDoubleComplex*AA=(cuDoubleComplex*)malloc(NN*sizeof(cuDoubleComplex));
cuDoubleComplex*BB=(cuDoubleComplex*)malloc(NN*sizeof(cuDoubleComplex));
cuDoubleComplex*CC=(cuDoubleComplex*)malloc(NN*sizeof(cuDoubleComplex));
cublasStatus = cublasCreate(&handle);
printf("NN=%d\n",NN);
for(int i=0;i<NN;i++)
{
AA[i]={1,2};
BB[i]={3,10};
CC[i].x=0;
CC[i].y=0;
}
#pragma acc enter data copyin(AA[0:NN],BB[0:NN],CC[0:NN])
timestruct t1, t2,t3;
long long t_12,t_23;
gettime(&t1);
#pragma acc kernels present(AA[0:NN],BB[0:NN],CC[0:NN])
{
for(int i=0;i<NN;i++)
CC[i]=cuCmul(AA[i],BB[i]);
}
gettime(&t2);
#pragma acc parallel present(AA[0:NN],BB[0:NN],CC[0:NN])
{
#pragma acc loop independent
for(int i=0;i<NN;i++)
CC[i]=AA[i]*BB[i];
}
gettime(&t3);
t_12=usec(t1,t2);
t_23=usec(t2,t3);
printf("t_12=%lld\n",t_12);
printf("t_23=%lld\n",t_23);
#pragma acc exit data copyout(CC[0:NN])
printf("CC=%f %f\n",CC[2].x,CC[2].y);
return 0;
}