I used OpenACC to accelerate my program, but I found that copying data from the GPU back to the CPU takes a lot of time. I tried to solve this problem but failed.
First, I tried both `#pragma acc update host` and `cudaMemcpy` for the device-to-host copy, and compared them in a small test program:
nvc++ -o copy -acc -gpu=cc87 -cuda -cudalib copy.c
CUDA to CPU t_12=119156 microsecond
Openacc to CPU t_23=181720 microsecond
Accelerator Kernel Timing data
/home/orin/Downloads/copy.c
main NVIDIA devicenum=0
time(us): 210,041
42: data region reached 1 time
42: data copyin transfers: 10
device time(us): total=28,704 max=3,435 min=1,588 avg=2,870
53: update directive reached 1 time
53: data copyout transfers: 10
device time(us): total=181,337 max=24,605 min=13,725 avg=18,133
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "cublas_v2.h"
/*
 * Portable wall-clock timing helpers.
 *   timestruct  - platform type that holds one time sample
 *   gettime(a)  - store the current time into *a
 *   usec(t1,t2) - elapsed time from t1 to t2, in microseconds
 */
#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
/* (seconds*1000 + milliseconds) is a millisecond count; scale by 1000
 * (not 100) so the result really is in microseconds. */
#define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*1000)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
#define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif
int main()
{
int M_Array=200*200,Fs=2000;
float* a = (float*)malloc(M_Array * Fs * sizeof(float));
float* b = (float*)malloc(M_Array * Fs * sizeof(float));
float* c = (float*)malloc(M_Array * Fs * sizeof(float));
long long t_12,t_23;
timestruct t1, t2,t3;
for(int i=0;i<M_Array * Fs;i++)
a[i]=0.1*i;
cublasHandle_t handle;
cublasCreate(&handle);
#pragma acc enter data copyin(a[0:M_Array*Fs]) create(b[0:M_Array*Fs])
#pragma acc host_data use_device(a,b)
cublasScopy(handle,M_Array*Fs,a,1,b,1);
gettime(&t1);
#pragma acc host_data use_device(a)
cudaMemcpy(c,a,M_Array* Fs*sizeof(float) ,cudaMemcpyDeviceToHost);
gettime(&t2);
#pragma acc update host(b[0:M_Array*Fs])
gettime(&t3);
t_12=usec(t1,t2);
t_23=usec(t2,t3);
printf("CUDA to CPU t_12=%lld microsecond\n",t_12);
printf("Openacc to CPU t_23=%lld microsecond\n",t_23);
return 0;
}
There was a noticeable improvement in the test program, but after I replaced `update` with `cudaMemcpy` in the large program, its run time did not change: it still takes 0.5 seconds to copy the same amount of data to the CPU.
I then tried page-locked (pinned) host memory. It performed well in the test program, but it still does not help in the main program:
nvc++ -o test -acc -gpu=cc87 -cuda -cudalib -Minfo=accel test.c
update to CPU t_12=92836 microsecond cc[1]=1.000000
cudaMemcpy to CPU t_23=11097 microsecond bb[1]=1.000000
Accelerator Kernel Timing data
main NVIDIA devicenum=0
time(us): 103,862
59: compute region reached 1 time
64: kernel launched 1 time
grid: [65535] block: [128]
elapsed time(us): total=7,632 max=7,632 min=7,632 avg=7,632
59: data region reached 3 times
59: data copyin transfers: 10
device time(us): total=11,345 max=1,211 min=708 avg=1,134
69: update directive reached 1 time
69: data copyout transfers: 10
device time(us): total=92,517 max=16,835 min=7,517 avg=9,251
#include <iostream>
#include <condition_variable>
#include <cstring>
#include <cstdlib>
#include<openacc.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include "device_launch_parameters.h"
#include "cublas_v2.h"
// Portable wall-clock timing helpers.
//   timestruct  - platform type that holds one time sample
//   gettime(a)  - store the current time into *a
//   usec(t1,t2) - elapsed time from t1 to t2, in microseconds
#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
// (seconds*1000 + milliseconds) is a millisecond count; scale by 1000
// (not 100) so the result really is in microseconds.
#define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*1000)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
#define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif
int main()
{
int NN=200*2000*200;
float *aa;
cudaHostAlloc((void **)&aa, NN * sizeof(*aa), cudaHostAllocDefault);
float*bb=(float*)malloc(NN*sizeof(float));
float*cc=(float*)malloc(NN*sizeof(float));
for(int i=0;i<NN;i++)
bb[i]=i+1/2;
long long t_12,t_23;
timestruct t1, t2,t3;
#pragma acc enter data copyin(bb[0:NN]) create(cc[0:NN])
#pragma acc kernels
#pragma acc loop independent
for(int i=0;i<NN;i++)
cc[i]=bb[i];
gettime(&t1);
#pragma acc update host(cc[0:NN])
gettime(&t2);
#pragma acc host_data use_device(bb)
cudaMemcpy(aa,bb,NN*sizeof(float) ,cudaMemcpyDeviceToHost);
gettime(&t3);
t_12=usec(t1,t2);
t_23=usec(t2,t3);
printf("update to CPU t_12=%lld microsecond cc[1]=%f\n",t_12,cc[1]);
printf("cudaMemcpy to CPU t_23=%lld microsecond bb[1]=%f\n",t_23,bb[1]);
return 0;
}
I think the main program may be using too much memory. When it starts running, it takes up about 10 GB of memory. I allocated many variables on the GPU through OpenACC, but in the end only a single float array of size 200 * 200 * 2000 is copied back to the CPU. I don't think an array of this size should take so long to transfer.