I wrote a parallel guide for testing a.cpp file as follows:
#include <stdio.h>
#include<malloc.h>
#define N 256
int main()
{
float* a=(float*)malloc(N*sizeof(float));
float* b=(float*)malloc(N*sizeof(float));
float* c=(float*)malloc(N*sizeof(float));
for(int i=0;i<N;i++)
{
a[N]=0;
b[i]=c[i]=i;
}
#pragma acc enter data create(a[0:N])
#pragma acc enter data copyin(b[0:N],c[0:N])
#pragma acc kernels present(a[0:N],b[0:N],c[0:N])
#pragma acc loop independent
for(int i=0;i<N;i++)
{
a[i]=b[i]+c[i];
}
#pragma acc update host (a[:N])
printf("%f\n",a[N-1]);
return 0;
}
compile:nvc++ -acc -gpu=cc87 -Minfo -o text text.cpp
main compilation information is as follows:
main:
15, Generating enter data copyin(b[:256],c[:256])
Generating enter data create(a[:256])
Generating present(a[:256],c[:256],b[:256])
21, Loop is parallelizable
Generating NVIDIA GPU code
21, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
26, Generating update self(a[:256])
The results are fine: output 510.00000.
But I tried to write this.cpp file in the form of a class to compile and run, there is a problem, the following is my project file.
process.h
#include<malloc.h>
#include <stdio.h>
#define N 256
class Processing
{
public:
Processing();
~Processing();
void pocess();
public:
float* a;
float* b;
float* c;
};
process.cpp
#include "process.h"
Processing::Processing()
{
a=(float*)malloc(N*sizeof(float));
b=(float*)malloc(N*sizeof(float));
c=(float*)malloc(N*sizeof(float));
for(int i=0;i<N;i++)
{
a[i]=0;
b[i]=c[i]=i;
}
#pragma acc enter data create(a[0:N])
#pragma acc enter data copyin(b[0:N],c[0:N])
}
Processing::~Processing()
{
}
void Processing::pocess()
{
#pragma acc kernels present(a[0:N],b[0:N],c[0:N])
#pragma acc loop independent
for(int i=0;i<N;i++)
{
a[i]=b[i]+c[i];
}
#pragma acc update host (a[:N])
printf("a[256]2=%f\n",a[N-1]);
}
text.cpp
#include "process.h"
int main()
{
Processing* tt = new Processing();
tt->pocess();
return 0;
}
The compiled make file looks like this:
text:text.o process.o
nvc++ -acc -gpu=cc87 -Minfo -o text text.o process.o
text.o:text.cpp
nvc++ -acc -c text.cpp
process.o:process.cpp
nvc++ -acc -gpu=cc87 -Minfo -c process.cpp
clean:
rm -f process.o text.o
Compile complete display:
nvc++ -acc -c text.cpp
nvc++ -acc -gpu=cc87 -Minfo -c process.cpp
Processing::Processing():
19, Generating enter data copyin(c[:256],b[:256])
Generating enter data create(a[:256])
Processing::pocess():
25, Generating present(a[:256],c[:256],b[:256])
Generating implicit create(this[:]) [if not already present]
29, Loop is parallelizable
Generating NVIDIA GPU code
29, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
34, Generating update self(a[:256])
nvc++ -acc -gpu=cc87 -Minfo -o text text.o process.o
This problem occurs after running:
hostptr=0x1a6b3eb0,stride=1,size=1,extent=-1,eltsize=24,name=this[:],flags=0x20000200=present+implicit,async=-1,threadid=1
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 8.7, threadid=1
Hint: specify 0x800 bit in NV_ACC_DEBUG for verbose info.
host:0x1a6b3ed0 device:0x203c88000 size:1024 presentcount:1+1 line:19 name:a[:256]
host:0x1a6b42e0 device:0x203c88400 size:1024 presentcount:1+1 line:19 name:b[:256]
host:0x1a6b46f0 device:0x203c88800 size:1024 presentcount:1+1 line:19 name:c[:256]
allocated block device:0x203c88000 size:1024 thread:1
allocated block device:0x203c88400 size:1024 thread:1
allocated block device:0x203c88800 size:1024 thread:1
FATAL ERROR: data in PRESENT clause was not found on device 1: name=this[:] host:0x1a6b3eb0
file:/home/pwz/Desktop/text1/process.cpp _ZN10Processing6pocessEv line:25