I have a question about OpenACC parallel directives.

I wrote the following .cpp test file that uses OpenACC directives:

#include <stdio.h> 
#include<malloc.h>
 #define N 256

// Minimal OpenACC test: fills three N-element host arrays, computes
// a[i] = b[i] + c[i] on the device, copies a back and prints its last element.
int main()
{
    float* a = (float*)malloc(N * sizeof(float));
    float* b = (float*)malloc(N * sizeof(float));
    float* c = (float*)malloc(N * sizeof(float));

    for (int i = 0; i < N; i++)
    {
        // Index with i: the buffer holds N elements, so a[N] is one past the end.
        a[i] = 0;
        b[i] = c[i] = i;
    }

    // a is only allocated on the device (it is overwritten there);
    // b and c are inputs and are copied host -> device.
    #pragma acc enter data create(a[0:N])
    #pragma acc enter data copyin(b[0:N],c[0:N])

    #pragma acc kernels present(a[0:N],b[0:N],c[0:N])
    #pragma acc loop independent
    for (int i = 0; i < N; i++)
    {
        a[i] = b[i] + c[i];
    }

    // Refresh the host copy of a before reading it.
    #pragma acc update host (a[:N])
    printf("%f\n", a[N-1]);

    // Release the device copies first, then the host buffers.
    #pragma acc exit data delete(a[0:N],b[0:N],c[0:N])
    free(a);
    free(b);
    free(c);
    return 0;
}

compile:nvc++ -acc -gpu=cc87 -Minfo -o text text.cpp
main compilation information is as follows:

main:
     15, Generating enter data copyin(b[:256],c[:256])
         Generating enter data create(a[:256])
         Generating present(a[:256],c[:256],b[:256])
     21, Loop is parallelizable
         Generating NVIDIA GPU code
         21, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     26, Generating update self(a[:256])

The result is correct: the program prints 510.000000.
However, when I restructured this .cpp file as a class, it still compiles but fails at run time. My project files are shown below.
process.h

#include<malloc.h>
#include <stdio.h> 
#define N 256
// Holds three float buffers and the operation that works on them.
// The constructor, destructor and pocess() are defined out of line.
class Processing
{
public:
    Processing();
    ~Processing();

    // Runs the computation on the buffers below (name kept as spelled by callers).
    void pocess();

    // Raw buffer pointers, publicly accessible.
    float* a;
    float* b;
    float* c;
};

process.cpp

#include "process.h"


// Allocates the three N-element host buffers, initialises them
// (a = 0, b = c = index) and maps them into device memory.
Processing::Processing()
{
    const size_t bytes = N * sizeof(float);
    a = static_cast<float*>(malloc(bytes));
    b = static_cast<float*>(malloc(bytes));
    c = static_cast<float*>(malloc(bytes));

    for (int idx = 0; idx < N; ++idx)
    {
        a[idx] = 0;
        c[idx] = idx;
        b[idx] = c[idx];
    }

    // a: device allocation only; b and c: allocation plus host -> device copy.
    // NOTE(review): only the arrays appear in data clauses here; the object
    // itself (`this`) is not mapped by this constructor.
    #pragma acc enter data create(a[0:N])
    #pragma acc enter data copyin(b[0:N],c[0:N])

}
// Releases what the constructor acquired: the device copies of a, b and c
// first, then the malloc'd host buffers.
// NOTE: the class declares no copy operations, so the compiler-generated ones
// copy the raw pointers; instances must not be copied, or the buffers would be
// released twice.
Processing::~Processing()
{
    #pragma acc exit data delete(a[0:N],b[0:N],c[0:N])
    free(a);
    free(b);
    free(c);
}
// Computes a[i] = b[i] + c[i] for all N elements inside an OpenACC kernels
// region, refreshes the host copy of a and prints its last element.
void Processing::pocess()
{
    // The arrays are expected to be on the device already (present clause).
    #pragma acc kernels present(a[0:N],b[0:N],c[0:N])
    #pragma acc loop independent
    for (int idx = 0; idx < N; ++idx)
        a[idx] = b[idx] + c[idx];

    #pragma acc update host (a[:N])
    const float last = a[N-1];
    printf("a[256]2=%f\n", last);
}

text.cpp

#include "process.h"


int main() 
{ 
    Processing* tt = new Processing();
    tt->pocess();
    return 0; 
} 

The makefile used to build it looks like this:

text:text.o  process.o 
	nvc++ -acc -gpu=cc87 -Minfo -o text text.o process.o
text.o:text.cpp
	nvc++ -acc -c text.cpp 
process.o:process.cpp
	nvc++ -acc -gpu=cc87 -Minfo -c process.cpp

clean:
	rm -f process.o text.o

The compiler output is:

nvc++ -acc -c text.cpp 
nvc++ -acc -gpu=cc87 -Minfo -c process.cpp
Processing::Processing():
     19, Generating enter data copyin(c[:256],b[:256])
         Generating enter data create(a[:256])
Processing::pocess():
     25, Generating present(a[:256],c[:256],b[:256])
         Generating implicit create(this[:]) [if not already present]
     29, Loop is parallelizable
         Generating NVIDIA GPU code
         29, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     34, Generating update self(a[:256])
nvc++ -acc -gpu=cc87 -Minfo -o text text.o process.o

Running the program then fails with this error:

hostptr=0x1a6b3eb0,stride=1,size=1,extent=-1,eltsize=24,name=this[:],flags=0x20000200=present+implicit,async=-1,threadid=1
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 8.7, threadid=1
Hint: specify 0x800 bit in NV_ACC_DEBUG for verbose info.
host:0x1a6b3ed0 device:0x203c88000 size:1024 presentcount:1+1 line:19 name:a[:256]
host:0x1a6b42e0 device:0x203c88400 size:1024 presentcount:1+1 line:19 name:b[:256]
host:0x1a6b46f0 device:0x203c88800 size:1024 presentcount:1+1 line:19 name:c[:256]
allocated block device:0x203c88000 size:1024 thread:1
allocated block device:0x203c88400 size:1024 thread:1
allocated block device:0x203c88800 size:1024 thread:1
FATAL ERROR: data in PRESENT clause was not found on device 1: name=this[:] host:0x1a6b3eb0
 file:/home/pwz/Desktop/text1/process.cpp _ZN10Processing6pocessEv line:25

Classes have a “this” pointer that needs to get copied in as well, just before the data members:

    for(int i=0;i<N;i++)
 	{
  	a[i]=0;
   	b[i]=c[i]=i;
  	}
    #pragma acc enter data copyin(this)
    #pragma acc enter data create(a[0:N])
    #pragma acc enter data copyin(b[0:N],c[0:N])

Hello, I added the `this` pointer as you suggested and then added a CUDA function to compute an FFT, but I ran into a new problem.
This is the new header file, FFTuse.h, for computing the FFT:

#ifndef FFTUSE_H
#define FFTUSE_H

#include <cuda_runtime.h>
#include <cufft.h>
#include<iostream>

extern void ManyFFT_D2C(cufftHandle& plan, float* input, cuComplex* output);
extern void SetFFTPlan_D2C(cufftHandle& plan, int NN, int M);

#endif

FFTuse.cpp

#include "FFTuse.h"

// Executes the real-to-complex plan on `input`, writing to `output`, and
// prints "SUCCESS" or the numeric cuFFT status to stdout.
void ManyFFT_D2C(cufftHandle& plan, float* input, cuComplex* output) {
    cufftResult status;
    // host_data makes the call below use the device addresses of input/output.
    // This only takes effect when this file is compiled with OpenACC enabled;
    // otherwise the pragma is ignored and the host pointers are passed.
    #pragma acc host_data use_device(input,output)
    status = cufftExecR2C(plan, input, output);

    if (status != CUFFT_SUCCESS)
    {
        std::cout << status << std::endl;
        return;
    }
    std::cout << "SUCCESS" << std::endl;
}

// Creates a batched 1-D real-to-complex cuFFT plan in `plan`: M transforms of
// length NN, unit stride, consecutive batches NN elements apart on both input
// and output. Prints "SUCCESS" or the numeric cuFFT status to stdout.
void SetFFTPlan_D2C(cufftHandle& plan, int NN, int M)
{
    const int rank = 1;
    int n[1] = { NN };

    // Storage dimensions passed to cufftPlanMany (same values for in and out).
    int inembed[2] = { NN, M };
    int onembed[2] = { NN, M };

    const int istride = 1;
    const int ostride = 1;
    const int idist = NN;
    const int odist = NN;
    const int batch = M;

    const cufftResult status = cufftPlanMany(&plan, rank, n,
                                             inembed, istride, idist,
                                             onembed, ostride, odist,
                                             CUFFT_R2C, batch);
    if (status != CUFFT_SUCCESS)
    {
        std::cout << status << std::endl;
        return;
    }
    std::cout << "SUCCESS" << std::endl;
}

New member variables in process.h:

cufftHandle plan; 
cuComplex* d;
cuComplex* e;

Implementation in process.cpp

// Creates the FFT plan, allocates and initialises the host buffers, maps the
// object and its arrays to the device, then fills a on the device with
// a[i] = i and prints the last element as a check.
Processing::Processing(int L)
{
    // One transform of length N.
    SetFFTPlan_D2C(plan, N, 1);
    len = L;

    const size_t realBytes = N * sizeof(float);
    const size_t complexBytes = N * sizeof(cuComplex);
    a = static_cast<float*>(malloc(realBytes));
    b = static_cast<float*>(malloc(realBytes));
    c = static_cast<float*>(malloc(realBytes));
    d = static_cast<cuComplex*>(malloc(complexBytes));
    e = static_cast<cuComplex*>(malloc(complexBytes));

    for (int idx = 0; idx < N; ++idx)
    {
        a[idx] = 0;
        c[idx] = idx;
        b[idx] = c[idx];
    }

    // Map the object first, then its arrays. a and d are device allocations
    // only; b and c are copied in. e gets no data clause and stays host-only.
    #pragma acc enter data copyin(this)
    #pragma acc enter data create(a[0:N])
    #pragma acc enter data copyin(b[0:N],c[0:N])
    #pragma acc enter data create(d[0:N])

    #pragma acc parallel loop present(a[0:N])
    for (int idx = 0; idx < N; ++idx)
        a[idx] = idx;

    #pragma acc update host (a[:N])
    printf("a[256]2=%f\n", a[N-1]);
}
// Releases everything the constructor acquired: the device copies of the
// arrays, then the device copy of the object, the cuFFT plan, and finally the
// malloc'd host buffers.
// NOTE: instances must not be copied; a copy would share the raw pointers and
// the plan handle, and both destructors would release them.
Processing::~Processing()
{
    // Unmap the member arrays before the object that holds their pointers.
    #pragma acc exit data delete(a[0:N],b[0:N],c[0:N],d[0:N])
    #pragma acc exit data delete(this)

    cufftDestroy(plan);

    free(a);
    free(b);
    free(c);
    free(d);
    free(e);
}
// Computes aa[i] = bb[i] + cc[i] for all N elements inside an OpenACC kernels
// region, refreshes the host copy of aa and prints its last element.
void Processing::pocess(float*aa,float*bb,float*cc)
{
    // All three arrays are expected to be on the device already.
    #pragma acc kernels present(aa[0:N],bb[0:N],cc[0:N])
    #pragma acc loop independent
    for (int idx = 0; idx < N; ++idx)
        aa[idx] = bb[idx] + cc[idx];

    #pragma acc update host (aa[:N])
    const float last = aa[N-1];
    printf("a[256]2=%f\n", last);
}
void Processing::ccc()
{
    pocess(a,b,c);
    ManyFFT_D2C(plan, a,d);
    // #pragma acc update host(d[0:N])
    // for(int i=0;i<10;i++)
    // {
    //     printf("d[256].x=%f\n",d[i].x);
    // }
    cudaDeviceSynchronize();
    #pragma acc host_data use_device(d)
    cudaError_t ss=cudaMemcpy(e,d,N*sizeof(cuComplex),cudaMemcpyDeviceToHost);
    if(ss==cudaSuccess)
    {
        std::cout<<"SUCCESS"<<std::endl;
    }
    else
    {
        std::cout<<cudaGetErrorString(ss)<<std::endl;
    }

}

The makefile now looks like this:

text:text.o  process.o FFTuse.o
	nvc++ -acc -gpu=cc87 -fast -cuda -cudalib -std=c++17 -Minfo=accel -o text text.o process.o FFTuse.o
text.o:text.cpp
	nvc++ -acc -cuda -cudalib -c text.cpp 
process.o:process.cpp
	nvc++  -acc -gpu=cc87 -fast -cuda -cudalib -std=c++17   -Minfo=accel -c process.cpp
FFTuse.o:FFTuse.cpp
	nvc++ -c -cuda -cudalib  -Minfo=accel FFTuse.cpp
clean:
	rm -f process.o text.o

Running the program produces the following problems:

  1. Running Processing::ccc() exactly as written above produces this output:
cufftPlanMany   SUCCESS
a[256]2=255.000000
a[256]2=510.000000
cufftExecR2C    SUCCESS
an illegal memory access was encountered
  2. If I comment out cudaDeviceSynchronize(); in Processing::ccc(), this error appears instead:
cufftPlanMany   SUCCESS
a[256]2=255.000000
a[256]2=510.000000
cufftExecR2C    SUCCESS
invalid argument

The return value of the FFT call looks fine in both runs. The only change in the second run is that cudaDeviceSynchronize() is commented out, yet cudaMemcpy returns a different error each time.
3. If I instead swap which part of Processing::ccc() is commented out, like this:

pocess(a,b,c);
    ManyFFT_D2C(plan, a,d);
    #pragma acc update host(d[0:N])
    for(int i=0;i<10;i++)
    {
        printf("d[256].x=%f\n",d[i].x);
    }
    // cudaDeviceSynchronize();
    // #pragma acc host_data use_device(d)
    // cudaError_t ss=cudaMemcpy(e,d,N*sizeof(cuComplex),cudaMemcpyDeviceToHost);
    // if(ss==cudaSuccess)
    // {
    //     std::cout<<"SUCCESS"<<std::endl;
    // }
    // else
    // {
    //     std::cout<<cudaGetErrorString(ss)<<std::endl;
    // }

then I get this error:

cufftPlanMany   SUCCESS
a[256]2=255.000000
a[256]2=510.000000
cufftExecR2C    SUCCESS
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

I have copied every variable the GPU needs onto the device, so I don’t understand why updating the host copy fails.

Can you post “text.cpp” so I can try to recreate the issue?

The output seems to suggest that it’s getting past the FFTs so I presume the issue is someplace else.

You may want to try running the code under “compute-sanitizer” to see where the errors are coming from.

Judging by the return values, the FFT itself is not the problem; the error occurs in the code that runs after it, and I get three different kinds of errors. I suspect the problem is not in the program itself but in the makefile, or in environment variables missing from my .bashrc. I am using an Orin, with the HPC environment installed according to its setup instructions. Here is text.cpp:

#include "process.h"
int main() 
{ 
    Processing* tt = new Processing(256);
    tt->ccc();
    

    return 0; 

If I change the compile options in the makefile as follows:

text:text.o  process.o FFTuse.o
	nvc++ -acc -gpu=managed,cc87 -fast -cuda -cudalib=cufft,cublas  -std=c++17  -lcudart -Minfo=accel -o text text.o process.o FFTuse.o
text.o:text.cpp
	nvc++ -acc -gpu=managed,cc87 -cuda -cudalib=cufft,cublas -lcudart  -c text.cpp 
process.o:process.cpp
	nvc++  -acc -gpu=managed,cc87 -fast -cuda -cudalib=cufft,cublas -std=c++17 -lcudart  -Minfo=accel -c process.cpp
FFTuse.o:FFTuse.cpp
	nvc++ -acc -cuda -gpu=managed,cc87 -cudalib=cufft,cublas -lcudart  -Minfo=accel -c FFTuse.cpp

clean:
	rm -f process.o text.o

With these options the first two problems are gone, but the third one remains: when execution reaches #pragma acc update host(d[0:N]) there is still an error, although a different one:

Duan error (core has been dumped)

I think I found the problem.

In your makefile, OpenACC is not enabled for FFTuse.cpp because the “-acc” flag is missing. As a result the “host_data” directive is ignored and you’re passing host pointers to cufftExecR2C. The FFT therefore works on bad addresses, but the device error isn’t reported until the next CUDA call (in this case the cudaMemcpy).

Duan error (core has been dumped)

I’m not seeing this error and am not sure what a “Duan error” is, so I’m assuming it’s a seg fault. However, the “update” directive is effectively ignored when managed memory is used, so I doubt it’s coming from there. Seg faults occur on the host side, so you might run the program through a debugger, like gdb, to see where the error is coming from.