OpenACC C/CUDA error

Hello,

I’m trying to do a simple example code where I link OpenACC C and CUDA.

The C code looks like this:

  int memsize = 10;
  int *p = acc_malloc(memsize*sizeof(int));
  int a[(int)memsize];
  int i;

#pragma acc data copy(a[0:(int)memsize]), deviceptr(p)
  {
    #pragma acc kernels loop
    for (i=0;i<memsize;i++){
      p[i] = i;
      a[i] = p[i];
    }

#pragma acc host_data use_device(p)
    {
      cuda_kernel(p,memsize);
    }

    printf("L\n");

#pragma acc wait
#pragma acc kernels loop
    for (i=0;i<memsize;i++){
       a[i] = p[i];
    }


  }

  acc_free(p);
  return(0);

And the cuda like this:

__global__ void cukernel(int* p, int sz){
  /* This is the actual CUDA kernel */
  int tid = blockIdx.x;
  if (tid < sz)
    p[tid] = p[tid*2];
  
}

extern "C" int cuda_kernel(int* openacc_ptr, size_t openacc_memsize){

  cudaThreadSynchronize();
  cukernel<<<openacc_memsize,1>>>(openacc_ptr,(int)openacc_memsize);
  cudaThreadSynchronize();

  printf("Inside cuda_kernel.\n");
  return(0);
}

I compile the cuda with

 nvcc -c cuda.cu

and then compile and link with:

pgcc -acc -Mcuda cfile.c cuda.o

Running the binary I get:

L
call to cuModuleGetFunction returned error 700: Launch failed

Can anyone help me with this? Am I doing something wrong with the compile + link or is it a runtime error? It’s really a precursory step using larger CUDA kernels within an OpenACC code.

-Nick.

Hi Nick,

“host_data” is meant to use the device pointer from a variable that has both a host and device pointer. “p” is already a deviceptr so no need to use “host_data”. Though, we should problably just ignore “p” in this case instead of getting this odd run time error. I added TPR#19568 and sent it off to engineering.

Here’s the work around version:

% cat test.c
 #include <stdio.h>
 #include <openacc.h>
 
int main () {
 
const int memsize = 10;
 int *p = acc_malloc(memsize*sizeof(int));
 int a[(int)memsize];
 int i;
 
#pragma acc data copy(a[0:(int)memsize]), deviceptr(p)
 {
 #pragma acc kernels loop
 for (i=0;i<memsize;i++){
 p[i] = i;
 a[i] = p[i];
 }
 
// uncomment this to get it to fail
//#pragma acc host_data use_device(p)
 {
 cuda_kernel(p,memsize);
 }
 
printf("L\n");
 
#pragma acc wait
 #pragma acc kernels loop
 for (i=0;i<memsize;i++){
 a[i] = p[i];
 }
 
}
 printf("A[4]=%d\n",a[4]);
 acc_free(p);
 return(0);
 }
 % cat test_cu.cu
 #include <stdio.h>
 
__global__ void cukernel(int* p, int sz){
 /* This is the actual CUDA kernel */
 int tid = blockIdx.x;
 if (tid < sz)
 p[tid] = p[tid*2];
 
}
 
extern "C" int cuda_kernel(int* openacc_ptr, size_t openacc_memsize){
 int rc;
 printf("Enter cuda_kernel.\n");
 cudaThreadSynchronize();
 cukernel<<<openacc_memsize,1>>>(openacc_ptr,(int)openacc_memsize);
 rc = cudaGetLastError();
 cudaThreadSynchronize();
 
printf("Inside cuda_kernel. %d\n", rc);
 return(0);
 }
 % nvcc -c test_cu.cu ; pgcc -acc -ta=nvidia,5.0 test.c test_cu.o -Mcuda -V13.7 -Minfo=accel ; a.out
 test.c:
 main:
 11, Generating copy(a[0:memsize])
 13, Generating present_or_copy(a[0:memsize])
 Generating NVIDIA code
 Generating compute capability 1.0 binary
 Generating compute capability 2.0 binary
 Generating compute capability 3.0 binary
 14, Loop is parallelizable
 Accelerator kernel generated
 14, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
 27, Generating present_or_copy(a[0:memsize])
 Generating NVIDIA code
 Generating compute capability 1.0 binary
 Generating compute capability 2.0 binary
 Generating compute capability 3.0 binary
 28, Loop is parallelizable
 Accelerator kernel generated
 28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
 Enter cuda_kernel.
 Inside cuda_kernel. 0
 L
 A[4]=8
  • Mat

Hi Nick,

Engineering tried to find a way to either flag this as an error or just ignore “p” in this context, but unfortunately it doesn’t look possible since there isn’t a way to track what type of pointer p is (device or host).

I’ve closed TPR#19568.

  • Mat