I’m trying to write a simple example that links OpenACC C code with CUDA.
The C code looks like this:
#include <stdio.h>
#include <openacc.h>

/* CUDA wrapper defined in the .cu file below */
extern int cuda_kernel(int* openacc_ptr, size_t openacc_memsize);

int main () {
    int memsize = 10;
    int *p = acc_malloc(memsize*sizeof(int));
    int a[(int)memsize];
    int i;
    #pragma acc data copy(a[0:(int)memsize]), deviceptr(p)
    {
        #pragma acc kernels loop
        for (i=0;i<memsize;i++){
            p[i] = i;
            a[i] = p[i];
        }
        #pragma acc host_data use_device(p)
        {
            cuda_kernel(p,memsize);
        }
        printf("L\n");
        #pragma acc wait
        #pragma acc kernels loop
        for (i=0;i<memsize;i++){
            a[i] = p[i];
        }
    }
    acc_free(p);
    return(0);
}
And the CUDA code looks like this:
#include <stdio.h>

__global__ void cukernel(int* p, int sz){
    /* This is the actual CUDA kernel: one element per block */
    int tid = blockIdx.x;
    if (tid < sz)
        p[tid] = p[tid*2];  /* note: reads past the end of p once tid*2 >= sz */
}

extern "C" int cuda_kernel(int* openacc_ptr, size_t openacc_memsize){
    cudaThreadSynchronize();
    cukernel<<<openacc_memsize,1>>>(openacc_ptr,(int)openacc_memsize);
    cudaThreadSynchronize();
    printf("Inside cuda_kernel.\n");
    return(0);
}
I compile the CUDA file with
nvcc -c cuda.cu
and then compile and link with:
pgcc -acc -Mcuda cfile.c cuda.o
Running the binary I get:
L
call to cuModuleGetFunction returned error 700: Launch failed
Can anyone help me with this? Am I doing something wrong with the compile and link steps, or is it a runtime error? This is really just a preliminary step toward calling larger CUDA kernels from an OpenACC code.
“host_data” is meant to expose the device copy of a variable that has both a host and a device pointer. Since “p” is already a device pointer (it appears in a “deviceptr” clause), there is no need to use “host_data”. Though we should probably just ignore “p” in this case instead of giving this odd runtime error. I added TPR#19568 and sent it off to engineering.
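For comparison, here’s a rough sketch of how “host_data” would normally be used: the array “a” lives inside a “data” region, so it has both a host copy and a device copy, and “use_device(a)” substitutes the device address when it is passed to the CUDA wrapper. I’m reusing the cuda_kernel wrapper from above, so treat this as a sketch rather than a tested program:

#include <stdio.h>
#include <openacc.h>

/* CUDA wrapper from the .cu file; it expects a device address */
extern int cuda_kernel(int* openacc_ptr, size_t openacc_memsize);

int main () {
    const int memsize = 10;
    int a[memsize];
    int i;
    for (i=0;i<memsize;i++) a[i] = i;

    /* "a" gets both a host copy and a device copy here */
    #pragma acc data copy(a[0:memsize])
    {
        /* within this block, "a" refers to its device address,
           so the CUDA kernel receives a pointer it can dereference */
        #pragma acc host_data use_device(a)
        {
            cuda_kernel(a, memsize);
        }
    }
    printf("a[4]=%d\n", a[4]);
    return(0);
}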
Here’s the workaround version:
% cat test.c
#include <stdio.h>
#include <openacc.h>

int main () {
    const int memsize = 10;
    int *p = acc_malloc(memsize*sizeof(int));
    int a[(int)memsize];
    int i;
    #pragma acc data copy(a[0:(int)memsize]), deviceptr(p)
    {
        #pragma acc kernels loop
        for (i=0;i<memsize;i++){
            p[i] = i;
            a[i] = p[i];
        }
        // uncomment this to get it to fail
        //#pragma acc host_data use_device(p)
        {
            cuda_kernel(p,memsize);
        }
        printf("L\n");
        #pragma acc wait
        #pragma acc kernels loop
        for (i=0;i<memsize;i++){
            a[i] = p[i];
        }
    }
    printf("A[4]=%d\n",a[4]);
    acc_free(p);
    return(0);
}
% cat test_cu.cu
#include <stdio.h>

__global__ void cukernel(int* p, int sz){
    /* This is the actual CUDA kernel */
    int tid = blockIdx.x;
    if (tid < sz)
        p[tid] = p[tid*2];  /* note: reads past the end of p once tid*2 >= sz */
}

extern "C" int cuda_kernel(int* openacc_ptr, size_t openacc_memsize){
    int rc;
    printf("Enter cuda_kernel.\n");
    cudaThreadSynchronize();
    cukernel<<<openacc_memsize,1>>>(openacc_ptr,(int)openacc_memsize);
    rc = cudaGetLastError();
    cudaThreadSynchronize();
    printf("Inside cuda_kernel. %d\n", rc);
    return(0);
}
% nvcc -c test_cu.cu ; pgcc -acc -ta=nvidia,5.0 test.c test_cu.o -Mcuda -V13.7 -Minfo=accel ; a.out
test.c:
main:
11, Generating copy(a[0:memsize])
13, Generating present_or_copy(a[0:memsize])
Generating NVIDIA code
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
Generating compute capability 3.0 binary
14, Loop is parallelizable
Accelerator kernel generated
14, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
27, Generating present_or_copy(a[0:memsize])
Generating NVIDIA code
Generating compute capability 1.0 binary
Generating compute capability 2.0 binary
Generating compute capability 3.0 binary
28, Loop is parallelizable
Accelerator kernel generated
28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
Enter cuda_kernel.
Inside cuda_kernel. 0
L
A[4]=8
Engineering looked for a way to either flag this as an error or simply ignore “p” in this context, but unfortunately it doesn’t look possible, since there is no way for the compiler to track what kind of pointer “p” holds (device or host).
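As a quick illustration of why (this snippet is mine, not from the test case above): a pointer returned by acc_malloc and one returned by malloc have exactly the same C type, so by the time the compiler reaches “host_data use_device(p)” all it sees is an int* and it has no way to know which kind of address it holds:

#include <stdlib.h>
#include <openacc.h>

void example(void) {
    int *host_p = (int*)malloc(10*sizeof(int));      /* host address   */
    int *dev_p  = (int*)acc_malloc(10*sizeof(int));  /* device address */

    /* Both are plain "int *" as far as the C type system is concerned,
       so a "host_data use_device(dev_p)" directive can't be diagnosed
       as redundant (or wrong) at compile time. */

    acc_free(dev_p);
    free(host_p);
}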