Hi all,
I try to use a (self written) CUDA function in a program with OpenACC.
Unfortunately, none of the online help I found has been useful :-(
I wrote the following functions (a hello world test):
saxpy.cu:
#include <stdio.h>
extern const int N;
extern const int blocksize;
// Kernel: element-wise add b into a (a[i] += b[i]).
// No bounds check — launch with exactly one thread per element
// (e.g. hello<<<1, N>>>(d_a, d_b)).
// NOTE: the pasted post showed "global"; the correct qualifier is __global__
// (markdown ate the underscores).
__global__
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}
// Kernel: result[i] *= factor_1[i] * factor_2[i] for every i < size_vec.
// (Despite the name, this is an element-wise multiply, not a true saxpy.)
// Launch 1-D: <<<ceil(size_vec/256), 256>>>; the tail guard below handles
// the partial last block.
// Fixes vs. the pasted post: __global__ underscores restored, result must be
// a pointer (it is indexed), and the missing '*' in the index computation.
__global__
void saxpy(const int size_vec, double *factor_1, double *factor_2, double *result)
{
int ii = blockIdx.x * blockDim.x + threadIdx.x;
if (ii < size_vec) result[ii] *= factor_1[ii] * factor_2[ii];
}
// Host wrapper: copy the two factor arrays and the result array to the
// device, run the element-wise multiply kernel, and copy the result back.
//   size_vec                number of elements the kernel should process
//   _factor_*_size / _result_size   element counts of the host arrays
//   _*_data                 host pointers to the array data (read; result is overwritten)
void test(int size_vec, int _factor_1_size, int _factor_2_size, int _result_size, double *_factor_1_data, double *_factor_2_data, double *_result_data)
{
double *d_factor_1_data;
double *d_factor_2_data;
double *d_result_data;
// Byte sizes for the device allocations/copies (the pasted post lost the '*').
const int factor_1_size = _factor_1_size * sizeof(double);
const int factor_2_size = _factor_2_size * sizeof(double);
const int result_size   = _result_size   * sizeof(double);
cudaMalloc( (void**)&d_factor_1_data, factor_1_size);
cudaMalloc( (void**)&d_factor_2_data, factor_2_size);
cudaMalloc( (void**)&d_result_data, result_size);
cudaMemcpy( d_factor_1_data, _factor_1_data, factor_1_size, cudaMemcpyHostToDevice );
cudaMemcpy( d_factor_2_data, _factor_2_data, factor_2_size, cudaMemcpyHostToDevice );
// BUG FIX: copy result_size BYTES, not _result_size ELEMENTS — the original
// transferred only 1/8 of the array.
cudaMemcpy( d_result_data, _result_data, result_size, cudaMemcpyHostToDevice );
// (Removed the device copy of size_vec: the kernel takes it by value, and the
// allocation was never freed — a leak.)
saxpy<<<(size_vec+255)/256, 256>>>(size_vec, d_factor_1_data, d_factor_2_data, d_result_data);
// Kernel launches return no status directly; check for launch-config errors.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "saxpy launch failed: %s\n", cudaGetErrorString(err));
// Blocking copy — also synchronizes, so the result is ready on return.
cudaMemcpy( _result_data, d_result_data, result_size, cudaMemcpyDeviceToHost );
cudaFree(d_factor_1_data);
cudaFree(d_factor_2_data);
cudaFree(d_result_data);
}
and:
main.cpp:
#include <stdio.h>
#include <iostream>
// NOTE(review): several #include directives lost their targets when the post
// was pasted (they appeared as bare "#include" lines) — restore any further
// project headers as needed. <iostream> is required for cout/endl below.
#include "/home/rosenbs/src/carp-dcse-pt/branches/mechanics/PT_C/toolbox/vector/toolbox_vector.h"
// Forward declaration of the CUDA host wrapper defined in saxpy.cu.
void test(int size_vec, int _factor_1_size, int _factor_2_size, int _result_size, double *_factor_1_data, double *_factor_2_data, double *_result_data);
// Definitions for the "extern const int" declarations in saxpy.cu
// (intended for the hello kernel's launch size; unused by saxpy/test).
const int N = 16;
const int blocksize = 16;
using namespace std;
// Driver: build two factor vectors and a sentinel-filled result vector,
// run the CUDA element-wise multiply via test(), and print element 10
// before and after so we can see whether the kernel actually ran.
int main(){
const int size_vec = 10000;
toolbox_vector result(size_vec);
toolbox_vector factor_1(size_vec);
toolbox_vector factor_2(size_vec);
for (int idx = 0; idx < size_vec; ++idx) {
    factor_1[idx] = idx + 1;          // 1, 2, 3, ...
    factor_2[idx] = 2.0 / (idx + 1);  // so factor_1[i]*factor_2[i] == 2
    result[idx]   = -123.0;           // sentinel value
}
double *_factor_1_data = factor_1.data();
double *_factor_2_data = factor_2.data();
double *_result_data   = result.data();
const int _factor_1_size = factor_1.size();
const int _factor_2_size = factor_2.size();
const int _result_size   = result.size();
cout << "Test A: " << _result_data[10] << endl;
test(size_vec, _factor_1_size, _factor_2_size, _result_size, _factor_1_data, _factor_2_data, _result_data);
cout << "Test B: " << _result_data[10] << endl;
return 0;
}
Remark: A toolbox_vector is (more or less) a self written std-vector.
If I try to compile and link the program, I use the following commands:
→ nvcc -O3 --compile -x c++ -o main.o main.cpp
→ nvcc -c saxpy.cu
→ nvcc --cudart static -gencode arch=compute_52,code=compute_52 -gencode arch=compute_52,code=sm_52 -link -o “test” main.o saxpy.o
works perfect (but only with CUDA).
If I try the following:
pgc++ -O3 -c -x c++ -o main.o main.cpp
nvcc -c saxpy.cu
(That works, with a warning).
But If I link I get for nvcc
→ nvcc --cudart static -gencode arch=compute_52,code=compute_52 -gencode arch=compute_52,code=sm_52 -link -o “test” main.o saxpy.o
the Error “undefined reference to `__c_mcopy8’”
And for pgc++
→ pgc++ arch=compute_52,code=compute_52 arch=compute_52,code=sm_52 -o “test” main.o saxpy.o+:
the Error: undefined reference to `cudaMalloc'

OK, that one is obvious: a missing library. But I can't find which library I have to add (or which library path) so that pgc++ can link the CUDA functions.

I also tried:
→ pgc++ -O3 -fPIC -c -x c++ -o main.o main.cpp
→ nvcc -c saxpy.cu
→ pgc++ -fPIC -ta=nvidia:cuda8.0 -shared -o "test" main.o saxpy.o
which leads to the Error: /usr/bin/ld: saxpy.o: relocation R_X86_64_32 against
.bss’ can not be used when making a shared object; recompile with -fPIC
Can anyone tell me, how I have to link my program to execute a Cuda function with PGI on the device?
Unfortunately, the link https://devblogs.nvidia.com/parallelforall/3-versatile-openacc-interoperability-techniques/ is not useful for this.
I would be really grateful :-/
Best,
Stefan
PS: I tried to use -Mcudax86, but this is not what I want. With this option PGI execute the Cuda function on the host, but I want it to be on the device.