How to let Python invoke a CUDA dynamic link library (.so file)

13911815368 · September 22, 2020, 6:10am

I am trying to let Python invoke a CUDA dynamic link library (.so file) on a Xavier NX, but Python always displays "undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2" in VS Code.
The nvcc and g++ commands below complete without any error message:
1>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dc cuda_mut03.cu
2>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dlink cuda_mut03.o -o test.o
3>g++ -shared -o cuda_mut03.so test.o -L/usr/local/cuda-10.2/lib64 -lcudart -lcudadevrt
Python reports the error when it loads cuda_mut03.so, and the message is the same as the one printed by the Linux command "ldd -r cuda_mut03.so":
libgtk3-nocsd.so.0 => /usr/lib/aarch64-linux-gnu/libgtk3-nocsd.so.0 (0x0000007f9ebc5000)
libcudart.so.10.2 => /usr/local/cuda-10.2/lib64/libcudart.so.10.2 (0x0000007f9eb51000)
libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000007f9e9f7000)
libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000007f9e9e2000)
libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000007f9e9b6000)
/lib/ld-linux-aarch64.so.1 (0x0000007f9ec23000)
libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000007f9e822000)
librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000007f9e80b000)
libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000007f9e752000)
libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000007f9e72e000)
undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2 (./cuda_mut03.so)

the code of cuda_mut03.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "link_cu.h"
#define W 5
#define H 5
#define TB 2
#define KERNEL_SIZE 2

/**
 * "Valid" (no padding, no kernel flip) 2D sliding-window sum of products of a
 * square input with a square kernel.
 *
 * Each thread produces at most one output element: its column comes from the
 * x launch dimensions and its row from the y launch dimensions. Threads whose
 * row or column is >= inputSize - kernelSize + 1 return without writing, so
 * the grid may be larger than the output.
 *
 * @param output     Result, row-major, (inputSize - kernelSize + 1) elements
 *                   per row and the same number of rows.
 * @param input      Source data, row-major, indexed with a row stride of
 *                   inputSize.
 * @param kernel     Filter weights, row-major, indexed with a row stride of
 *                   kernelSize.
 * @param inputSize  Side length of the input.
 * @param kernelSize Side length of the kernel.
 *
 * Launch with a 2D grid of 2D blocks that covers every output element.
 * All three pointers are dereferenced inside the kernel.
 */
__global__ void Conv2DKernel(float *output, float *input, float *kernel, int inputSize, int kernelSize)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Number of valid window positions along each axis.
    const int limit = inputSize - kernelSize + 1;
    if (col >= limit || row >= limit)
        return;

    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        const int curRow = row + i;
        for (int j = 0; j < kernelSize; ++j)
        {
            const int curCol = col + j;
            sum += (kernel[i * kernelSize + j] * input[curRow * inputSize + curCol]);
        }
    }
    output[row * limit + col] = sum;
}

/**
 * Print a row-major float matrix to stdout, one matrix row per line.
 *
 * Every element is converted to int (truncating the fractional part) and
 * printed followed by a comma; a blank line is printed after the matrix.
 *
 * @param arr Host pointer to at least w * h floats, stored with a row
 *            stride of w.
 * @param w   Number of elements per row.
 * @param h   Number of rows.
 */
void display(float *arr, int w, int h)
{
    // The row stride is w, so the outer (row) loop must run over h and the
    // inner (column) loop over w; otherwise non-square matrices are misread.
    for (int row = 0; row < h; ++row)
    {
        for (int col = 0; col < w; ++col)
        {
            printf("%d,", int(arr[row * w + col]));
        }
        printf("\n");
    }
    printf("\n");
}

/**
 * Draw a pseudo-random integer from rand() and map it into the closed range
 * [start, end] (for start <= end).
 *
 * @param start Lowest value that can be returned.
 * @param end   Highest value that can be returned.
 * @return start plus rand() reduced modulo the number of values in the range.
 */
int rand_num(int start, int end)
{
    const int span = end + 1 - start;
    const int offset = rand() % span;
    return offset + start;
}

/**
 * Terminate the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param status Return value of the CUDA runtime call.
 * @param what   Short description of the call, used in the error message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Demo driver: fills a W x H image and a KERNEL_SIZE x KERNEL_SIZE filter with
 * random integers, runs Conv2DKernel on the device and prints the input, the
 * filter and the result.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE if a host
 *         allocation or any CUDA call fails.
 */
int main()
{
    const int imgSize = W * H;
    const int convOutW = W - KERNEL_SIZE + 1;
    const int convOutSize = convOutW * convOutW;

    const size_t imgBytes = imgSize * sizeof(float);
    const size_t kernelBytes = KERNEL_SIZE * KERNEL_SIZE * sizeof(float);
    const size_t outBytes = convOutSize * sizeof(float);

    float *h_A = (float *)malloc(imgBytes);
    float *h_Kernel = (float *)malloc(kernelBytes);
    float *h_C = (float *)malloc(outBytes);
    if (h_A == NULL || h_Kernel == NULL || h_C == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(h_Kernel);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < imgSize; ++i)
    {
        h_A[i] = rand_num(0, 10);
    }

    for (int j = 0; j < KERNEL_SIZE * KERNEL_SIZE; ++j)
    {
        h_Kernel[j] = rand_num(0, 2);
    }

    display(h_A, W, H);
    display(h_Kernel, KERNEL_SIZE, KERNEL_SIZE);

    float *d_A = NULL;
    float *d_Kernel = NULL;
    float *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, imgBytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_Kernel, kernelBytes), "cudaMalloc(d_Kernel)");
    checkCuda(cudaMalloc(&d_C, outBytes), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, imgBytes, cudaMemcpyHostToDevice), "copy of image to device");
    checkCuda(cudaMemcpy(d_Kernel, h_Kernel, kernelBytes, cudaMemcpyHostToDevice), "copy of kernel to device");

    // One thread per candidate output element; the kernel discards threads
    // that fall outside the convOutW x convOutW result.
    dim3 dimBlock(TB, TB);
    const int tb = (W + TB - 1) / TB;
    dim3 dimGrid(tb, tb);

    Conv2DKernel<<<dimGrid, dimBlock>>>(d_C, d_A, d_Kernel, W, KERNEL_SIZE);
    // A launch does not return a status; invalid configurations only show up here.
    checkCuda(cudaGetLastError(), "Conv2DKernel launch");

    // The blocking copy also surfaces errors raised while the kernel was running.
    checkCuda(cudaMemcpy(h_C, d_C, outBytes, cudaMemcpyDeviceToHost), "copy of result to host");

    display(h_C, convOutW, convOutW);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_Kernel), "cudaFree(d_Kernel)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_Kernel);
    free(h_C);

    return 0;
}

the code of link_cu.h

// Declarations of the host helpers defined in cuda_mut03.cu.
#ifndef LINK_CU_H
#define LINK_CU_H

#include "cuda_runtime.h"

// Prints a row-major float matrix to stdout as comma-separated integers.
extern void display(float *arr, int w, int h);
// Returns start plus rand() reduced modulo (end + 1 - start).
extern int rand_num(int start, int end);

#endif // LINK_CU_H

I've been working on this for 3 days. Does anyone know how to fix it?

AastaLLL · September 22, 2020, 7:40am

Hi,

Here is an example for your reference:

github.com

AastaNV/TRT_object_detection/blob/master/main.py#L18


      
          import pycuda.driver as cuda
          
          
import coco
          import uff
          import tensorrt as trt
          import graphsurgeon as gs
          #from config import model_ssd_inception_v2_coco_2017_11_17 as model
          #from config import model_ssd_mobilenet_v1_coco_2018_01_28 as model
          from config import model_ssd_mobilenet_v2_coco_2018_03_29 as model
          
          
# Load the shared library at lib/libflattenconcat.so into this process. The
# returned handle is discarded, so the call is made only for the effects of
# loading the library.
ctypes.CDLL("lib/libflattenconcat.so")
COCO_LABELS = coco.COCO_CLASSES_LIST

# initialize
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
# NOTE(review): presumably registers TensorRT's plugin creators under the
# default ('') namespace -- confirm against the TensorRT Python API docs.
trt.init_libnvinfer_plugins(TRT_LOGGER, '')
runtime = trt.Runtime(TRT_LOGGER)
          
          

          
# compile model into TensorRT

Thanks.

13911815368 · September 22, 2020, 7:58am

Hi AastaLLL:

My problem is not with the Python code. My Python code calls ctypes.CDLL("../cuda_mut03.so"), and Python always displays the error message "undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2 (./cuda_mut03.so)".

If I use the Linux command "ldd -r cuda_mut03.so", it displays the same error message.
So there must be some problem with the dynamic link library itself, but I don't know how to fix this issue.

Thanks and Regard.

AastaLLL · October 13, 2020, 8:36am

Hi,

Sorry for the late update.

Have you created some intermediate format for function mapping first?
For example, you can use pybind11 or swig.
https://pybind11.readthedocs.io/en/stable/basics.html

To use CUDA, you will need to bind the CPU based API.
The workflow will look like this: python -> pybinding -> c++ -> CUDA

Thanks.