How to let Python load a CUDA shared library (.so file)

I am trying to load a CUDA shared library (.so file) from Python on a Xavier NX, but Python always displays "undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2" in VS Code.
The nvcc and g++ commands below complete without any error message:
1>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dc cuda_mut03.cu
2>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dlink cuda_mut03.o -o test.o
3>g++ -shared -o cuda_mut03.so test.o -L/usr/local/cuda-10.2/lib64 -lcudart -lcudadevrt
Python reports an error when it loads cuda_mut03.so, and the error message is the same as the output of the Linux command "ldd -r cuda_mut03.so":
libgtk3-nocsd.so.0 => /usr/lib/aarch64-linux-gnu/libgtk3-nocsd.so.0 (0x0000007f9ebc5000)
libcudart.so.10.2 => /usr/local/cuda-10.2/lib64/libcudart.so.10.2 (0x0000007f9eb51000)
libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000007f9e9f7000)
libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000007f9e9e2000)
libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000007f9e9b6000)
/lib/ld-linux-aarch64.so.1 (0x0000007f9ec23000)
libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000007f9e822000)
librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000007f9e80b000)
libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000007f9e752000)
libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000007f9e72e000)
undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2 (./cuda_mut03.so)

the code of cuda_mut03.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "link_cu.h"
#define W 5
#define H 5
#define TB 2
#define KERNEL_SIZE 2

/*
 * Conv2DKernel: "valid" (no padding) 2D sliding-window sum of products over a
 * square, row-major image. The filter is applied as stored (it is not flipped).
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output element. Threads
 * whose (col, row) falls outside the output return immediately, so the grid
 * may be larger than the output. No shared memory is used.
 *
 * output     - device buffer of limit * limit floats, row-major,
 *              where limit = inputSize - kernelSize + 1
 * input      - device buffer of inputSize * inputSize floats, row-major
 * kernel     - device buffer of kernelSize * kernelSize floats, row-major
 * inputSize  - side length of the square input image
 * kernelSize - side length of the square filter
 */
__global__ void Conv2DKernel(float *output, const float *input, const float *kernel, int inputSize, int kernelSize)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Side length of the output; also the guard for threads past the edge.
    const int limit = inputSize - kernelSize + 1;
    if (col >= limit || row >= limit)
        return;

    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        const int curRow = row + i;
        for (int j = 0; j < kernelSize; ++j)
        {
            const int curCol = col + j;
            sum += kernel[i * kernelSize + j] * input[curRow * inputSize + curCol];
        }
    }
    output[row * limit + col] = sum;
}

/*
 * Prints a row-major matrix of floats to stdout, one row per line, with each
 * value truncated to int and followed by a comma. A blank line is printed
 * after the matrix.
 *
 * arr - host pointer to w * h floats, row-major with a row stride of w
 * w   - number of columns (width)
 * h   - number of rows (height)
 */
void display(float *arr, int w, int h)
{
    // Iterate rows over h and columns over w so the loop bounds agree with
    // the row stride (w) used in the index; for square matrices the output
    // is unchanged.
    for (int row = 0; row < h; ++row)
    {
        for (int col = 0; col < w; ++col)
        {
            printf("%d,", int(arr[row * w + col]));
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Returns a pseudo-random integer in the closed range [start, end], drawn
 * from the C library rand() generator.
 */
int rand_num(int start, int end)
{
    const int span = end + 1 - start; // number of distinct values in the range
    const int offset = rand() % span;
    return offset + start;
}

/*
 * Conv2DKernel receives a single inputSize that serves as both the row stride
 * and the image extent, so this program only supports square images.
 */
#if W != H
#error "Conv2DKernel assumes a square input image (W must equal H)"
#endif

/*
 * Evaluates a CUDA runtime call; on failure prints the error string together
 * with the source location and terminates the process.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Demo driver: fills a W x H image and a KERNEL_SIZE x KERNEL_SIZE filter with
 * small random integers, runs Conv2DKernel on the GPU, and prints the input,
 * the filter and the result.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; any CUDA
 * failure terminates the process through CUDA_CHECK.
 */
int main()
{
    const int imgSize = W * H;
    const int convOutW = W - KERNEL_SIZE + 1;
    const int convOutSize = convOutW * convOutW;

    const size_t imgBytes = imgSize * sizeof(float);
    const size_t kernelBytes = KERNEL_SIZE * KERNEL_SIZE * sizeof(float);
    const size_t outBytes = convOutSize * sizeof(float);

    float *h_A = (float *)malloc(imgBytes);
    float *h_Kernel = (float *)malloc(kernelBytes);
    float *h_C = (float *)malloc(outBytes);
    if (h_A == NULL || h_Kernel == NULL || h_C == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(h_Kernel);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < imgSize; ++i)
    {
        h_A[i] = rand_num(0, 10);
    }

    for (int j = 0; j < KERNEL_SIZE * KERNEL_SIZE; ++j)
    {
        h_Kernel[j] = rand_num(0, 2);
    }

    display(h_A, W, H);
    display(h_Kernel, KERNEL_SIZE, KERNEL_SIZE);

    float *d_A = NULL;
    float *d_Kernel = NULL;
    float *d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, imgBytes));
    CUDA_CHECK(cudaMalloc(&d_Kernel, kernelBytes));
    CUDA_CHECK(cudaMalloc(&d_C, outBytes));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, imgBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_Kernel, h_Kernel, kernelBytes, cudaMemcpyHostToDevice));

    // One thread per output element: size the grid from the output width
    // (ceil-div by the block side) rather than the input width, so no block
    // is launched that would do no work.
    dim3 dimBlock(TB, TB);
    const int blocksPerSide = (convOutW + TB - 1) / TB;
    dim3 dimGrid(blocksPerSide, blocksPerSide);

    Conv2DKernel<<<dimGrid, dimBlock>>>(d_C, d_A, d_Kernel, W, KERNEL_SIZE);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy waits for the kernel and surfaces execution errors.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, outBytes, cudaMemcpyDeviceToHost));

    display(h_C, convOutW, convOutW);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_Kernel));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_Kernel);
    free(h_C);

    return 0;
}

the code of link_cu.h

#include “cuda_runtime.h”
extern void display(float *arr, int w, int h);
extern int rand_num(int start, int end);

I've been working on this for 3 days. Does anyone know how to fix it?

Hi,

Here is an example for your reference:

Thanks.

Hi AastaLLL:

My problem is not in the Python code. I have the Python code "ctypes.CDLL("../cuda_mut03.so")", and Python always displays the error message "undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2 (./cuda_mut03.so)".

If I use the Linux command "ldd -r cuda_mut03.so", it displays the same error message.
So there must be some problem with the dynamic link library itself, but I don't know how to fix this issue.

Thanks and regards.

Hi,

Sorry for the late update.

Have you created some intermediate format for function mapping first?
For example, you can use pybind11 or swig.
https://pybind11.readthedocs.io/en/stable/basics.html

To use CUDA, you will need to bind the CPU based API.
The workflow will look like this: python -> pybinding -> c++ -> CUDA

Thanks.