I am trying to call a CUDA shared library (.so file) from Python on a Xavier NX, but Python always reports "undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2" when run in VS Code.
The nvcc and g++ commands below complete without any error message:
1>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dc cuda_mut03.cu
2>nvcc -arch=sm_72 -Xcompiler '-fPIC' -dlink cuda_mut03.o -o test.o
3>g++ -shared -o cuda_mut03.so test.o -L/usr/local/cuda-10.2/lib64 -lcudart -lcudadevrt
Python reports the error when it loads cuda_mut03.so, and the message is the same one printed by the Linux command "ldd -r cuda_mut03.so":
libgtk3-nocsd.so.0 => /usr/lib/aarch64-linux-gnu/libgtk3-nocsd.so.0 (0x0000007f9ebc5000)
libcudart.so.10.2 => /usr/local/cuda-10.2/lib64/libcudart.so.10.2 (0x0000007f9eb51000)
libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000007f9e9f7000)
libdl.so.2 => /lib/aarch64-linux-gnu/libdl.so.2 (0x0000007f9e9e2000)
libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000007f9e9b6000)
/lib/ld-linux-aarch64.so.1 (0x0000007f9ec23000)
libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000007f9e822000)
librt.so.1 => /lib/aarch64-linux-gnu/librt.so.1 (0x0000007f9e80b000)
libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000007f9e752000)
libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000007f9e72e000)
undefined symbol: __fatbinwrap_45_tmpxft_000028de_00000000_6_cuda_mut03_cpp1_ii_ae402fc2 (./cuda_mut03.so)
The code of cuda_mut03.cu:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "link_cu.h"
/* Input image width in elements; also passed to the kernel as the side length of the input. */
#define W 5
/* Input image height in elements; used with W to size and print the host image buffer.
   NOTE(review): the kernel is launched with W only and treats the input as square,
   so this presumably must stay equal to W -- confirm before changing. */
#define H 5
/* Threads per block along each of the x and y dimensions. */
#define TB 2
/* Side length of the square filter passed to the kernel. */
#define KERNEL_SIZE 2
/*
 * Computes one element of the "valid" (unpadded) sliding-window product of a
 * square input with a square filter. The filter is applied as-is (no flip).
 *
 * Launch layout: 2D grid of 2D blocks; each thread handles the output element
 * at (row, col) derived from its y/x global index. The grid must cover at
 * least limit x limit threads, where limit = inputSize - kernelSize + 1;
 * surplus threads return without touching memory.
 *
 * output     : limit * limit floats, row-major, written by this kernel
 * input      : inputSize * inputSize floats, row-major
 * kernel     : kernelSize * kernelSize floats, row-major
 * inputSize  : side length of the input
 * kernelSize : side length of the filter
 *
 * No shared memory is used.
 */
__global__ void Conv2DKernel(float *output, float *input, float *kernel, int inputSize, int kernelSize)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int limit = inputSize - kernelSize + 1;

    // Guard the grid tail: the launch may round the grid up past the output.
    // This also covers limit <= 0 (filter larger than the input).
    if (col >= limit || row >= limit)
        return;

    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        // Row offsets do not depend on j, so compute them once per filter row.
        const int inputRowBase = (row + i) * inputSize;
        const int kernelRowBase = i * kernelSize;
        for (int j = 0; j < kernelSize; ++j)
        {
            sum += kernel[kernelRowBase + j] * input[inputRowBase + col + j];
        }
    }
    output[row * limit + col] = sum;
}
/*
 * Prints a row-major 2D float array to stdout, one row per line, with each
 * value truncated to int and followed by a comma. A blank line is printed
 * after the last row.
 *
 * arr : pointer to w * h floats in host memory
 * w   : number of columns (elements per row, i.e. the row stride)
 * h   : number of rows
 *
 * The original looped i < w, j < h while indexing with i * w + j, which is
 * only correct for square arrays and can read past the buffer when w > h.
 * Iterating h rows of w columns matches the indexing; output for square
 * arrays is unchanged.
 */
void display(float *arr, int w, int h)
{
    for (int row = 0; row < h; ++row)
    {
        for (int col = 0; col < w; ++col)
        {
            printf("%d,", static_cast<int>(arr[row * w + col]));
        }
        printf("\n");
    }
    printf("\n");
}
/*
 * Returns a pseudo-random integer in the closed range [start, end], derived
 * from rand(). The caller must ensure end >= start; otherwise the modulus
 * below is zero or negative.
 */
int rand_num(int start, int end)
{
    // Number of distinct values in [start, end].
    const int span = end + 1 - start;
    const int offset = rand() % span;
    return offset + start;
}
/*
 * Aborts the program with a diagnostic if a CUDA runtime call failed.
 * `what` names the step being checked and is echoed in the message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver: fills a W x H image and a KERNEL_SIZE x KERNEL_SIZE filter with
 * small random integers, runs Conv2DKernel on the device, and prints the
 * image, the filter and the result.
 *
 * Every CUDA runtime call and the kernel launch are checked; any failure
 * prints the CUDA error string and exits with EXIT_FAILURE.
 *
 * NOTE(review): when this file is built into a shared library with
 * `nvcc -dc` followed by `nvcc -dlink`, the final host link presumably has to
 * include BOTH the compiled object (cuda_mut03.o) and the device-link object;
 * linking the device-link object alone leaves the per-file __fatbinwrap_*
 * symbol undefined. Confirm against the NVCC separate-compilation docs.
 */
int main()
{
    const int imgSize = W * H;
    const int convOutW = W - KERNEL_SIZE + 1;
    const int convOutSize = convOutW * convOutW;

    const size_t imgBytes = imgSize * sizeof(float);
    const size_t kernelBytes = KERNEL_SIZE * KERNEL_SIZE * sizeof(float);
    const size_t outBytes = convOutSize * sizeof(float);

    float *h_A = (float *)malloc(imgBytes);
    float *h_Kernel = (float *)malloc(kernelBytes);
    float *h_C = (float *)malloc(outBytes);
    if (h_A == NULL || h_Kernel == NULL || h_C == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_A); // free(NULL) is a no-op, so partial failures are safe here
        free(h_Kernel);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < imgSize; ++i)
    {
        h_A[i] = rand_num(0, 10);
    }
    for (int j = 0; j < KERNEL_SIZE * KERNEL_SIZE; ++j)
    {
        h_Kernel[j] = rand_num(0, 2);
    }
    display(h_A, W, H);
    display(h_Kernel, KERNEL_SIZE, KERNEL_SIZE);

    float *d_A = NULL;
    float *d_Kernel = NULL;
    float *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, imgBytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_Kernel, kernelBytes), "cudaMalloc(d_Kernel)");
    checkCuda(cudaMalloc(&d_C, outBytes), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, imgBytes, cudaMemcpyHostToDevice), "copy image to device");
    checkCuda(cudaMemcpy(d_Kernel, h_Kernel, kernelBytes, cudaMemcpyHostToDevice), "copy filter to device");

    // One thread per OUTPUT element: size the grid from the output width
    // (ceil-div), not from W, so no fully idle blocks are launched.
    const dim3 dimBlock(TB, TB);
    const int blocksPerDim = (convOutW + TB - 1) / TB;
    const dim3 dimGrid(blocksPerDim, blocksPerDim);
    Conv2DKernel<<<dimGrid, dimBlock>>>(d_C, d_A, d_Kernel, W, KERNEL_SIZE);
    // A launch does not return a status; bad configurations show up here.
    checkCuda(cudaGetLastError(), "Conv2DKernel launch");

    // The blocking device-to-host copy waits for the kernel and reports any
    // error raised while it executed.
    checkCuda(cudaMemcpy(h_C, d_C, outBytes, cudaMemcpyDeviceToHost), "copy result to host");
    display(h_C, convOutW, convOutW);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_Kernel), "cudaFree(d_Kernel)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_Kernel);
    free(h_C);
    return 0;
}
The code of link_cu.h:
#include “cuda_runtime.h”
extern void display(float *arr, int w, int h);
extern int rand_num(int start, int end);
I have been working on this for 3 days. Does anyone know how to fix it?