Errors cross compiler in QT with NVCC

Hi, I need some directions for my problem. I need to cross compile some CUDA code in QT. So probably my problem is with NVCC compiler. My host is Ubuntu 18.04 and device where I want to run it is Jetson. This is how my gst_dde.pro file looks like:

QT -=  gui


TARGET = gst_dde

TEMPLATE = lib


CONFIG += plugin c++11

CONFIG += link_pkgconfig


PKGCONFIG += opencv \

             cuda-8.0 \

             gstreamer-video-1.0


DEFINES += QT_DEPRECATED_WARNINGS


SOURCES +=  main.cpp


HEADERS += gst_dde_global.h \

    gst_dde.cuh


QMAKE_EXTRA_COMPILERS += cuda


CUDA_SOURCES += gst_dde.cu


CUDA_DIR = /usr/local/cuda


CUDA_ARCH = sm_62


INCLUDEPATH += $$CUDA_DIR/include


LIBS += -L $$CUDA_DIR/lib64 -lcudart -lcuda


cuda.commands = $$CUDA_DIR/bin/nvcc -c -arch=$$CUDA_ARCH -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}


cuda.dependency_type = TYPE_C


cuda.depend_command = $$CUDA_DIR/bin/nvcc -M ${QMAKE_FILE_NAME}


cuda.input = CUDA_SOURCES


cuda.output = ${OBJECTS_DIR}${QMAKE_FILE_BASE}_cuda.o


QMAKE_EXTRA_COMPILERS += cuda


unix {

    target.path = vlatacom/vsp/gstreamer_plugins

    INSTALLS += target

}


QMAKE_RPATHDIR +=   $$(ORIGIN).


DISTFILES += gst_dde.cu

gst_dde.cuh looks like:

#ifndef GST_DDE_CUH

#define GST_DDE_CUH


#include <device_launch_parameters.h>

#include <cuda_runtime.h>

#include <cuda.h>


#define HEIGHT 480

#define WIDTH 640

#define BLOCK_DIM 1024

#define GRID_DIM 300

#define BLOCK_DIM2 64


__global__

void BLUR(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight);


void BLUR_wrapper(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight);


#endif

gst_dde.cu looks like:

#include "gst_dde.cuh"



__global__

void BLUR(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight);


void BLUR_wrapper(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight);


__global__

void BLUR(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight) {

    extern __shared__ float loadIn[];


    int trueDimX = blockDim.x - (kDim - 1);

    int trueDimY = blockDim.y - (kDim - 1);


    int col = (blockIdx.x * trueDimX) + threadIdx.x;

    int row = (blockIdx.y * trueDimY) + threadIdx.y;


    if ((0 <= col)&& (col <= outWidth + 1) && (0 <= row) && (col <= outHeight + 1)) {

        if ((col == 0) || (row == 0)) {

            if ((row < outHeight) && (col < outWidth)) {

                loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[row * inWidth + col];

            }

            if ((row == outHeight) || (row == outHeight + 1)) {

                loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[(outHeight - 1) * inWidth + col];

            }

            if ((col == outWidth) || (col == outWidth + 1)) {

                loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[row * inWidth + outWidth - 1];

            }

        }

        else {

            if ((col < outWidth + 1) && (row < outHeight + 1)) {

                loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[(row - 1) * inWidth + col - 1];

            }

            else {

                if (row == 1) {

                    loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[row * inWidth + col - 2];

                }

                else {

                    if (col == 1) {

                        loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[(row - 2) * inWidth + col];

                    }

                    else {

                        loadIn[threadIdx.y * blockDim.x + threadIdx.x] = In[(row - 2) * inWidth + (col - 2)];

                    }

                }

            }


        }


        __syncthreads();

        if (threadIdx.y < trueDimY && threadIdx.x < trueDimX) {

            float acc = 0;

            for (int i = 0; i < kDim; ++i) {

                for (int j = 0; j < kDim; ++j) {

                    acc = acc + loadIn[(threadIdx.y + i) * blockDim.x + (threadIdx.x + j)];

                }

            }

            Out[row * inWidth + col] = acc * (1.0 / kDim / kDim);

        }

    }

    else

        loadIn[threadIdx.y * blockDim.x + threadIdx.x] = 0.0;

}


void BLUR_wrapper(float* In, float* Out, int kDim, int inWidth, int outWidth, int outHeight)

{

    int bw = 16;

    int bwHalo = bw + (kDim - 1);

    dim3 dimBlock(bwHalo,bwHalo);

    dim3 dimGrid(ceil(WIDTH/float(bw)),ceil(HEIGHT/float(bw)));


    BLUR<<<dimGrid,dimBlock,bwHalo*bwHalo*sizeof(float)>>>(In,Out,kDim,inWidth,outWidth,outHeight);

}

And I also have some main.cpp file where I call BLUR_wrapper that shouldn’t be the problem.

I get this error when I build:
./gst_dde_cuda.o: Relocations in generic ELF (EM: 62)
error adding symbols: File in wrong format
collect2: error: ld returned 1 exit status

Can someone aid me with this? Is problem with location of saved .o file? should I be using NVCC from rootfs that I got from copying from Jetson that is my target or should I use NVCC that is installed on host? And is it a problem that I use CUDA 11 on host and have CUDA 8 on Jetson (that can be installed only on Ubuntu 14 and 16). Hope I am somewhat clear with my problem. Thank you.