Mixed Compilation of MPI and CUDA with Dynamic Parallelism

I ran into a compilation problem while trying to compile a mixed MPI+CUDA program with dynamic parallelism support.

Source code of dyn_pal.cu:

    #include <stdio.h>
    #include <cuda.h>
    #define N 100
    #define M 32
    #define K 2
    __device__ volatile int vint = 0;

    // Child kernel: each of its M threads performs N atomic increments of *foo.
    __global__ void entry( volatile int* foo ) {
        for (int i = 0; i < N; ++i) {
            atomicAdd((int*)foo, 1);
        }
    }

    // Parent kernel: non-leader threads exit early; thread 0 launches the
    // child kernel from the device (dynamic parallelism), and the remaining
    // warp leaders add the shared value x.
    extern "C" __global__ void diverge_cta( volatile int *foo )
    {
        __shared__ int x;
        if ((threadIdx.x % 32) != 0) { return; }
        // entry(foo);   // original design: each thread calls entry()
        if (threadIdx.x == 0) {
            entry<<<1,M>>>( foo );   // device-side kernel launch
            cudaDeviceSynchronize();
            x = 5;
            return;
        }
        __syncthreads();
        atomicAdd((int*)foo, x);
    }

    extern "C" void mycal(int myrank){
        int *foo; int h_foo;
        cudaMalloc((void**)&foo, sizeof(int));
        cudaMemset(foo, 0, sizeof(int));
        printf("foo addr: %p\n", (void*)foo);
        diverge_cta<<<K,M*32>>>( foo );
        cudaDeviceSynchronize();
        cudaMemcpy(&h_foo, foo, sizeof(int), cudaMemcpyDeviceToHost);
        if (h_foo == K*(M*N+5*(M-1))) {
            printf("simple_scan_test test PASSED\n");
        } else {
            printf("Result: %d\n", h_foo);
            printf("simple_scan_test test FAILED\n");
        }
        cudaFree(foo);
    }
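
As a sanity check, the expected final value follows from the kernel logic: each of the K blocks contributes M*N increments from the child kernel (M child threads, each adding 1 N times), plus 5*(M-1) from the warp leaders other than thread 0. With N=100, M=32, K=2:

    expected = K * (M*N + 5*(M-1))
             = 2 * (32*100 + 5*31)
             = 6710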

Source code of the MPI driver, indexed_gpu.c:

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    #include <cuda.h>
    #include <cuda_runtime.h>

    // Defined in dyn_pal.cu (extern "C"); runs the dynamic-parallelism test.
    void mycal(int myrank);

    int main(int argc, char *argv[]){
      int myid, numprocs;
      MPI_Init(&argc, &argv);
      MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
      MPI_Comm_rank(MPI_COMM_WORLD, &myid);
      mycal(myid);
      MPI_Barrier(MPI_COMM_WORLD);
      MPI_Finalize();
      return EXIT_SUCCESS;
    }

The compilation commands I used:

    nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o -lcudadevrt
    mpicc -c indexed_gpu.c -o mpi_bench.o
    mpicc mpi_bench.o dynpal.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudart

The error I got:

    dynpal.o: In function `__sti____cudaRegisterAll_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint()':
    tmpxft_000044ae_00000000-3_dyn_pal.cudafe1.cpp:(.text+0x314): undefined reference to `__cudaRegisterLinkedBinary_42_tmpxft_000044ae_00000000_6_dyn_pal_cpp1_ii_vint'
    collect2: ld returned 1 exit status

By the way, if I leave out '-dc' when compiling the CUDA object file (the first nvcc command above), I get this error instead:

    dyn_pal.cu(25): error: kernel launch from __device__ or __global__ functions requires separate compilation mode
    1 error detected in the compilation of "/tmp/tmpxft_00004303_00000000-6_dyn_pal.cpp1.ii".

So '-dc' (relocatable device code) does seem to be required for the device-side kernel launch.

If I don't use MPI, the pure CUDA program with dynamic parallelism compiles and runs successfully on my Kepler GPUs. I am wondering: is CUDA dynamic parallelism supported in mixed MPI+CUDA programs like this?
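
From reading about nvcc's separate compilation, I suspect the missing piece is an explicit device-link step before the final mpicc link. This is only my guess at the sequence (the '-dlink' step and the extra '-lcudadevrt' at the final link are my assumptions, and the library path is specific to my installation):

    nvcc -arch=sm_35 -dc dyn_pal.cu -o dynpal.o
    nvcc -arch=sm_35 -dlink dynpal.o -o dynpal_link.o -lcudadevrt
    mpicc -c indexed_gpu.c -o mpi_bench.o
    mpicc mpi_bench.o dynpal.o dynpal_link.o -o gpu_idx -L/opt/cuda/5.5/lib64 -lcudadevrt -lcudart

Is something like this the correct way to link relocatable device code into an MPI executable?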

Thanks a lot.