Dynamic parallelism on Jetson TX1 isn't working properly

I wrote a simple program to test dynamic parallelism on the Jetson TX1. The program compiles fine but doesn't work as I expect: CUDA_SAFE_CALL terminates with "unknown error" on the first CUDA call.

Cuda error in file ‘test_cuda_dp.cu’ in line 29 : unknown error.

I tested this program on an x64 PC with Ubuntu 15.04 and CUDA 7.5, and there it works correctly. The expected output is
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15

Has anybody run into the same problem? What might be wrong?

Here is the source code:

#include <iostream>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls.
// Wraps a runtime API call; on any status other than cudaSuccess it prints
// the file, line, and human-readable error string to stderr and exits.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it is safe inside an un-braced if/else.
// Note: kernel launches themselves return nothing — pair each launch with
// CUDA_SAFE_CALL(cudaGetLastError()) to catch launch-configuration errors.
#define CUDA_SAFE_CALL(call) \
do { \
    cudaError_t err = call; \
    if (cudaSuccess != err) { \
       fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                 __FILE__, __LINE__, cudaGetErrorString(err) ); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Child kernel launched from ParentKernel via dynamic parallelism.
// Expected launch: <<<16, 1>>> over a 16-byte slice of the output buffer;
// each single-thread block stamps its own block index into its slot.
__global__ void ChildKernel(char* data){
    const int slot = blockIdx.x;
    data[slot] = (char)slot;
}

// Parent kernel: each single-thread parent block launches a child grid of
// 16 one-thread blocks over its own 16-byte slice of `data`.
// Device-side launches require compute capability >= 3.5, -rdc=true, and
// linking against the device runtime (cudadevrt).
__global__ void ParentKernel(char *data){
    ChildKernel<<<16, 1>>>(data + blockIdx.x * 16);
}

// Allocates a 32-byte device buffer, fills it via a dynamic-parallelism
// launch (2 parent blocks, each spawning 16 child blocks over its 16-byte
// slice), copies the result back, and prints the two halves side by side.
// Returns 0 on success; exits via CUDA_SAFE_CALL on any CUDA error.
int run_kernel() {
    char *data;
    CUDA_SAFE_CALL(cudaMalloc(&data, 32));

    ParentKernel<<<2, 1>>>(data);
    CUDA_SAFE_CALL(cudaGetLastError());        // launch-configuration errors
    // BUG FIX: the synchronize result was ignored. Asynchronous execution
    // errors — including a failed device-side child launch — are reported
    // here, so an unchecked sync silently swallows exactly the failures
    // this test is meant to surface.
    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    char hostData[32] = { 0 };
    CUDA_SAFE_CALL(cudaMemcpy(hostData, data, 32, cudaMemcpyDeviceToHost));
    for (int i = 0; i < 16; i++) {
        std::cout << (int) hostData[i] << " " << (int) hostData[i + 16] << std::endl;
    }

    // BUG FIX: the device buffer was leaked.
    CUDA_SAFE_CALL(cudaFree(data));
    return 0;
}

// Entry point: forward run_kernel's status to the shell.
// (run_kernel currently always returns 0, so this is equivalent to the
// original discard-and-return-0 form.)
int main() {
    return run_kernel();
}

and here is the CMake file used to build the program:

cmake_minimum_required(VERSION 2.8.12.2 FATAL_ERROR)

set(the_project "test_cuda_dp")

find_package(CUDA REQUIRED)
# Separate compilation is mandatory for dynamic parallelism; with this ON,
# cuda_add_executable also performs the device-link step.
set(CUDA_SEPARABLE_COMPILATION ON)

file(GLOB CUDA_SOURCES "${CMAKE_SOURCE_DIR}/*.cu")
# BUG FIX: CMAKE_CXX_FLAGS is a single space-separated string, not a list.
# LIST(APPEND ...) inserted a literal ';' into the flags, corrupting the
# compiler command line. Append with string concatenation instead.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Wall -fPIC")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-rdc=true")
# BUG FIX: the Jetson TX1 is compute capability 5.3. '-arch=compute_35'
# emits PTX only (no device code), forcing a JIT compile at run time — a
# common cause of "unknown error" on the first CUDA call when the JIT step
# fails on the target. Build native sm_53 code instead; CC 5.3 still
# supports dynamic parallelism (which requires CC >= 3.5).
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-gencode" "arch=compute_53,code=sm_53")

cuda_add_executable(${the_project} ${CUDA_SOURCES})
# libcudadevrt provides the device runtime required for device-side launches.
TARGET_LINK_LIBRARIES(${the_project} ${CUDA_LIBRARIES} ${CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudadevrt.a)