TensorRT-3 IShuffleLayer permutation error

I’m trying to implement inference for my DNN model with TensorRT 3. I want to convert the input data from HWC format to CHW, but when I use the transpose operation of IShuffleLayer, it seems that I can’t permute the channel dimension with the spatial dimensions. Here’s the code I use to test the permutation:

/// Builds a single-layer TensorRT network that applies an IShuffleLayer
/// transpose to one input tensor, runs it once at batch size 1 and copies the
/// result back to the host.
///
/// @param in   Host buffer holding the input tensor; must contain at least
///             4 * 4 * 3 floats (the extents passed to addInput below).
/// @param out  Host buffer receiving the transposed tensor; must have room for
///             the same number of floats.
///
/// Failures are reported through assert() and the project's CHECK macro.
void test(float* in, float* out)
{
    // Extents of the input tensor. The data is meant to be laid out as HWC
    // (presumably H = 4, W = 4, C = 3 -- confirm against the caller).
    constexpr int kDim0 = 4;
    constexpr int kDim1 = 4;
    constexpr int kDim2 = 3;
    // A transpose preserves the element count, so input and output buffers
    // have the same size. The original code used a hard-coded 64 floats, which
    // is larger than the 48-element tensor and could read/write past the end
    // of the caller's buffers.
    constexpr std::size_t kTensorBytes
        = static_cast<std::size_t>(kDim0) * kDim1 * kDim2 * sizeof(float);
    constexpr int kBatchSize = 1;

    Logger gLogger;
    nvinfer1::IBuilder* builder = createInferBuilder(gLogger);
    assert(builder != nullptr);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();
    assert(network != nullptr);

    //  Create input
    auto data = network->addInput("data", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{kDim0, kDim1, kDim2});
    assert(data != nullptr);

    // Permute
    auto ps = network->addShuffle(*data);
    assert(ps != nullptr);

    std::cout << "ps transpose" << std::endl;
    // The TensorRT documentation of nvinfer1::Permutation gives [2, 0, 1] as
    // the order for HWC -> CHW ([1, 2, 0] is the CHW -> HWC direction).
    // NOTE(review): in TensorRT 3 the per-dimension types appear to be permuted
    // together with the extents, so moving the channel axis may still trip the
    // `isIndexedCHW` assertion at build time. If it does, try following the
    // transpose with ps->setReshapeDimensions(nvinfer1::DimsCHW{kDim2, kDim0, kDim1})
    // to re-label the output as CHW -- unverified, confirm against the
    // TensorRT release in use.
    ps->setFirstTranspose(nvinfer1::Permutation{2, 0, 1});

    // Set output layer
    ps->getOutput(0)->setName("out");
    network->markOutput(*ps->getOutput(0));

    // Build the engine
    builder->setMaxBatchSize(kBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
    network->destroy();
    assert(engine != nullptr);
    nvinfer1::IExecutionContext* context = engine->createExecutionContext();
    assert(context != nullptr);

    assert(engine->getNbBindings() == 2);
    const int inputIndex = engine->getBindingIndex("data");
    const int outputIndex = engine->getBindingIndex("out");
    // Guard against an unknown binding name before the indices are used to
    // subscript the two-element buffer array.
    assert(inputIndex >= 0 && inputIndex < 2);
    assert(outputIndex >= 0 && outputIndex < 2);

    void* buffers[2] = {nullptr, nullptr};
    CHECK(cudaMalloc(&buffers[inputIndex], kTensorBytes));
    CHECK(cudaMalloc(&buffers[outputIndex], kTensorBytes));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    CHECK(cudaMemcpyAsync(buffers[inputIndex], in, kTensorBytes, cudaMemcpyHostToDevice, stream));
    const bool enqueued = context->enqueue(kBatchSize, buffers, stream, nullptr);
    assert(enqueued);
    (void)enqueued; // keep release builds (NDEBUG) free of unused-variable warnings
    CHECK(cudaMemcpyAsync(out, buffers[outputIndex], kTensorBytes, cudaMemcpyDeviceToHost, stream));
    // `out` is only valid once the stream has drained.
    CHECK(cudaStreamSynchronize(stream));

    // release the stream and the buffers
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    // The execution context was previously leaked; release it before its engine.
    context->destroy();
    engine->destroy();
    builder->destroy();
}

The input data is a float buffer, and when I execute this code I get the following error:

Enhance: helpers.cpp:39: nvinfer1::DimsCHW nvinfer1::getCHW(const nvinfer1::Dims&): Assertion `isIndexedCHW(d)' failed.
The program has unexpectedly finished.

It works fine when I set the permutation to [0, 2, 1], which keeps the channel dimension in place and only transposes the two spatial dimensions.

Any suggestions?
Thank you very much in advance!