NVJPEG_STATUS_EXECUTION_FAILED error when using nvJPEG multi-threaded decode

/CUDALibrarySamples/nvJPEG/nvJPEG-Decoder-MultipleInstances/nvJPEGDecMultipleInstances.cpp
CUDALibrarySamples/nvJPEG/nvJPEG-Decoder-MultipleInstances at master · NVIDIA/CUDALibrarySamples · GitHub
This sample uses the decoupled API to run multi-threaded decoding, but I want to use the single-image API (nvjpegDecode()) from multiple threads, so I modified some of the code as shown below. With this change, a segmentation fault occurs in nvjpegDecode(), and the error code is NVJPEG_STATUS_EXECUTION_FAILED.
My questions are:
1. Can nvjpegDecode() be used from multiple threads?
2. If yes, what in my code causes the error?

  // Decode every image of the batch with the single-image API nvjpegDecode().
  //
  // Each nvjpegDecode() call is given one entry of params.nvjpeg_per_thread_data,
  // which supplies the nvjpeg_state and the CUDA stream used for that call. Both
  // branches finish by synchronizing every stream they submitted work to.
  //
  // NOTE(review): nvjpeg_per_thread_data is presumably sized by
  // params.num_threads (one entry per worker) - confirm against the setup code.
  // NOTE(review): nvjpegDecode() presumably needs a state created with
  // nvjpegJpegStateCreate(), not one created for the decoupled API with
  // nvjpegDecoderStateCreate() - verify how nvjpeg_state is created.
  if (params.num_threads < 2)
  {
    // Single-threaded path: every image shares entry 0 (state + stream).
    // Indexing the per-thread array by the image index instead would run past
    // its end as soon as batch_size exceeds the number of entries.
    auto& per_thread_params = params.nvjpeg_per_thread_data[0];
    for (int i = 0; i < params.batch_size; i++) {
      CHECK_NVJPEG(nvjpegDecode(params.nvjpeg_handle, per_thread_params.nvjpeg_state,
                                (const unsigned char *)img_data[i].data(),
                                img_len[i], params.fmt, &out[i],
                                per_thread_params.stream));
    }
    // Wait for the submitted work before returning, matching the
    // multi-threaded path below.
    CHECK_CUDA(cudaStreamSynchronize(per_thread_params.stream));
  }
  else
  {
    // Multi-threaded path: queue one task per image. std::bind fixes the image
    // index (i) and leaves the second lambda argument open (_1); presumably the
    // pool passes the index of the worker running the task - confirm.
    // The by-reference captures rely on workers.wait() below being reached
    // before this scope is left.
    for (int i = 0; i < params.batch_size; i++) {
        workers.enqueue(std::bind(
            [&params, &out, &img_data, &img_len](int iidx, int thread_idx)
                {
                  std::cout << "nvjpeg decode info: " << iidx << thread_idx << std::endl;
                  // State and stream both come from the entry selected by
                  // thread_idx, so each worker uses its own pair.
                  auto& per_thread_params = params.nvjpeg_per_thread_data[thread_idx];
                  CHECK_NVJPEG(nvjpegDecode(params.nvjpeg_handle, per_thread_params.nvjpeg_state,
                                     (const unsigned char *)img_data[iidx].data(),
                                     img_len[iidx], params.fmt, &out[iidx],
                                     per_thread_params.stream));
                  // No per-image synchronization here; every stream is
                  // synchronized once after workers.wait() below.
                  return EXIT_SUCCESS; // the CHECK_ statements returns 1 on failure, so we need to return a value here too.
                }, i, std::placeholders::_1
                )
            );
    }
    // Presumably blocks until all queued tasks have run - confirm against the
    // pool implementation.
    workers.wait();
    for (auto& per_thread_params : params.nvjpeg_per_thread_data) {
        CHECK_CUDA(cudaStreamSynchronize(per_thread_params.stream));
    }
  }

Can anyone answer this question? Please help!

Is no one going to answer this question?