TensorRT-LLM error message

Hi, can anyone please advise what this error message is about? "

Error processing query: LLM Call Exception: [500] Internal Server Error
Error during inference of request chat-4bd234c02ea240babd0a16fa4ad4543b – Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpy2DAsync( dstPtr, copyPitch, srcPtr, copyPitch, copyWidth, copyHeight, cudaMemcpyHostToDevice, cudaStream.get()): unknown error (/home/jenkins/agent/workspace/LLM/release-0.11/L0_MergeRequest/llm/cpp/tensorrt_llm/batch_manager/transformerBuffers.cpp:255)
1 0x7ff0dbe21c2e void tensorrt_llm::common::check(cudaError, char const*, char const*, int) + 94
2 0x7ff0dd050c71 tensorrt_llm::batch_manager::TransformerBuffers::copyKvBlockOffsets(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager const*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager const*, tensorrt_llm::runtime::TllmRuntime const&) + 1281
3 0x7ff0dd048647 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 9783
4 0x7ff0dd04b612 tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 194
5 0x7ff0dd06ace2 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 162
6 0x7ff0dd06ae96 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 246
7 0x7ff0dd06b819 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2409
8 0x7ff0dd08c19b tensorrt_llm::executor::Executor::Impl::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 411
9 0x7ff0dd08f9dd tensorrt_llm::executor::Executor::Impl::executionLoop() + 301
10 0x7ff5614f4253 /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xdc253) [0x7ff5614f4253]
11 0x7ff563468ac3 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7ff563468ac3]
12 0x7ff5634faa40 /usr/lib/x86_64-linux-gnu/libc.so.6(+0x126a40) [0x7ff5634faa40]
"

Hi @denglhs1 – is it possible you are running out of memory, either on the GPU or in host memory? If not, there may be an issue with the TensorRT-LLM engine itself. Can you share more details on which model you are running and on what GPU(s)?
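
One quick way to rule out memory pressure is to log GPU and host memory right before the failing request is processed. Here is a minimal sketch; the helper name `log_memory` is hypothetical, and it assumes PyTorch with CUDA and `psutil` are available in your serving environment (adapt to whatever tooling your server uses):

```python
# Hypothetical helper: print free/total GPU memory and host RAM usage so you
# can see whether either is exhausted around the time the request fails.
# Assumes PyTorch (with CUDA) and psutil are installed.
import torch
import psutil

def log_memory(tag: str = "") -> None:
    free_b, total_b = torch.cuda.mem_get_info()  # device memory in bytes
    host = psutil.virtual_memory()               # host RAM usage
    print(f"[{tag}] GPU free/total: {free_b / 1e9:.2f}/{total_b / 1e9:.2f} GB | "
          f"host used: {host.percent:.1f}%")

log_memory("before request")
```

Running `nvidia-smi` in a separate terminal while the server handles traffic can give you the same picture without touching the code. If memory looks fine in both places, the engine build details (model, precision, max batch size, GPU type) would help narrow this down.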