Debugging issues with a header-only library (CUTLASS)

When I stepped into the kernels of CUTLASS (GitHub - NVIDIA/cutlass: CUDA Templates for Linear Algebra Subroutines) with cuda-gdb, I encountered the error "Cannot find the function header while disassembling."
I verified that cuda-gdb works correctly with the CUDA sample matrixMul.
The debug output is attached below. Does anyone know what the problem is?
Thanks in advance.

Debug info:
$ cuda-gdb ./a.out
NVIDIA (R) CUDA Debugger
10.1 release
Portions Copyright (C) 2007-2018 NVIDIA Corporation
GNU gdb (GDB) 7.12
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later http://gnu.org/licenses/gpl.html
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type “show copying”
and “show warranty” for details.
This GDB was configured as “x86_64-pc-linux-gnu”.
Type “show configuration” for configuration details.
For bug reporting instructions, please see:
http://www.gnu.org/software/gdb/bugs/.
Find the GDB manual and other documentation resources online at:
http://www.gnu.org/software/gdb/documentation/.
For help, type “help”.
Type “apropos word” to search for commands related to “word”…
Reading symbols from ./a.out…done.
(cuda-gdb) b main
Breakpoint 1 at 0x406565: file basic_gemm.cu, line 461.
(cuda-gdb) r
Starting program: /home/albert/code_reading/cutlass/examples/00_basic_gemm/a.out
[Thread debugging using libthread_db enabled]
Using host libthread_db library “/lib/x86_64-linux-gnu/libthread_db.so.1”.

Breakpoint 1, main (argc=1, arg=0x7fffffffcf28) at basic_gemm.cu:461
461 int problem[3] = { 128, 128, 128 };
(cuda-gdb) b gemm_mainloop.h:188
No line 188 in file “gemm_mainloop.h”.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 2 (gemm_mainloop.h:188) pending.
(cuda-gdb) c
Continuing.
[New Thread 0x7fffef784700 (LWP 543)]
[New Thread 0x7fffeef83700 (LWP 544)]
[New Thread 0x7fffee776700 (LWP 545)]
[Switching focus to CUDA kernel 0, grid 6, block (0,0,0), thread (32,0,0), device 0, sm 0, warp 1, lane 0]

Thread 1 “a.out” hit Breakpoint 2, cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<(cutlass::GemmOperand::Kind)0, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<(cutlass::GemmOperand::Kind)0, (cutlass::MatrixLayout::Kind)1, float const, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 8, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 8, 32, 1>, 1>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 4, 16ul> > >, cutlass::gemm::GlobalLoadStream<(cutlass::GemmOperand::Kind)1, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<(cutlass::GemmOperand::Kind)1, (cutlass::MatrixLayout::Kind)1, float const, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 32, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 32, 8, 1>, 1, 4>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 4, 16ul> > >, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<float const, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<1, 4, 2, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 8, 16ul> > >, 
cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<float const, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<1, 4, 2, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 8, 16ul> > >, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true> >, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true> >, int> > >, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1> > >::multiply_add (this=0x7ffff3fff860)
at /home/albert/code_reading/cutlass/cutlass/gemm/gemm_mainloop.h:188
188 global_to_shared_stream.add_batch_offset(block_swizzle.get_batch_id());
(cuda-gdb) n
sh: 1: Syntax error: Unterminated quoted string

cutlass::gemm::GemmMainloop<cutlass::gemm::GemmTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::GlobalLoadStream<(cutlass::GemmOperand::Kind)0, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<(cutlass::GemmOperand::Kind)0, (cutlass::MatrixLayout::Kind)1, float const, cutlass::Shape<1, 8, 128, 1>, cutlass::Shape<1, 8, 32, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 8, 32, 1>, 1>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 4, 16ul> > >, cutlass::gemm::GlobalLoadStream<(cutlass::GemmOperand::Kind)1, cutlass::gemm::GemmGlobalIteratorAb<cutlass::gemm::GemmGlobalTileTraits<(cutlass::GemmOperand::Kind)1, (cutlass::MatrixLayout::Kind)1, float const, cutlass::Shape<1, 128, 8, 1>, cutlass::Shape<1, 32, 8, 1>, 1>, int>, cutlass::TileStoreIterator<cutlass::gemm::GemmSharedStoreWithSkewTileAbTraits<float, cutlass::Shape<2, 8, 128, 1>, cutlass::Shape<1, 32, 8, 1>, 1, 4>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 4, 16ul> > >, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileATraits<float const, cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<1, 4, 2, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 0>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 8, 16ul> > >, cutlass::gemm::SharedLoadStream<cutlass::TileLoadIterator<cutlass::gemm::GemmSharedLoadTileBTraits<float const, cutlass::Shape<8, 128, 
128, 1>, cutlass::Shape<1, 4, 2, 1>, cutlass::Shape<1, 4, 8, 1>, cutlass::Shape<1, 1, 1, 1>, 2, 4, 4>, float, (cutlass::IteratorAdvance::Kind)1, (cutlass::MemorySpace::Kind)1, int, float, (cutlass::FragmentElementType::Kind)0, cutlass::Shape<0, 0, 0, 0> >, cutlass::Copy<cutlass::Fragment<float, 8, 16ul> > >, cutlass::gemm::GemmEpilogue<cutlass::gemm::SimplifiedGemmEpilogueTraits<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true> >, int, cutlass::gemm::GemmEpilogueTraitsHelper<cutlass::gemm::SgemmConfig<cutlass::Shape<8, 128, 128, 1>, cutlass::Shape<8, 8, 8, 1>, 1, 1, false>, cutlass::gemm::LinearScaling<float, cutlass::gemm::FragmentMultiplyAdd<float, float, true> >, int> > >, cutlass::gemm::IdentityBlockSwizzle, int, cutlass::gemm::ClearAccumulators<float, 1> > >::multiply_add (this=0x7ffff3fff860) at /home/albert/code_reading/cutlass/cutlass/gemm/gemm_mainloop.h:188
188 global_to_shared_stream.add_batch_offset(block_swizzle.get_batch_id());
Cannot find the function header while disassembling.

(cuda-gdb)