I am running this code:
this
/temp_can$ nvcc -o sgemm_nt_1 sgemm_nt_1.cu -arch=sm_80 -std=c++17 -I ../cutlass/include -I ../cutlass/tools/util/include --expt-relaxed-constexpr -g -G
[1]+ Killed cuda-gdb sgemm_nt_1
:~/temp_can$
:~/temp_can$ cuda-gdb sgemm_nt_1
NVIDIA (R) CUDA Debugger
11.7 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
--Type <RET> for more, q to quit, c to continue without paging--
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from sgemm_nt_1...
(cuda-gdb) break sgemm_nt_1.cu:202
Breakpoint 1 at 0xdb6a: file sgemm_nt_1.cu, line 238.
(cuda-gdb) run
Starting program: /home/zyhuang/temp_can/sgemm_nt_1
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[Detaching after fork from child process 115566]
[New Thread 0x7fffdffff000 (LWP 115570)]
[New Thread 0x7fffdf7fe000 (LWP 115571)]
Using device 0: NVIDIA A100 80GB PCIe (SM80, 108 SMs)
M = 5120
N = 5120
K = 4096
Verification by comparison with cuBLAS is disabled, either because the CMake option CUTLASS_ENABLE_CUBLAS was explicitly set to OFF, or because CMake could not find cuBLAS. If you would like to enable verification with cuBLAS, please set the CMake option CUTLASS_ENABLE_CUBLAS to ON, rerun CMake, and recompile this example.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (224,0,0), device 0, sm 0, warp 7, lane 0]
Thread 1 "sgemm_nt_1" hit Breakpoint 1, gemm_device<int, int, int, float, cute::tuple<cute::C<1>, int>, cute::Layout<cute::tuple<cute::C<128>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<128> > >, cute::Layout<cute::tuple<cute::C<32>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<32> > >, float, cute::tuple<cute::C<1>, int>, cute::Layout<cute::tuple<cute::C<128>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<128> > >, cute::Layout<cute::tuple<cute::C<32>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<32> > >, float, cute::tuple<cute::C<1>, int>, cute::Layout<cute::tuple<cute::C<128>, cute::C<128> >, cute::tuple<cute::C<1>, cute::C<128> > >, cute::Layout<cute::tuple<cute::C<16>, cute::C<16> >, cute::tuple<cute::C<1>, cute::C<16> > >, float, float>
<<<(40,40,1),(256,1,1)>>> (M=5120, N=5120, K=4096, A=0x7fff84000000, dA=..., blockA=..., tA=...,
B=0x7fff96000000, dB=..., blockB=..., tB=..., C=0x7fff8e000000, dC=..., tC=..., alpha=1, beta=0)
--Type <RET> for more, q to quit, c to continue without paging--
at sgemm_nt_1.cu:202
202 copy(tAgA(_,_,k), tAsA);
(cuda-gdb) s
cute::Tensor<cute::ViewEngine<cute::gmem_ptr<float const*> >, cute::Layout<cute::tuple<cute::C<4>, cute::C<1>, int>, cute::tuple<cute::C<32>, cute::C<0>, int> > >::operator()<cute::Underscore, cute::Underscore, int>
<<<(40,40,1),(256,1,1)>>> (this=0x7ffff2fffc30, c0=..., c1=..., cs=<error reading variable>)
at /home/zyhuang/temp_can/../cutlass/include/cute/tensor.hpp:239
239 return operator()(make_coord(c0,c1,cs...));
(cuda-gdb)
The code here is:
template <class SrcEngine, class SrcLayout,
class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy(Tensor<SrcEngine, SrcLayout> const& src,
Tensor<DstEngine, DstLayout> & dst)
{
constexpr int N = decltype(max_common_vector(src, dst))::value;
#if 0
if (thread0()) {
print("copy -- found a max_common_vector of %d\n", N);
print(" "); print(src.data()); print(" o "); print(layout(src)); print("\n");
print(" "); print(dst.data()); print(" o "); print(layout(dst)); print("\n");
}
#endif
if constexpr (N <= 1) {
return copy_if(TrivialPredTensor{}, src, dst);
} else {
constexpr int vec_bits = N * sizeof_bits<typename SrcEngine::value_type>::value;
using VecType = uint_bit_t<cute::min(128, vec_bits)>;
return copy_vec<VecType>(src, dst);
}
}
So the gdb should direct me into in-depth copy, but not “return operator()(make_coord(c0,c1,cs…));”
Why? Is it my usage problem? Thank you!!!