How can I dive into a function using cuda-gdb?

Hi! I am learning this code: https://github.com/NVIDIA/cutlass/blob/main/examples/cute/tutorial/sgemm_nt_1.cu

I want to dive into line 120, which leads into:

  return local_partition(std::forward<Tensor>(tensor),
                         dice(proj, tile),
                         index);

A researcher told me that if I keep entering “step”, the debugger will first enter “std::forward<Tensor>(tensor)”, then “dice(proj, tile)”, then — theoretically — “index”, and finally the local_partition function itself. But it stopped before “index”. Why???

--Type <RET> for more, q to quit, c to continue without paging--
    at /home/zyhuang/temp_can/../cutlass/include/cute/tensor.hpp:930
930       return local_partition(std::forward<Tensor>(tensor),
(cuda-gdb) s
128[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (160,0,0), device 0, sm 0, warp 5, lane 0]
std::forward<cute::Tensor<cute::ViewEngine<cute::smem_ptr<float*> >, cute::Layout<cute::tuple<cute::C<128>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<128> > > >&> (__t=...) at /usr/include/c++/9/bits/move.h:75
75          { return static_cast<_Tp&&>(__t); }
(cuda-gdb) s
cute::local_partition<cute::Tensor<cute::ViewEngine<cute::smem_ptr<float*> >, cute::Layout<cute::tuple<cute::C<128>, cute::C<8> >, cute::tuple<cute::C<1>, cute::C<128> > > >&, cute::tuple<cute::C<16>, cute::C<16> >, cute::tuple<cute::C<1>, cute::C<16> >, unsigned int, cute::tuple<cute::C<1>, cute::Underscore>, (void*)0><<<(40,40,1),(256,1,1)>>> (
    tensor=..., tile=..., index=<error reading variable>, proj=...)
--Type <RET> for more, q to quit, c to continue without paging--
    at /home/zyhuang/temp_can/../cutlass/include/cute/tensor.hpp:931
931                              dice(proj, tile),
(cuda-gdb) s
cute::dice<cute::tuple<cute::C<1>, cute::Underscore>, cute::tuple<cute::C<16>, cute::C<16> >, cute::tuple<cute::C<1>, cute::C<16> > > (c=..., layout=...) at /home/zyhuang/temp_can/../cutlass/include/cute/layout.hpp:658
658       return make_layout(dice(c, layout.shape()),
(cuda-gdb) s
cute::Layout<cute::tuple<cute::C<16>, cute::C<16> >, cute::tuple<cute::C<1>, cute::C<16> > >::shape<>() const (
    this=0x7ffff2fff8b1) at /home/zyhuang/temp_can/../cutlass/include/cute/layout.hpp:125
125         return get<0,I...>(static_cast<cute::tuple<Shape, Stride> const&>(*this));
(cuda-gdb) s
cute::dice<cute::tuple<cute::C<1>, cute::Underscore>, cute::tuple<cute::C<16>, cute::C<16> > > (a=..., b=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/layout.hpp:658
658       return make_layout(dice(c, layout.shape()),
(cuda-gdb) s
170         return filter_tuple(a, b, [](auto const& x, auto const& y) { return detail::lift_dice(x,y); });
(cuda-gdb) s
cute::filter_tuple<cute::tuple<cute::C<1>, cute::Underscore>, cute::tuple<cute::C<16>, cute::C<16> >, cute::dice<cute::tuple<cute::C<1>, cute::Underscore>, cute::tuple<cute::C<16>, cute::C<16> > >(cute::tuple<cute::C<1>, cute::Underscore> const&, cute::tuple<cute::C<16>, cute::C<16> > const&)::{lambda(auto:1 const&, auto:2 const&)#1}>(cute::tuple<cute::C<1>, cute::Underscore> const&, cute::tuple<cute::C<16>, cute::C<16> > const&, cute::dice<cute::tuple<cute::C<1>, cute::U--Type <RET> for more, q to quit, c to continue without paging--
nderscore>, cute::tuple<cute::C<16>, cute::C<16> > >(cute::tuple<cute::C<1>, cute::Underscore> const&, cute::tuple<cute::C<16>, cute::C<16> > const&)::{lambda(auto:1 const&, auto:2 const&)#1}&&) (t0=..., t1=..., f=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/algorithm/tuple_algorithms.hpp:416
416       return transform_apply(t0, t1, f, [](auto const&... a) { return cute::tuple_cat(a...); });
(cuda-gdb) s
_ZN4cute15transform_applyIRKNS_5tupleIJNS_1CILi1EEENS_10UnderscoreEEEERKNS1_IJNS2_ILi16EEES8_EEERZNS_4diceIS5_S9_EEDaRKT_RKT0_EUlSF_SI_E_ZNS_12filter_tupleIS5_S9_SJ_EEDaSF_SI_OT1_EUlDpSF_E_EEDaOSD_OSG_SN_OT2_ (t0=..., t1=..., f=..., 
    g=...) at /home/zyhuang/temp_can/../cutlass/include/cute/algorithm/tuple_algorithms.hpp:156
156         return detail::tapply(static_cast<T0&&>(t0), static_cast<T1&&>(t1), f, g, tuple_seq<T0>{});
(cuda-gdb) s
_ZN4cute6detail6tapplyIRKNS_5tupleIJNS_1CILi1EEENS_10UnderscoreEEEERKNS2_IJNS3_ILi16EEES9_EEERZNS_4diceIS6_SA_EEDaRKT_RKT0_EUlSG_SJ_E_RZNS_12filter_tupleIS6_SA_SK_EEDaSG_SJ_OT1_EUlDpSG_E_JLi0ELi1EEEEDaOSE_OSH_SO_OT2_St16integer_sequenceIiJXT3_EEE (t0=..., t1=..., f=..., g=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/algorithm/tuple_algorithms.hpp:122
--Type <RET> for more, q to quit, c to continue without paging--
122       return g(f(get<I>(static_cast<T0&&>(t0)),
(cuda-gdb) s
cute::get<0ul, cute::C<1>, cute::Underscore> (t=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/container/tuple.hpp:204
204       return detail::getv<I>(t);
(cuda-gdb) s
_ZN4cute6detail6tapplyIRKNS_5tupleIJNS_1CILi1EEENS_10UnderscoreEEEERKNS2_IJNS3_ILi16EEES9_EEERZNS_4diceIS6_SA_EEDaRKT_RKT0_EUlSG_SJ_E_RZNS_12filter_tupleIS6_SA_SK_EEDaSG_SJ_OT1_EUlDpSG_E_JLi0ELi1EEEEDaOSE_OSH_SO_OT2_St16integer_sequenceIiJXT3_EEE (t0=..., t1=..., f=..., g=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/algorithm/tuple_algorithms.hpp:122
--Type <RET> for more, q to quit, c to continue without paging--
122       return g(f(get<I>(static_cast<T0&&>(t0)),
(cuda-gdb) s
123                  get<I>(static_cast<T1&&>(t1)))...);
(cuda-gdb) s
cute::get<0ul, cute::C<16>, cute::C<16> > (t=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/container/tuple.hpp:204
204       return detail::getv<I>(t);
(cuda-gdb) s
_ZN4cute6detail6tapplyIRKNS_5tupleIJNS_1CILi1EEENS_10UnderscoreEEEERKNS2_IJNS3_ILi16EEES9_EEERZNS_4diceIS6_SA_EEDaRKT_RKT0_EUlSG_SJ_E_RZNS_12filter_tupleIS6_SA_SK_EEDaSG_SJ_OT1_EUlDpSG_E_JLi0ELi1EEEEDaOSE_OSH_SO_OT2_St16integer_sequenceIiJXT3_EEE (t0=..., t1=..., f=..., g=...)
    at /home/zyhuang/temp_can/../cutlass/include/cute/algorithm/tuple_algorithms.hpp:123
--Type <RET> for more, q to quit, c to continue without paging--
123                  get<I>(static_cast<T1&&>(t1)))...);
(cuda-gdb) s
122       return g(f(get<I>(static_cast<T0&&>(t0)),

Thank you!!

Short answer: index is a reference. We are passing the reference to the local_partition call. There isn’t any other method called in this case.

Line 120 in sgemm_nt_1.cu looks like the following:

  // Partition gC (M,N) by the tile of tC
  auto tCgC = local_partition(gC, tC, threadIdx.x, Step<_1,_1>{});   // (THR_M,THR_N)

When we step into local_partition, eventually we will wind up at:

// Same as above, but with a projection parameter to strip out unwanted tiling modes for convenience
//   when using projections of the same tiler.
// This is typical at the Thread level where data is partitioned across projected layouts of threads:
//   Tensor dataA = ...                                                            // (M,K)
//   Tensor dataB = ...                                                            // (N,K)
//   Tensor dataC = ...                                                            // (M,N)
//   auto thr_layout = Layout<Shape<_2,_16,_1>, Stride<_16,_1,_0>>{};
//   Tensor thrA = local_partition(dataA, thr_layout, thr_idx, Step<_1, X,_1>{});  // (M/2,K/1)
//   Tensor thrB = local_partition(dataB, thr_layout, thr_idx, Step< X,_1,_1>{});  // (N/16,K/1)
//   Tensor thrC = local_partition(dataC, thr_layout, thr_idx, Step<_1,_1, X>{});  // (M/2,N/16)
template <class Tensor, class LShape, class LStride, class Index, class Projection,
          __CUTE_REQUIRES(is_tensor<remove_cvref_t<Tensor>>::value)>
CUTE_HOST_DEVICE
auto
local_partition(Tensor                     && tensor,
                Layout<LShape,LStride> const& tile,   // coord -> index
                Index                  const& index,  // index to slice for
                Projection             const& proj)
{
  return local_partition(std::forward<Tensor>(tensor),
                         dice(proj, tile),
                         index);
}

The return statement will result in method calls for arguments 1 and 2, but argument 3 is just a reference. There is no method called in order to pass index to local_partition.

If you want to avoid stepping into the argument-evaluation calls, one strategy is to set a breakpoint. For example, if we set a breakpoint on local_partition and continue, we would skip over the other methods entirely.

Oh, you mean set multiple breakpoints: the outer function gets a breakpoint, and once inside, set another breakpoint on the inner function; then continue to reach it instead of stepping. Right?

Yes! Using breakpoints will avoid the need to step through the entire template function. You can use conditional breakpoints to specify a particular threadIdx as well. For example, if you have an existing breakpoint 1 you can say:

condition 1 threadIdx.x == 0
1 Like

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.