Thanks for your quick reply.
We look forward to using IBGDA in future releases.
Here is the linked .ll file after some standard llvm optimization(before inline):
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
%struct.nvshmemi_device_host_state_v1 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, ptr, i64, ptr, ptr, i8, i8, i32, ptr, ptr, ptr, ptr, ptr, %struct.gpu_coll_env_params_v2, ptr, ptr, ptr, ptr, i64, i64, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i64, i32, ptr, ptr, i8, i8, i8 }
%struct.gpu_coll_env_params_v2 = type { i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i64, i32, i32, i64, i64, [416 x i8] }
%struct.nvshmemi_version_t = type { i32, i32, i32 }
@nvshmemi_device_state_d = dso_local addrspace(4) externally_initialized global %struct.nvshmemi_device_host_state_v1 zeroinitializer, align 8
@nvshmemi_device_lib_version_d = dso_local addrspace(4) externally_initialized global %struct.nvshmemi_version_t { i32 3, i32 2, i32 5 }, align 4
@llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @nvshmemi_device_lib_version_d to ptr), ptr addrspacecast (ptr addrspace(4) @nvshmemi_device_state_d to ptr)], section "llvm.metadata"
define ptx_kernel void @ring_put(ptr addrspace(1) %0, ptr addrspace(1) %1) local_unnamed_addr !dbg !11 {
%3 = call fastcc i32 @nvshmem_my_pe(), !dbg !14
%4 = call fastcc i32 @nvshmem_n_pes(), !dbg !15
%5 = add i32 %3, 1, !dbg !16
%6 = srem i32 %5, %4, !dbg !17
%7 = call i32 @nvshmem_int_p(ptr addrspace(1) %0, i32 %3, i32 %6), !dbg !18
ret void, !dbg !19
}
; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define internal fastcc i32 @nvshmem_my_pe() unnamed_addr #0 {
%1 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 1), align 4, !tbaa !20
ret i32 %1
}
; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(none)
define internal fastcc i32 @nvshmem_n_pes() unnamed_addr #0 {
%1 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 2), align 8, !tbaa !30
ret i32 %1
}
; Function Attrs: alwaysinline mustprogress norecurse nounwind
define internal void @nvshmem_int_p(ptr noundef %0, i32 noundef %1, i32 noundef %2) unnamed_addr #1 {
%4 = load ptr, ptr addrspace(4) getelementptr inbounds nuw (i8, ptr addrspace(4) @nvshmemi_device_state_d, i64 56), align 8, !tbaa !31
%5 = sext i32 %2 to i64
%6 = getelementptr inbounds i64, ptr %4, i64 %5
%7 = addrspacecast ptr %6 to ptr addrspace(1)
%8 = load i64, ptr addrspace(1) %7, align 8, !invariant.load !13
%9 = icmp eq i64 %8, 0
br i1 %9, label %17, label %10
10: ; preds = %3
%11 = inttoptr i64 %8 to ptr
%12 = load ptr, ptr addrspace(4) getelementptr inbounds nuw (i8, ptr addrspace(4) @nvshmemi_device_state_d, i64 40), align 8, !tbaa !32
%13 = ptrtoint ptr %0 to i64
%14 = ptrtoint ptr %12 to i64
%15 = sub i64 %13, %14
%16 = getelementptr inbounds i8, ptr %11, i64 %15
store i32 %1, ptr %16, align 4, !tbaa !33
br label %18
17: ; preds = %3
notail call fastcc void @_Z23nvshmemi_transfer_rma_pIiEvPvT_i(ptr noundef %0, i32 noundef %1, i32 noundef %2) #4
br label %18
18: ; preds = %17, %10
ret void
}
; Function Attrs: mustprogress noinline norecurse nounwind
define internal fastcc void @_Z23nvshmemi_transfer_rma_pIiEvPvT_i(ptr noundef %0, i32 noundef %1, i32 noundef %2) unnamed_addr #2 section ".text.compute" {
%4 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 24), align 8, !tbaa !34
%5 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 11), align 8, !tbaa !32
%6 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 30), align 8, !tbaa !35
%7 = atomicrmw add ptr %6, i64 24 seq_cst, align 8
%8 = add i64 %7, 23
%9 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 32), align 8, !tbaa !36
%10 = load volatile i64, ptr %9, align 8, !tbaa !37
%11 = load i64, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 37), align 8, !tbaa !38
%12 = add i64 %11, -1
%13 = add i64 %12, %10
%14 = icmp ult i64 %13, %8
br i1 %14, label %15, label %23
15: ; preds = %3
%16 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 31), align 8, !tbaa !39
br label %17
17: ; preds = %17, %15
%18 = load volatile i64, ptr %16, align 8, !tbaa !37
%19 = add i64 %12, %18
%20 = icmp ult i64 %19, %8
br i1 %20, label %17, label %21, !llvm.loop !40
21: ; preds = %17
%22 = atomicrmw umax ptr %9, i64 %18 seq_cst, align 8
tail call void @llvm.nvvm.membar.sys()
br label %23
23: ; preds = %21, %3
%24 = and i64 %7, 4194303
%25 = getelementptr inbounds nuw i8, ptr %4, i64 %24
%26 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 38), align 8, !tbaa !43
%27 = zext nneg i32 %26 to i64
%28 = xor i64 %7, -1
%29 = lshr i64 %28, %27
%30 = and i64 %29, 1
%31 = ptrtoint ptr %0 to i64
%32 = ptrtoint ptr %5 to i64
%33 = sub i64 %31, %32
%34 = zext i32 %1 to i64
%35 = shl i64 %33, 24
%36 = or disjoint i64 %30, %35
%37 = or disjoint i64 %36, 131328
store volatile i64 %37, ptr %25, align 8, !tbaa !37
%38 = add i64 %7, 8
%39 = and i64 %38, 4194303
%40 = getelementptr inbounds nuw i8, ptr %4, i64 %39
%41 = sub i64 -9, %7
%42 = lshr i64 %41, %27
%43 = and i64 %42, 1
%44 = shl nuw i64 %34, 32
%45 = shl i32 %2, 16
%46 = zext i32 %45 to i64
%47 = or disjoint i64 %44, %46
%48 = or disjoint i64 %43, %47
store volatile i64 %48, ptr %40, align 8, !tbaa !37
%49 = add i64 %7, 16
%50 = and i64 %49, 4194303
%51 = getelementptr inbounds nuw i8, ptr %4, i64 %50
%52 = sub i64 -17, %7
%53 = lshr i64 %52, %27
%54 = and i64 %53, 1
%55 = or disjoint i64 %54, 262144
store volatile i64 %55, ptr %51, align 8, !tbaa !37
ret void
}
; Function Attrs: nocallback nounwind
declare void @llvm.nvvm.membar.sys() #3
attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_90" "target-features"="+ptx83,+sm_90" }
attributes #1 = { alwaysinline mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_90" "target-features"="+ptx83,+sm_90" }
attributes #2 = { mustprogress noinline norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_90" "target-features"="+ptx83,+sm_90" }
attributes #3 = { nocallback nounwind }
attributes #4 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.dbg.cu = !{!5}
!nvvm.annotations = !{!7}
!llvm.ident = !{!8, !9, !8}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!2 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]}
!3 = !{i32 1, !"wchar_size", i32 4}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = distinct !DICompileUnit(language: DW_LANG_C, file: !6, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
!6 = !DIFile(filename: "test_ring_put.py", directory: "test")
!7 = !{ptr @ring_put, !"reqntidx", i32 128}
!8 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!9 = !{!"clang version 18.1.4"}
!10 = !{i32 2, i32 0}
!11 = distinct !DISubprogram(name: "ring_put", linkageName: "ring_put", scope: !6, file: !6, line: 51, type: !12, scopeLine: 51, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !5)
!12 = !DISubroutineType(cc: DW_CC_normal, types: !13)
!13 = !{}
!14 = !DILocation(line: 52, column: 11, scope: !11)
!15 = !DILocation(line: 53, column: 11, scope: !11)
!16 = !DILocation(line: 54, column: 19, scope: !11)
!17 = !DILocation(line: 54, column: 24, scope: !11)
!18 = !DILocation(line: 55, column: 37, scope: !11)
!19 = !DILocation(line: 55, column: 4, scope: !11)
!20 = !{!21, !22, i64 4}
!21 = !{!"_ZTS29nvshmemi_device_host_state_v1", !22, i64 0, !22, i64 4, !22, i64 8, !22, i64 12, !22, i64 16, !25, i64 20, !22, i64 24, !22, i64 28, !22, i64 32, !26, i64 36, !26, i64 37, !27, i64 40, !28, i64 48, !27, i64 56, !27, i64 64, !26, i64 72, !26, i64 73, !22, i64 76, !27, i64 80, !27, i64 88, !27, i64 96, !27, i64 104, !27, i64 112, !29, i64 120, !27, i64 632, !27, i64 640, !27, i64 648, !27, i64 656, !28, i64 664, !28, i64 672, !27, i64 680, !27, i64 688, !27, i64 696, !27, i64 704, !27, i64 712, !27, i64 720, !27, i64 728, !28, i64 736, !22, i64 744, !27, i64 752, !27, i64 760, !26, i64 768, !26, i64 769, !26, i64 770}
!22 = !{!"int", !23, i64 0}
!23 = !{!"omnipotent char", !24, i64 0}
!24 = !{!"Simple C++ TBAA"}
!25 = !{!"_ZTS18nvshmemi_pe_dist_t", !23, i64 0}
!26 = !{!"bool", !23, i64 0}
!27 = !{!"any pointer", !23, i64 0}
!28 = !{!"long", !23, i64 0}
!29 = !{!"_ZTS22gpu_coll_env_params_v2", !22, i64 0, !22, i64 4, !22, i64 8, !22, i64 12, !22, i64 16, !22, i64 20, !22, i64 24, !28, i64 32, !28, i64 40, !28, i64 48, !22, i64 56, !28, i64 64, !22, i64 72, !22, i64 76, !28, i64 80, !28, i64 88, !23, i64 96}
!30 = !{!21, !22, i64 8}
!31 = !{!21, !27, i64 56}
!32 = !{!21, !27, i64 40}
!33 = !{!22, !22, i64 0}
!34 = !{!21, !27, i64 632}
!35 = !{!21, !27, i64 680}
!36 = !{!21, !27, i64 696}
!37 = !{!28, !28, i64 0}
!38 = !{!21, !28, i64 736}
!39 = !{!21, !27, i64 688}
!40 = distinct !{!40, !41, !42}
!41 = !{!"llvm.loop.mustprogress"}
!42 = !{!"llvm.loop.unroll.disable"}
!43 = !{!21, !22, i64 744}
after always inline pass(opt -passes=always-inline):
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
%struct.nvshmemi_device_host_state_v1 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i8, ptr, i64, ptr, ptr, i8, i8, i32, ptr, ptr, ptr, ptr, ptr, %struct.gpu_coll_env_params_v2, ptr, ptr, ptr, ptr, i64, i64, ptr, ptr, ptr, ptr, ptr, ptr, ptr, i64, i32, ptr, ptr, i8, i8, i8 }
%struct.gpu_coll_env_params_v2 = type { i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i64, i32, i32, i64, i64, [416 x i8] }
%struct.nvshmemi_version_t = type { i32, i32, i32 }
@nvshmemi_device_state_d = dso_local addrspace(4) externally_initialized global %struct.nvshmemi_device_host_state_v1 zeroinitializer, align 8
@nvshmemi_device_lib_version_d = dso_local addrspace(4) externally_initialized global %struct.nvshmemi_version_t { i32 3, i32 2, i32 5 }, align 4
@llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @nvshmemi_device_lib_version_d to ptr), ptr addrspacecast (ptr addrspace(4) @nvshmemi_device_state_d to ptr)], section "llvm.metadata"
define ptx_kernel void @ring_put(ptr addrspace(1) %0, ptr addrspace(1) %1) local_unnamed_addr !dbg !11 {
%3 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 1), align 4, !dbg !14, !tbaa !15
%4 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 2), align 8, !dbg !25, !tbaa !26
%5 = add i32 %3, 1, !dbg !27
%6 = srem i32 %5, %4, !dbg !28
%7 = call i32 @nvshmem_int_p(ptr addrspace(1) %0, i32 %3, i32 %6), !dbg !29
ret void, !dbg !30
}
; Function Attrs: alwaysinline mustprogress norecurse nounwind
define internal void @nvshmem_int_p(ptr noundef %0, i32 noundef %1, i32 noundef %2) unnamed_addr #0 {
%4 = load ptr, ptr addrspace(4) getelementptr inbounds nuw (i8, ptr addrspace(4) @nvshmemi_device_state_d, i64 56), align 8, !tbaa !31
%5 = sext i32 %2 to i64
%6 = getelementptr inbounds i64, ptr %4, i64 %5
%7 = addrspacecast ptr %6 to ptr addrspace(1)
%8 = load i64, ptr addrspace(1) %7, align 8, !invariant.load !13
%9 = icmp eq i64 %8, 0
br i1 %9, label %17, label %10
10: ; preds = %3
%11 = inttoptr i64 %8 to ptr
%12 = load ptr, ptr addrspace(4) getelementptr inbounds nuw (i8, ptr addrspace(4) @nvshmemi_device_state_d, i64 40), align 8, !tbaa !32
%13 = ptrtoint ptr %0 to i64
%14 = ptrtoint ptr %12 to i64
%15 = sub i64 %13, %14
%16 = getelementptr inbounds i8, ptr %11, i64 %15
store i32 %1, ptr %16, align 4, !tbaa !33
br label %18
17: ; preds = %3
notail call fastcc void @_Z23nvshmemi_transfer_rma_pIiEvPvT_i(ptr noundef %0, i32 noundef %1, i32 noundef %2) #3
br label %18
18: ; preds = %17, %10
ret void
}
; Function Attrs: mustprogress noinline norecurse nounwind
define internal fastcc void @_Z23nvshmemi_transfer_rma_pIiEvPvT_i(ptr noundef %0, i32 noundef %1, i32 noundef %2) unnamed_addr #1 section ".text.compute" {
%4 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 24), align 8, !tbaa !34
%5 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 11), align 8, !tbaa !32
%6 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 30), align 8, !tbaa !35
%7 = atomicrmw add ptr %6, i64 24 seq_cst, align 8
%8 = add i64 %7, 23
%9 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 32), align 8, !tbaa !36
%10 = load volatile i64, ptr %9, align 8, !tbaa !37
%11 = load i64, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 37), align 8, !tbaa !38
%12 = add i64 %11, -1
%13 = add i64 %12, %10
%14 = icmp ult i64 %13, %8
br i1 %14, label %15, label %23
15: ; preds = %3
%16 = load ptr, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 31), align 8, !tbaa !39
br label %17
17: ; preds = %17, %15
%18 = load volatile i64, ptr %16, align 8, !tbaa !37
%19 = add i64 %12, %18
%20 = icmp ult i64 %19, %8
br i1 %20, label %17, label %21, !llvm.loop !40
21: ; preds = %17
%22 = atomicrmw umax ptr %9, i64 %18 seq_cst, align 8
tail call void @llvm.nvvm.membar.sys()
br label %23
23: ; preds = %21, %3
%24 = and i64 %7, 4194303
%25 = getelementptr inbounds nuw i8, ptr %4, i64 %24
%26 = load i32, ptr addrspace(4) getelementptr inbounds (%struct.nvshmemi_device_host_state_v1, ptr addrspace(4) @nvshmemi_device_state_d, i64 0, i32 38), align 8, !tbaa !43
%27 = zext nneg i32 %26 to i64
%28 = xor i64 %7, -1
%29 = lshr i64 %28, %27
%30 = and i64 %29, 1
%31 = ptrtoint ptr %0 to i64
%32 = ptrtoint ptr %5 to i64
%33 = sub i64 %31, %32
%34 = zext i32 %1 to i64
%35 = shl i64 %33, 24
%36 = or disjoint i64 %30, %35
%37 = or disjoint i64 %36, 131328
store volatile i64 %37, ptr %25, align 8, !tbaa !37
%38 = add i64 %7, 8
%39 = and i64 %38, 4194303
%40 = getelementptr inbounds nuw i8, ptr %4, i64 %39
%41 = sub i64 -9, %7
%42 = lshr i64 %41, %27
%43 = and i64 %42, 1
%44 = shl nuw i64 %34, 32
%45 = shl i32 %2, 16
%46 = zext i32 %45 to i64
%47 = or disjoint i64 %44, %46
%48 = or disjoint i64 %43, %47
store volatile i64 %48, ptr %40, align 8, !tbaa !37
%49 = add i64 %7, 16
%50 = and i64 %49, 4194303
%51 = getelementptr inbounds nuw i8, ptr %4, i64 %50
%52 = sub i64 -17, %7
%53 = lshr i64 %52, %27
%54 = and i64 %53, 1
%55 = or disjoint i64 %54, 262144
store volatile i64 %55, ptr %51, align 8, !tbaa !37
ret void
}
; Function Attrs: nocallback nounwind
declare void @llvm.nvvm.membar.sys() #2
attributes #0 = { alwaysinline mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_90" "target-features"="+ptx83,+sm_90" }
attributes #1 = { mustprogress noinline norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_90" "target-features"="+ptx83,+sm_90" }
attributes #2 = { nocallback nounwind }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.dbg.cu = !{!5}
!nvvm.annotations = !{!7}
!llvm.ident = !{!8, !9, !8}
!nvvmir.version = !{!10}
!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
!2 = !{i32 2, !"SDK Version", [2 x i32] [i32 12, i32 3]}
!3 = !{i32 1, !"wchar_size", i32 4}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = distinct !DICompileUnit(language: DW_LANG_C, file: !6, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
!6 = !DIFile(filename: "test_ring_put.py", directory: "test")
!7 = !{ptr @ring_put, !"reqntidx", i32 128}
!8 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
!9 = !{!"clang version 18.1.4"}
!10 = !{i32 2, i32 0}
!11 = distinct !DISubprogram(name: "ring_put", linkageName: "ring_put", scope: !6, file: !6, line: 51, type: !12, scopeLine: 51, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !5)
!12 = !DISubroutineType(cc: DW_CC_normal, types: !13)
!13 = !{}
!14 = !DILocation(line: 52, column: 11, scope: !11)
!15 = !{!16, !17, i64 4}
!16 = !{!"_ZTS29nvshmemi_device_host_state_v1", !17, i64 0, !17, i64 4, !17, i64 8, !17, i64 12, !17, i64 16, !20, i64 20, !17, i64 24, !17, i64 28, !17, i64 32, !21, i64 36, !21, i64 37, !22, i64 40, !23, i64 48, !22, i64 56, !22, i64 64, !21, i64 72, !21, i64 73, !17, i64 76, !22, i64 80, !22, i64 88, !22, i64 96, !22, i64 104, !22, i64 112, !24, i64 120, !22, i64 632, !22, i64 640, !22, i64 648, !22, i64 656, !23, i64 664, !23, i64 672, !22, i64 680, !22, i64 688, !22, i64 696, !22, i64 704, !22, i64 712, !22, i64 720, !22, i64 728, !23, i64 736, !17, i64 744, !22, i64 752, !22, i64 760, !21, i64 768, !21, i64 769, !21, i64 770}
!17 = !{!"int", !18, i64 0}
!18 = !{!"omnipotent char", !19, i64 0}
!19 = !{!"Simple C++ TBAA"}
!20 = !{!"_ZTS18nvshmemi_pe_dist_t", !18, i64 0}
!21 = !{!"bool", !18, i64 0}
!22 = !{!"any pointer", !18, i64 0}
!23 = !{!"long", !18, i64 0}
!24 = !{!"_ZTS22gpu_coll_env_params_v2", !17, i64 0, !17, i64 4, !17, i64 8, !17, i64 12, !17, i64 16, !17, i64 20, !17, i64 24, !23, i64 32, !23, i64 40, !23, i64 48, !17, i64 56, !23, i64 64, !17, i64 72, !17, i64 76, !23, i64 80, !23, i64 88, !18, i64 96}
!25 = !DILocation(line: 53, column: 11, scope: !11)
!26 = !{!16, !17, i64 8}
!27 = !DILocation(line: 54, column: 19, scope: !11)
!28 = !DILocation(line: 54, column: 24, scope: !11)
!29 = !DILocation(line: 55, column: 37, scope: !11)
!30 = !DILocation(line: 55, column: 4, scope: !11)
!31 = !{!16, !22, i64 56}
!32 = !{!16, !22, i64 40}
!33 = !{!17, !17, i64 0}
!34 = !{!16, !22, i64 632}
!35 = !{!16, !22, i64 680}
!36 = !{!16, !22, i64 696}
!37 = !{!23, !23, i64 0}
!38 = !{!16, !23, i64 736}
!39 = !{!16, !22, i64 688}
!40 = distinct !{!40, !41, !42}
!41 = !{!"llvm.loop.mustprogress"}
!42 = !{!"llvm.loop.unroll.disable"}
!43 = !{!16, !17, i64 744}
we can see that only nvshmem_int_p
is not inlined.
Thanks!