NCU : ==ERROR== The application returned an error code (9)

Hello,

I am trying to launch my code with NCU for profiling. The code runs four streams of CUTLASS GEMV kernels. The GPU I am using is an H100, and the total occupied memory should not exceed the H100's capacity.
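
For context, the launch pattern looks roughly like this (a simplified sketch, not my exact code; run_cutlass_gemv_row_major and the Gemv type are shown further down, and the per-stream device buffers d_A, d_x, d_y are placeholders):

// Four independent GEMVs, one per stream
cudaStream_t streams[4];
for (int i = 0; i < 4; ++i) {
    cudaStreamCreate(&streams[i]);
}
for (int i = 0; i < 4; ++i) {
    run_cutlass_gemv_row_major<Gemv>(M, K, alpha, beta,
                                     d_A[i], d_x[i], d_y[i], d_y[i],
                                     streams[i]);
}
cudaDeviceSynchronize();
for (int i = 0; i < 4; ++i) {
    cudaStreamDestroy(streams[i]);
}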

When I try to profile the 4 streams, I run into this error:

==PROF== Profiling "Kernel" - 0 (1/10): 0%....50%....100% - 1 pass
==PROF== Profiling "Kernel" - 1 (2/10): 0%....50%....100% - 1 pass
==PROF== Profiling "Kernel" - 2 (3/10): 0%....50%....100% - 1 pass
==PROF== Profiling "Kernel" - 3 (4/10): 0%....50%....100% - 1 pass
==ERROR== An error was reported by the driver:
==ERROR== LaunchFailed
==PROF== Trying to shutdown target application
==ERROR== The application returned an error code (9).

I am unsure why exactly this is happening… I have also toned down the compiler flags in my CMake so as not to optimise aggressively:

cmake_minimum_required(VERSION 3.18)
project(LINEAR_ALGEBRA_PREFETCHER LANGUAGES CXX CUDA)

# Use C++17
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Set CUDA compiler
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

# Force Release build for maximum performance
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

# CUTLASS directory (points to parent of "cutlass/")
set(CUTLASS_MAIN_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/include")
set(CUTLASS_UTIL_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/tools/util/include")

# cnpy directories (referenced below via CNPY_INCLUDE_DIR / CNPY_LIB_DIR)
set(CNPY_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cnpy/install/include")
set(CNPY_LIB_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cnpy/install/lib")

# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ftz=true")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --prec-div=false")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --prec-sqrt=false")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=true")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -ffast-math")

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")  
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g --generate-line-info")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")

# Source files
set(SOURCES
    src/main.cu
    src/markov_structure.cu
    src/prefetcher_kernels.cu
    src/utils.cu
    src/math.cu
)

# Executable
add_executable(main ${SOURCES})
set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CUDA)

# H100 CUDA target properties
set_target_properties(main PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 90  # Perfect for H100
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# Include directories
target_include_directories(main PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/tools/util/include
    ${CNPY_INCLUDE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/cnpy
)


# Find packages
find_package(ZLIB REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Link libraries
target_link_directories(main PRIVATE ${CNPY_LIB_DIR})
target_link_libraries(main PRIVATE 
    cnpy 
    ZLIB::ZLIB 
    CUDA::cublas 
    CUDA::cudart
    CUDA::cublasLt
)

I am sure the command I use is correct:

ncu -o "${OUTFILE_50%.csv}_profile" \
        --target-processes all \
        --log-file "${OUTFILE_50%.csv}_profile.txt" \
        --force-overwrite \
        --launch-skip 1 --launch-count 10 \
        --profile-from-start yes \
        ./main $CMD_CODE "$OUTFILE_50" $TRAINING_FILE 0 0 0 $M 1 $K $MATRIX_FILE

Thank you very much; any help or insight is appreciated.

What exact versions of ncu (ncu --version), driver (nvidia-smi), and CUDA toolkit are you using?

Hi Felix,

I am using

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2025.1.0.0 (build 35237751) (public-release)

and for the driver:

NVIDIA-SMI 535.86.10 Driver Version: 535.86.10 CUDA Version: 12.2

My wrapper function for the cutlass_gemv:

template <typename Gemv>
void run_cutlass_gemv_row_major(
    int n,                 // M dimension (rows of A)
    int k,                 // K dimension (cols of A)
    float alpha,
    float beta,
    const float* A_device, 
    const float* x_device, 
    const float* c_device, 
    float* d_device,      
    cudaStream_t stream = nullptr
) {
    using ElementA = float;
    using ElementB = float;
    using ElementC = float;
    using LayoutA = cutlass::layout::RowMajor;

    cutlass::MatrixCoord problem_size(n, k);
    int batch_count = 1;

    int64_t batch_stride_A = int64_t(n) * int64_t(k); // row-major layout
    int64_t batch_stride_B = k;
    int64_t batch_stride_C = n;
    int64_t batch_stride_D = n;

    // TensorRef for A in row-major
    cutlass::TensorRef<ElementA, LayoutA> ref_A(
        const_cast<ElementA *>(A_device),
        LayoutA::packed(cutlass::make_Coord(n, k))
    );

    const void* ptr_B = static_cast<const void*>(x_device);
    const void* ptr_C = static_cast<const void*>(c_device);
    void* ptr_D = static_cast<void*>(d_device);

    typename Gemv::Arguments arguments{
        problem_size,
        batch_count,
        typename Gemv::EpilogueOutputOp::Params(alpha, beta),
        ref_A,
        ptr_B,
        ptr_C,
        ptr_D,
        batch_stride_A,
        batch_stride_B,
        batch_stride_C,
        batch_stride_D
    };

    // Construct GEMV operator
    Gemv gemv_op;

    auto status = gemv_op.can_implement(arguments);
    printf("can_implement: %d\n", int(status));

    size_t workspace_size = gemv_op.get_workspace_size(arguments);
    void* workspace = nullptr;
    if (workspace_size > 0) {
        cudaMalloc(&workspace, workspace_size);
        printf("Allocated workspace of size %lu\n", workspace_size);
    }

    status = gemv_op.initialize(arguments, workspace, stream);
    printf("initialize: %d\n", int(status));

    status = gemv_op(stream);
    printf("gemv kernel launch: %d\n", int(status));

    if (workspace) {
        cudaFree(workspace);
    }

    printf("GEMV operation completed\n");
}
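
(Side note: I know the raw status printfs above are not very readable. Here is a sketch of stricter checking I could wrap around the same calls; cutlassGetStatusString() comes from cutlass/cutlass.h, and the cudaStreamSynchronize() / cudaGetLastError() pair should surface the launch failure directly:)

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cutlass/cutlass.h>

// Fail loudly instead of printing raw status integers.
static void check_cutlass(cutlass::Status status, const char* what) {
    if (status != cutlass::Status::kSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cutlassGetStatusString(status));
        exit(EXIT_FAILURE);
    }
}

static void check_cuda(const char* what) {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Usage inside the wrapper:
//   check_cutlass(gemv_op.can_implement(arguments), "can_implement");
//   check_cutlass(gemv_op.initialize(arguments, workspace, stream), "initialize");
//   check_cutlass(gemv_op(stream), "gemv launch");
//   cudaStreamSynchronize(stream);
//   check_cuda("gemv execution");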

Another thing: if I use my optimised compiler flags:

cmake_minimum_required(VERSION 3.18)
project(LINEAR_ALGEBRA_PREFETCHER LANGUAGES CXX CUDA)

# Use C++17
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Set CUDA compiler
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)

# Force Release build for maximum performance
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

# CUTLASS directory (points to parent of "cutlass/")
set(CUTLASS_MAIN_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/include")
set(CUTLASS_UTIL_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/tools/util/include")

# cnpy directories
set(CNPY_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cnpy/install/include")
set(CNPY_LIB_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cnpy/install/lib")

# H100-optimized CUDA flags
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -use_fast_math")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ftz=true")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --prec-div=false")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --prec-sqrt=false")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --fmad=true")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -ffast-math")

# H100 specific optimizations
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -maxrregcount=255")  # Cap registers per thread at 255
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --ptxas-options=-v")  # Verbose PTX info

# Release optimization flags
set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG -use_fast_math --ftz=true --prec-div=false --prec-sqrt=false -maxrregcount=255")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -ffast-math -march=native")

# Source files
set(SOURCES
    src/main.cu
    src/markov_structure.cu
    src/prefetcher_kernels.cu
    src/utils.cu
    src/math.cu
)

# Executable
add_executable(main ${SOURCES})
set_source_files_properties(${SOURCES} PROPERTIES LANGUAGE CUDA)

# H100 CUDA target properties
set_target_properties(main PROPERTIES
    CUDA_SEPARABLE_COMPILATION ON
    CUDA_ARCHITECTURES 90  # Perfect for H100
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
)

# Include directories
target_include_directories(main PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/include
    ${CMAKE_CURRENT_SOURCE_DIR}/include/cutlass/tools/util/include
    ${CNPY_INCLUDE_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}/cnpy
)

# H100-optimized compiler definitions
target_compile_definitions(main PRIVATE
    CUTLASS_ENABLE_TENSOR_CORE_MMA=1
    CUTLASS_DEBUG_TRACE_LEVEL=0
    CUTLASS_NAMESPACE=cutlass
    CUTLASS_ENABLE_F16C=1
    CUTLASS_ENABLE_BF16=1
    NDEBUG  # Ensure release optimizations
)

# Find packages
find_package(ZLIB REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Link libraries
target_link_directories(main PRIVATE ${CNPY_LIB_DIR})
target_link_libraries(main PRIVATE 
    cnpy 
    ZLIB::ZLIB 
    CUDA::cublas 
    CUDA::cudart
    CUDA::cublasLt  # Add cuBLASLt for H100 optimizations
)

# H100 specific compile options
target_compile_options(main PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
    $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>
    $<$<COMPILE_LANGUAGE:CUDA>:--ftz=true>
    $<$<COMPILE_LANGUAGE:CUDA>:--prec-div=false>
    $<$<COMPILE_LANGUAGE:CUDA>:--prec-sqrt=false>
    $<$<COMPILE_LANGUAGE:CUDA>:-maxrregcount=255>
    $<$<COMPILE_LANGUAGE:CXX>:-ffast-math>
    $<$<COMPILE_LANGUAGE:CXX>:-march=native>
)

and run ncu like this:

ncu -o "${OUTFILE_50%.csv}_profile" \
        --target-processes all \
        --log-file "${OUTFILE_50%.csv}_profile.txt" \
        --force-overwrite \
        --launch-skip 1 --launch-count 10 \
        --profile-from-start yes \
        --replay-mode application \
        ./main $CMD_CODE "$OUTFILE_50" $TRAINING_FILE 0 0 0 $M 1 $K $MATRIX_FILE

I get a different error:

==PROF== Connected to process 3096133 (/linear_algebra_prefetcher/build/main)
==PROF== Profiling "Kernel" - 0 (1/10): Application replay pass 1
==PROF== Profiling "Kernel" - 1 (2/10): Application replay pass 1
==PROF== Profiling "Kernel" - 2 (3/10): Application replay pass 1
==PROF== Profiling "Kernel" - 3 (4/10): Application replay pass 1
==PROF== Disconnected from process 3096133
==PROF== Connected to process 3096158 (/build/main)
==PROF== Profiling "Kernel" - 0 (1/10): Application replay pass 2
==ERROR== An error was reported by the driver:
==ERROR== LaunchFailed
==PROF== Trying to shutdown target application
==ERROR== An error occurred while trying to profile.
==ERROR== Unexpected number of profiled kernels. Application replay requires that the execution, combined with selected filters, guarantees a consistent set of kernels in all passes.
==ERROR== Check the --app-replay-match and --app-replay-mode options for different matching strategies.

Given all my updates above, do you have any idea why my profiling could be failing? I feel like I have tried every option so far.

Since you are using CUTLASS, do you know if your kernel uses thread block clusters (CGAs) in combination with CUDA Graphs? If so, there is a known issue that will be resolved in the next driver and tool release. In the meantime, you can try to launch your kernel outside of a CUDA graph to profile it with ncu.

Hi Felix,

I am not aware of my CUTLASS code using CGAs with CUDA Graphs; at least I have not defined either explicitly. I am using CUTLASS GEMV; you can see the code snippet of my wrapper function above. The only other APIs I use are streams and events, for execution-time benchmarks.
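
Is there a programmatic way to verify this? I was thinking of something along these lines (just a sketch; the cluster fields of cudaFuncAttributes are the ones I believe the CUDA 12 runtime exposes, and my_kernel is a stand-in for the real kernel symbol):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void my_kernel() {}  // placeholder for the actual kernel entry point

int main() {
    cudaFuncAttributes attr{};
    cudaFuncGetAttributes(&attr, my_kernel);
    // A kernel that never requires a cluster shape is not using CGAs.
    printf("clusterDimMustBeSet=%d requiredCluster=%dx%dx%d\n",
           attr.clusterDimMustBeSet,
           attr.requiredClusterWidth,
           attr.requiredClusterHeight,
           attr.requiredClusterDepth);
    return 0;
}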

Does everything else look correct? Is this just a Nsight Compute issue?

Are my compiler flags too heavy on optimisation?

It might be a bit of a big ask, but would you be willing to take the code snippet I have given you of my CUTLASS GEMV wrapper, run it with 4 streams executing 4 different GEMVs asynchronously, and try profiling it with NCU?

My experiment runs 4 different GEMVs at the same time, all with the same dimensions, on relatively large [16384, 16384] matrices.

You can also try running 1 GEMV with 1 stream; that is not working for me either…

Do you think that would be feasible for you to try?

Is there anything else I could help with to make it easier for you?

I have also provided the main() function here for you to try; you would just need to create d_matrix and d_input_vector on the device, which covers the 1-stream case. For the other experiment you would need to add 3 more streams:

int main() {
    // Create Matrix and Input Vector here for d_matrix and d_input_vector
    // (see the setup sketch after this block)
    using ElementInput = float;
    using ElementOutput = float;
    using LayoutA = cutlass::layout::RowMajor;
    using ElementAccumulator = float;
    int const kElementsPerAccess = 4;
    int const kThreadCount = 1024;
    int const kThreadsPerRow = 64;

    using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
        ElementOutput,
        1,
        ElementAccumulator,
        ElementAccumulator>;

    using Gemv = cutlass::gemm::device::Gemv<
        cutlass::gemm::kernel::Gemv<
            ElementInput,
            LayoutA,
            ElementInput,
            ElementOutput,
            ElementAccumulator,
            EpilogueOp,
            kElementsPerAccess,
            kThreadCount,
            kThreadsPerRow
        >
    >;

    float* output;
    float* empty;
    size_t size = N * M;
    cudaMalloc(&output, sizeof(float) * size);
    cudaMalloc(&empty, sizeof(float) * size);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Timing setup
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, stream);

    cudaProfilerStart();

    run_cutlass_gemv_row_major<Gemv>(M, K, alpha, beta, d_matrix, d_input_vector, output, empty, stream);
    cudaDeviceSynchronize();

    cudaProfilerStop();

    cudaEventRecord(stop, stream);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("GEMV time: %.3f ms\n", milliseconds);

    return 0;
}
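
For the part I elided at the top of main() ("Create Matrix and Input Vector here"), something like this should work. This is just a sketch: M = K = 16384 matches my experiment, N = 1 since the input is a vector, and the alpha/beta values and zero-fill are arbitrary examples:

// Hypothetical setup for d_matrix and d_input_vector
int M = 16384, K = 16384, N = 1;   // dimensions from my experiment
float alpha = 1.0f, beta = 0.0f;   // example values, not from my real run

float *d_matrix, *d_input_vector;
cudaMalloc(&d_matrix, sizeof(float) * size_t(M) * size_t(K));
cudaMalloc(&d_input_vector, sizeof(float) * size_t(K));
cudaMemset(d_matrix, 0, sizeof(float) * size_t(M) * size_t(K));  // or copy real data from host
cudaMemset(d_input_vector, 0, sizeof(float) * size_t(K));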

Thank you very much for your help, hope to hear from you soon!

I asked the team to see if it can be reproduced internally. I will update here once I have more information.

Hi, @qikch

Does this error only happen with your CUTLASS sample?
If possible, can you try a simple CUDA sample instead, e.g. from the NVIDIA/cuda-samples repository (https://github.com/NVIDIA/cuda-samples), which demonstrates features in the CUDA Toolkit?

Also, I noticed you are using an NCU release for CUDA 12.8 but a driver for CUDA 12.2.
Can you also update your driver and try again?
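
Even a trivial kernel would be enough for this sanity check. For example, a minimal sketch like the following (build with nvcc -arch=sm_90 and profile with a plain ncu ./test):

#include <cuda_runtime.h>

// Minimal kernel just to confirm that ncu can profile anything on this setup.
__global__ void axpy(int n, float a, const float* x, float* y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main() {
    const int n = 1 << 20;
    float *x, *y;
    cudaMalloc(&x, n * sizeof(float));
    cudaMalloc(&y, n * sizeof(float));
    cudaMemset(x, 0, n * sizeof(float));
    cudaMemset(y, 0, n * sizeof(float));
    axpy<<<(n + 255) / 256, 256>>>(n, 2.0f, x, y);
    cudaDeviceSynchronize();
    cudaFree(x);
    cudaFree(y);
    return 0;
}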

Hi @veraj,

So far, I have only tested the CUTLASS GEMV sample I shared earlier. I will try profiling some of the other CUDA samples from the GitHub repository with NCU by the end of the week, and will follow up with more detailed information.

Please let me know if there’s anything else I can assist with in the meantime.
Thank you again for your response.