cuptiActivityEnableLatencyTimestamps(1) hangs the process unexpectedly

Hello, I’m using CUPTI with InitializeInjection to trace the kernel activity of a target user process.

When I enable cuptiActivityEnableLatencyTimestamps(1), my process hangs unexpectedly.

Below are my code and the resulting logs.

injection.cpp

#include <time.h>
#include <unistd.h>

#include <iostream>
using ::std::cerr;
using ::std::cout;
using ::std::endl;

#include <mutex>
using ::std::mutex;

#include <string>
using ::std::string;

#include <unordered_map>
using ::std::unordered_map;

#include <unordered_set>
using ::std::unordered_set;

#include <vector>
using ::std::vector;

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// CUDA headers
#include <cuda.h>
#include <cuda_runtime_api.h>

// CUPTI headers
#include <cupti_callbacks.h>
#include <cupti_profiler_target.h>
#include <cupti_target.h>
#include <cupti_activity.h>
#include "helper_cupti_activity.h"

// Macros
// Export InitializeInjection symbol.
#ifdef _WIN32
#define DLLEXPORT __declspec(dllexport)
#define HIDDEN
#else
#define DLLEXPORT __attribute__((visibility("default")))
#define HIDDEN __attribute__((visibility("hidden")))
#endif

static int currentFileIndex = 0;
static char globalTimestamp[24] = {0}; // Global timestamp variable
static bool injectionInitialized = false;
static FILE*
SetupCupti(int file_index)
{
    pid_t pid = getpid();
    std::string baseFilename = "cuda_activity_log";
    std::string filename = "/tmp/" + baseFilename + "_" + globalTimestamp + "_" + std::to_string(pid) + "_" + std::to_string(file_index) + ".log";
    FILE* pFileHandle = fopen(filename.c_str(), "w");
    UserData *pUserData = (UserData *)malloc(sizeof(UserData));
    MEMORY_ALLOCATION_CALL(pUserData);

    memset(pUserData, 0, sizeof(UserData));
    pUserData->pPostProcessActivityRecords = NULL;
    pUserData->printActivityRecords = 1;
    //printf("SetupCupti\n");
    // Common CUPTI Initialization
    InitCuptiTrace(pUserData, NULL, pFileHandle);

    // Device activity record is created when CUDA initializes, so we
    // want to enable it before cuInit() or any CUDA runtime call.

    // CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
    // CUPTI_API_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
    return pFileHandle;
}



// InitializeInjection will be called by the driver when the tool is loaded
// by CUDA_INJECTION64_PATH
extern "C" DLLEXPORT int
InitializeInjection()
{   
    if (injectionInitialized == false)
    {
        completedFlag = false;
        injectionInitialized = true;
        FILE* pFileHandle = SetupCupti(0);
        CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
        CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
        CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));

        CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION));

    }
    return 1;
}

helper_cupti_activity.h (only the modified parts are shown)


// Buffer Management Functions
static void CUPTIAPI BufferRequested(uint8_t **ppBuffer, size_t *pSize, size_t *pMaxNumRecords) {
    printf("BufferRequested\n");

    uint8_t *pBuffer = (uint8_t *) malloc(globals.activityBufferSize + ALIGN_SIZE);
    MEMORY_ALLOCATION_CALL(pBuffer);
    *pSize = globals.activityBufferSize;
    *ppBuffer = (uint8_t *)ALIGN_BUFFER(pBuffer, ALIGN_SIZE);
    *pMaxNumRecords = 500;
}

static void CUPTIAPI BufferCompleted(CUcontext context, uint32_t streamId, uint8_t *pBuffer, size_t size, size_t validSize) {
    //pthread_mutex_lock(&bufferMutex);

    if (validSize > 0) {
        FILE *pOutputFile = globals.pOutputFile;
        if (!pOutputFile) {
            pOutputFile = stdout;
        }

        PrintActivityBuffer(pBuffer, validSize, pOutputFile, globals.pUserData);
        printf("BufferCompleted\n");
    }

    // Free the buffer in both branches; otherwise every processed buffer leaks.
    if (pBuffer != NULL) {
        free(pBuffer);
    }
}

// CUPTI Trace Setup
void
InitCuptiTrace(
    void *pUserData,
    void *pTraceCallback,
    FILE *pFileHandle)
{
    if (!pUserData)
    {
        fprintf(stderr, "Invalid parameter userData\n");
        exit(EXIT_FAILURE);
    }

    globals.pOutputFile  = pFileHandle;
    globals.pUserData    = pUserData;
    globals.activityBufferSize = BUF_SIZE;

    CUPTI_API_CALL(cuptiActivityRegisterCallbacks(BufferRequested, BufferCompleted));
    CUPTI_API_CALL(cuptiActivityEnableLatencyTimestamps(1));
}
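
For context, once latency timestamps are enabled, the extra data shows up as the queued and submitted fields of the kernel activity records. Below is a minimal sketch of reading them out of a completed buffer; it is not the helper's actual PrintActivityBuffer, and the exact versioned struct (CUpti_ActivityKernel9 here) depends on the CUPTI version.

// Hypothetical sketch: walk one completed buffer and print the latency fields
// added by cuptiActivityEnableLatencyTimestamps(1).
static void
PrintKernelLatencySketch(uint8_t *pBuffer, size_t validSize)
{
    CUpti_Activity *pRecord = NULL;

    while (1)
    {
        CUptiResult status = cuptiActivityGetNextRecord(pBuffer, validSize, &pRecord);
        if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
        {
            break; // No more records in this buffer.
        }
        else if (status != CUPTI_SUCCESS)
        {
            CUPTI_API_CALL(status);
        }

        if (pRecord->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
        {
            // The versioned struct name depends on the CUPTI release.
            CUpti_ActivityKernel9 *pKernel = (CUpti_ActivityKernel9 *)pRecord;

            // queued/submitted remain CUPTI_TIMESTAMP_UNKNOWN unless latency
            // timestamps are enabled.
            printf("kernel %s: queued=%llu submitted=%llu start=%llu end=%llu\n",
                   pKernel->name,
                   (unsigned long long)pKernel->queued,
                   (unsigned long long)pKernel->submitted,
                   (unsigned long long)pKernel->start,
                   (unsigned long long)pKernel->end);
        }
    }
}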

Log (Cifar10 Training)
env CUDA_INJECTION64_PATH=/home/ysh/cupti-kernel-trace/libinjection.so /home/ysh/gpuutil/bin/python main_wo_cudaprofiler.py --batch 8 --gpu 0

BufferCompleted
BufferRequested............... 6/6250 .............................]  Step: 19ms | Tot: 108ms                                                                                                            
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested............... 7/6250 .............................]  Step: 19ms | Tot: 128ms                                                                                                            
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested............... 8/6250 .............................]  Step: 19ms | Tot: 147ms                                                                                                            
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested............... 9/6250 .............................]  Step: 19ms | Tot: 166ms                                                                                                            
BufferCompleted
...
...
BufferRequested
BufferRequested............... 151/6250 ...........................]  Step: 19ms | Tot: 3s58ms                                                                                                           
BufferRequested
BufferRequested
BufferRequested............... 152/6250 ...........................]  Step: 19ms | Tot: 3s78ms                                                                                                           
BufferRequested
BufferRequested............... 153/6250 ...........................]  Step: 18ms | Tot: 3s97ms                                                                                                           
BufferRequested
BufferRequested
BufferRequested............... 154/6250 ...........................]  Step: 19ms | Tot: 3s116ms                                                                                                          
BufferRequested
BufferRequested............... 155/6250 ...........................]  Step: 18ms | Tot: 3s135ms                                                                                                          
BufferRequested
BufferRequested
BufferRequested............... 156/6250 ...........................]  Step: 19ms | Tot: 3s155ms                                                                                                          
BufferRequested
BufferRequested
BufferRequested............... 157/6250 ...........................]  Step: 19ms | Tot: 3s174ms                                                                                                          
BufferRequested
BufferRequested............... 158/6250 ...........................]  Step: 19ms | Tot: 3s193ms                                                                                                          
BufferRequested
BufferRequested
BufferRequested............... 159/6250 ...........................]  Step: 19ms | Tot: 3s212ms                                                                                                          
BufferRequested

(The process hangs and cannot be terminated)

Log (VectorAdd)
env CUDA_INJECTION64_PATH=/home/ysh/cupti-kernel-trace/libinjection.so /home/ysh/gpuutil/bin/python main_wo_cudaprofiler.py --batch 8 --gpu 0

BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
BufferCompleted
BufferRequested
...
...
BufferRequested 
BufferRequested 
BufferRequested 
BufferRequested 
...

(In this case, the process can be terminated, but BufferRequested prints indefinitely)

This issue was somewhat resolved by using the code pattern in the cupti_finalize sample. However, problems still seem to exist when it is used in conjunction with cuptiActivityEnableLatencyTimestamps(1).

My previous post seemed unclear, so I have reorganized it.

At that point, I encountered two issues. The first was that not using cuptiFinalize() as shown in the sample code led to errors such as segmentation faults or malloc fastbin errors after termination. The second was related to using the cuptiActivityEnableLatencyTimestamps(1) option.

The first issue was resolved by following the pattern in the sample code, but the profiling issue caused by cuptiActivityEnableLatencyTimestamps(1) still persists.
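
As far as I can tell, the part of the sample pattern that mattered for the first issue is flushing the activity buffers before process teardown, so no buffer is handed to BufferCompleted after the C runtime has started shutting down. A minimal sketch of that idea (illustrative names, not the sample's exact code):

#include <cstdlib>
#include <cupti_activity.h>

// Force CUPTI to return all activity buffers (including partially filled ones)
// through BufferCompleted before the process exits.
static void AtExitFlushSketch()
{
    cuptiActivityFlushAll(1); // 1 requests a forced flush of all buffers.
}

static void RegisterAtExitFlushSketch()
{
    atexit(AtExitFlushSketch);
}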

Hi, @SanghoYeo

Sorry for the late reply !

Can you clarify what the issue is when you use cuptiActivityEnableLatencyTimestamps(1)?

Hello, @veraj !
I have been waiting for your response. There are two issues that occur when using cuptiActivityEnableLatencyTimestamps(1):

Issue 1

If the profiling session is maintained continuously without calling cuptiFinalize, the process stops working within a few seconds. In this situation, the process does not terminate and can only be stopped. When this happens, the BufferRequested callback keeps executing, whereas the BufferCompleted callback is no longer called.

To reproduce the symptom, just add cuptiActivityEnableLatencyTimestamps(1); to the cupti_finalize sample code and disable the dynamic detach/attach thread.

If you enable CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, you will see that only BufferRequested is executed.

Furthermore, if you enable both CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL and CUPTI_ACTIVITY_KIND_MEMSET, the process hangs.

Here are the modified parts of cupti_finalize.cpp:


    // Enable CUPTI activities.
    //CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
    //CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
    CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));

    CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
    //CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
    //CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
    //CUPTI_API_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_SYNCHRONIZATION));
}

extern "C" int
InitializeInjection(void)
{
    if (injectionGlobals.initialized)
    {
        return 1;
    }

    // Initialize InjectionGlobals structure.
    InitializeInjectionGlobals();

    // Initialize Mutex.
    PTHREAD_CALL(pthread_mutex_init(&injectionGlobals.mutexFinalize, 0));

    RegisterAtExitHandler();
    

    // Initialize CUPTI.
    SetupCupti();
    CUPTI_API_CALL(cuptiActivityEnableLatencyTimestamps(1));
    injectionGlobals.tracingEnabled = 1;

    // Launch the thread.
    //PTHREAD_CALL(pthread_create(&injectionGlobals.dynamicThread, NULL, DynamicAttachDetach, NULL));
    injectionGlobals.initialized = 1;

    return 1;
}

and I added print statements to both BufferRequested and BufferCompleted in helper_cupti_activity.h:

// Buffer Management Functions
static void CUPTIAPI
BufferRequested(
    uint8_t **ppBuffer,
    size_t *pSize,
    size_t *pMaxNumRecords)
{
    uint8_t *pBuffer = (uint8_t *) malloc(globals.activityBufferSize + ALIGN_SIZE);
    MEMORY_ALLOCATION_CALL(pBuffer);
    printf("request\n");
    *pSize = globals.activityBufferSize;
    *ppBuffer = ALIGN_BUFFER(pBuffer, ALIGN_SIZE);
    *pMaxNumRecords = 1000;
}

static void CUPTIAPI
BufferCompleted(
    CUcontext context,
    uint32_t streamId,
    uint8_t *pBuffer,
    size_t size,
    size_t validSize)
{
    if (validSize > 0)
    {
        printf("completed\n");
        FILE *pOutputFile = globals.pOutputFile;
        if (!pOutputFile)
        {
            pOutputFile = stdout;
        }

        PrintActivityBuffer(pBuffer, validSize, pOutputFile, globals.pUserData);
    }

    free(pBuffer);
}

Logs of the above code:

request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request
completed
request

....(after a few seconds)
....
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request
request

Issue 2

To work around issue 1, I periodically terminate and restart the profiling session. With that, the process runs well for several hours, but eventually a segmentation fault occurs and the process terminates. It is not certain whether this happens when stopping the profiling or when restarting it. I'll test the code and let you know how to reproduce the second issue.
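
To make "periodically terminated and restarted" concrete, the cycle I have in mind is roughly the sketch below, reusing SetupCupti() from the sample above (hypothetical code; the real cupti_finalize sample coordinates the detach with the CUDA API callbacks through mutexFinalize, which is omitted here):

// Simplified sketch of the periodic detach/attach cycle, not the sample's
// actual DynamicAttachDetach thread.
static void *PeriodicDetachAttachSketch(void *)
{
    while (true)
    {
        sleep(10); // Profile for a while.

        // Detach: flush outstanding activity buffers, then tear CUPTI down.
        CUPTI_API_CALL(cuptiActivityFlushAll(1));
        CUPTI_API_CALL(cuptiFinalize());

        sleep(10); // Stay detached for a while.

        // Re-attach: register callbacks and re-enable activities again.
        SetupCupti();
        CUPTI_API_CALL(cuptiActivityEnableLatencyTimestamps(1));
    }

    return NULL;
}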

Reproduction Method for Issue 2

Upon investigation, I confirmed that using CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL with dynamic attach/detach through cuptiFinalize results in a process hang. Although this outcome differs from the segmentation fault I previously encountered, it still makes real-time profiling unusable.

I have backed up the reproduction code here: GitHub - Yeosangho/samples

The library specified in CUDA_INJECTION64_PATH in this code is available at samples/cupti_finalize at master · Yeosangho/samples · GitHub. While building this library, I modified parts of cupti_finalize.cpp and common/helper_cupti_activity.h.

Additionally, to increase the probability of error occurrence, I simultaneously ran 12 containers using samples/cupti_finalize/docker-compose.yaml at master · Yeosangho/samples · GitHub. The task code executed in these containers is the same as samples/pytorch-cifar10 at master · Yeosangho/samples · GitHub.

The error cases vary depending on which activities are enabled in samples/cupti_finalize/cupti_finalize.cpp at d12ec0037439025f2f064341a1d242359236279c · Yeosangho/samples · GitHub.

For example, activating CUPTI_ACTIVITY_KIND_MEMCPY along with CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL causes the process to hang within tens of seconds after starting.

On the other hand, if only CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL is enabled, or if CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL and CUPTI_ACTIVITY_KIND_SYNCHRONIZATION are enabled together, the error occurs only after several hours. The following images show the timestamps and nvidia-smi logs for the two cases: only CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL enabled, and CUPTI_ACTIVITY_KIND_SYNCHRONIZATION enabled together with CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.

In the hanging case, the log stops updating about 5 hours after execution starts. (You can determine a process's running time by checking the timestamp difference between clear_tmp.log and the other logs.) nvidia-smi also shows that the hanging process no longer leaks memory.

  • Only CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL is enabled (gpu9.log is the hanging case, and it used gpu1)
    image

  • Both CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL and CUPTI_ACTIVITY_KIND_SYNCHRONIZATION are enabled
    image

Since all the above tests were conducted with cuptiActivityEnableLatencyTimestamps enabled, I will check whether the same issue occurs without enabling it and provide an update later.

Hi, @SanghoYeo

We tried to repro internally, but failed.
bash -c "python3 main_wo_cudaprofiler.py --batch 2 --iters 111111111111 --gpu 0 will cost long time to run.

We didn’t see an obvious hang when running this with CUPTI injection using the .so you provided.

Hello, thank you for your response.

As you mentioned, I rechecked my code and couldn’t reproduce the hang by simply running it. In fact, when I tried to reproduce it earlier, I think I had the same test running as a batch job in another container, so other tasks might have influenced the reproduction results.

To reproduce the problem I encountered, you just need to run one container and execute the process you tested earlier in several different shells.

For example, after starting the Docker container through the docker-compose file, enter each shell using the docker exec command, and then execute:

  1. Clone the samples repo from my GitHub (I updated the .so file to increase the frequency of dynamic attaching to every 10 seconds so that the hang occurs more clearly):
git clone https://github.com/Yeosangho/samples.git

  2. Create docker-compose.yaml and run docker-compose up -d (the CUDA_INJECTION64_PATH environment variable needs to point to cupti_finalize/libCuptiInjection.so in my GitHub repo):
version: '3.8'

services:
  gpu0:
    image: pytorch/pytorch:2.2.2-cuda12.1-cudnn8-devel
    environment:
      #- CUDA_INJECTION64_PATH=/home/ysh/cupti-kernel-trace/cupti_finalize/libCuptiFinalize.so
      - CUDA_INJECTION64_PATH=/home/ysh/samples/cupti_finalize/libCuptiFinalize.so
      - GPU_INDEX=0
    volumes:
      - /tmp:/tmp
      - /home/ysh:/home/ysh
    working_dir: /home/ysh/cupti-kernel-trace/pytorch-cifar10
    command: bash -c "sleep infinity"
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              driver: nvidia
              device_ids: ['0']

  3. docker exec -it [docker-id] /bin/bash and run bash -c "python /home/ysh/cupti-kernel-trace/pytorch-cifar10/main_wo_cudaprofiler.py --batch 2 --iters 111111111111 --gpu 0"

  4. Repeat step 3 in another shell. (I'm not sure whether the number of processes running concurrently on a GPU is related to the probability of the hang. If it doesn't cause a hang, it might help to repeat step 3 and increase the number of processes.)

This will cause the hang phenomenon to occur.
image

Hi, @SanghoYeo

Thanks for the clarification. So you mean the issue only happens when several containers run together? Then this seems like a corner case.

I will deliver this message internally and let the dev team decide the priority.

Hello, @veraj

Regarding the experiment I described in my previous answer, the problem occurred even when running multiple processes in the same container.

Also, to verify this, I ran another test today and confirmed that the problem occurs even when following the procedure I described above with only a single process running in the container, without launching multiple processes.

I conducted all of today's experiments after a reboot to minimize variables in the GPU environments available to me. The hang was consistently observed on all nodes I can access (with L40, A40, and L4 GPUs, respectively).

To reproduce, modify docker-compose.yaml to remove working_dir:

version: '3.8'

services:
  gpu0:
    image: pytorch/pytorch:2.2.2-cuda12.1-cudnn8-devel
    environment:
      - CUDA_INJECTION64_PATH=[github procject path]/cupti_finalize/libCuptiFinalize.so
      - GPU_INDEX=0
    volumes:
      - /tmp:/tmp
      - [github project path]:[github project path]
    command: bash -c "sleep infinity"
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
              driver: nvidia
              device_ids: ['0']

and, in step 3 above, modify the command to run the sample PyTorch code from the GitHub project inside the container:

bash -c "python [github project path]/pytorch-cifar10/main_wo_cudaprofiler.py --batch 2 --iters 111111111111 --gpu 0"

If it’s possible to reproduce the issue using the method I’ve tried here, I think it would help in determining the priority of this issue.

Thank you for your attention to this issue.
Sangho Yeo.

Thanks! We’ll check.