NVHPC 22.11/23.1 -- OMPT methods can cause SegFault when offloading

Heya,

I’ve been looking at the OMPT callback interface of different compilers and tried to compare the given parameters based on the OpenMP Examples. When examining the NVHPC compilers, I noticed that a large number of the examples crashed with a segmentation fault during execution. Disabling the flag -mp=ompt caused the program to run without issues.
The issue seems to be quite similar to the one here, which should be fixed in 22.11.

I’ve been using the following two C files:
target.1.c

#define N 100

void vec_mult()
{
    int i;
    float p[N], v1[N], v2[N];
#pragma omp target map(p, v1, v2)
    for (i = 0; i < N; i++)
        p[i] = v1[i] * v2[i];
}

int main()
{
    vec_mult();
}

ompt_printf.c

#include <omp-tools.h>
#include <stdio.h>
#include <assert.h>

#define REGISTER_CALLBACK(PREFIX, NAME) \
    setCallback(ompt_callback_##NAME,   \
                (ompt_callback_t)&ompt_##PREFIX##NAME)

void ompt_host_implicit_task(ompt_scope_endpoint_t endpoint,
                             ompt_data_t *parallel_data,
                             ompt_data_t *task_data,
                             unsigned int actual_parallelism,
                             unsigned int index,
                             int flags)
{
    printf("[ompt_host_implicit_task]\n");
}

void ompt_host_thread_begin(ompt_thread_t thread_type,
                            ompt_data_t *thread_data)
{
    printf("[ompt_host_thread_begin]\n");
}

void ompt_host_thread_end(ompt_data_t *thread_data)
{
    printf("[ompt_host_thread_end]\n");
}

void ompt_host_mutex_acquire(ompt_mutex_t kind,
                             unsigned int hint,
                             unsigned int impl,
                             ompt_wait_id_t wait_id,
                             const void *codeptr_ra)
{
    printf("[ompt_host_mutex_acquire]\n");
}

void ompt_host_mutex_acquired(ompt_mutex_t kind,
                              ompt_wait_id_t wait_id,
                              const void *codeptr_ra)
{
    printf("[ompt_host_mutex_acquired]\n");
}

void ompt_host_mutex_released(ompt_mutex_t kind,
                              ompt_wait_id_t wait_id,
                              const void *codeptr_ra)
{
    printf("[ompt_host_mutex_released]\n");
}

void ompt_device_initialize(int device_num,
                            const char *type,
                            ompt_device_t *device,
                            ompt_function_lookup_t lookup,
                            const char *documentation)
{

    printf("[ompt_device_initialize]\n");
}

void ompt_device_finalize(int device_num)
{

    printf("[ompt_device_finalize]\n");
}

void ompt_device_load(int device_num,
                      const char *filename,
                      int64_t offset_in_line,
                      void *vma_in_file,
                      size_t bytes,
                      void *host_addr,
                      void *device_addr,
                      uint64_t module_id)
{

    printf("[ompt_device_load]\n");
}

void ompt_device_unload(int device_num,
                        uint64_t module_id)
{

    printf("[ompt_device_unload]\n");
}

void ompt_device_target_data_op(ompt_id_t target_id,
                                ompt_id_t host_op_id,
                                ompt_target_data_op_t optype,
                                void *src_addr,
                                int src_device_num,
                                void *dest_addr,
                                int dest_device_num,
                                size_t bytes,
                                const void *codeptr_ra)
{

    printf("[ompt_device_target_data_op]\n");
}

void ompt_device_target(ompt_target_t kind,
                        ompt_scope_endpoint_t endpoint,
                        int device_num,
                        ompt_data_t *task_data,
                        ompt_id_t target_id,
                        const void *codeptr_ra)
{
    printf("[ompt_device_target]\n");
}

void ompt_device_target_map(ompt_id_t target_id,
                            unsigned int nitems,
                            void **host_addr,
                            void **device_addr,
                            size_t *bytes,
                            unsigned int *mapping_flags,
                            const void *codeptr_ra)
{

    printf("[ompt_device_target_map]\n");
}

void ompt_device_target_submit(ompt_id_t target_id,
                               ompt_id_t host_op_id,
                               unsigned int requested_num_teams)
{

    printf("[ompt_device_target_submit]\n");
}

static void
register_event_callbacks_host(ompt_set_callback_t setCallback)
{
    /* sort alphabetically */
    REGISTER_CALLBACK(host_, implicit_task);
    REGISTER_CALLBACK(host_, thread_begin);
    REGISTER_CALLBACK(host_, thread_end);
    REGISTER_CALLBACK(host_, mutex_acquire);
    REGISTER_CALLBACK(host_, mutex_acquired);
    REGISTER_CALLBACK(host_, mutex_released);
}

static void
register_event_callbacks_device(ompt_set_callback_t setCallback)
{
    REGISTER_CALLBACK(, device_initialize);
    REGISTER_CALLBACK(, device_finalize);
    REGISTER_CALLBACK(, device_load);
    REGISTER_CALLBACK(, device_unload);
    REGISTER_CALLBACK(device_, target_data_op);
    REGISTER_CALLBACK(device_, target);
    REGISTER_CALLBACK(device_, target_map);
    REGISTER_CALLBACK(device_, target_submit);
}

static int
ompt_initialize(ompt_function_lookup_t lookup,
                int initialDeviceNum,
                ompt_data_t *toolData)
{
    printf("[ompt_initialize] lookup = %p | initialDeviceNum = %d, toolData = %p\n", lookup, initialDeviceNum, toolData);
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");
    assert(set_callback != 0);
    ompt_get_task_info_t ompt_get_task_info =
        (ompt_get_task_info_t)lookup("ompt_get_task_info");
    assert(ompt_get_task_info != 0);
    ompt_finalize_tool_t ompt_finalize_tool =
        (ompt_finalize_tool_t)lookup("ompt_finalize_tool");
    assert(ompt_finalize_tool != 0);

    register_event_callbacks_host(set_callback);
    register_event_callbacks_device(set_callback);

    return 1; /* non-zero indicates success */
}

static void
ompt_finalize(ompt_data_t *toolData)
{
    printf("[ompt_finalize]\n");
}

ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, /* == _OPENMP */
                                          const char *runtime_version)
{
    printf("[ompt_start_tool] omp_version = %d | runtime_version = %s\n",
           omp_version, runtime_version);
    static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
                                                              &ompt_finalize,
                                                              ompt_data_none};
    return &ompt_start_tool_result;
}

I then compiled the tool using the following command:

nvc -mp=gpu,ompt -Minfo ./target.1.c ./ompt_printf.c

During the execution, the callbacks [ompt_host_mutex_acquire], [ompt_host_mutex_acquired] and [ompt_host_mutex_released] are shown a lot, way more often than I would expect. The program runs into a SIGSEGV at the end, showing the following content in GDB

Thread 1 "a.out" received signal SIGSEGV, Segmentation fault.
0x00007ffff624875d in ompt_callback_target_map_wrapper (targetData=<optimized out>, nItems=1, hostAddr=0x7fffffffd4a8, deviceAddr=0x7fffffffd4a0, bytes=0x7fffffffd498, mappingFlags=0x7fffffffd4fc, codePtr=0x4017ce <vec_mult()+1294>) at ompt.c:165
165	ompt.c: No such file or directory.
(gdb) bt
#0  0x00007ffff624875d in ompt_callback_target_map_wrapper (targetData=<optimized out>, nItems=1, hostAddr=0x7fffffffd4a8, deviceAddr=0x7fffffffd4a0, bytes=0x7fffffffd498, mappingFlags=0x7fffffffd4fc, codePtr=0x4017ce <vec_mult()+1294>)
    at ompt.c:165
#1  0x00007ffff6221ec5 in __nvomp_dataoff (filename=<optimized out>, funcname=<optimized out>, pdevptr=<optimized out>, hostptr=<optimized out>, hostptrptr=<optimized out>, poffset=<optimized out>, dims=<optimized out>, 
    desc=<optimized out>, elementsize=<optimized out>, hostdescptr=<optimized out>, hostdescsize=<optimized out>, lineno=<optimized out>, name=<optimized out>, flags=<optimized out>, nowait=<optimized out>, devid=<optimized out>)
    at nvomp_target.c:2451
#2  0x00000000004017ce in vec_mult () at ./target.1.c:9
#3  0x00000000004019c8 in main () at ./target.1.c:14

I’ve been using the following hardware to test this problem:

  1. Ryzen 7 3700X, NVIDIA RTX 3070, Arch Linux
  2. Intel Core i7-1260P, NVIDIA MX550, Ubuntu 22.04

Jan -

We have found the same thing - that the OpenMP runtime crashes for some target offload regions. FYI, I wrote a test library to evaluate the vendor compilers for OMPT support/compliance - see https://github.com/khuck/OMPT-demo-tool for details. Also, there are some host-side OMPT events that NVIDIA doesn’t return even when it doesn’t crash, for example the on_ompt_callback_work callback for endpoint=ompt_scope_end. I think NVIDIA is aware of these problems, but thanks for reporting the bug here, too.

1 Like

Hi Kevin,

thanks for your answer. You’re right, not all host side events are present as well. For Score-P, we’re (at least) missing that overdue events are dispatched when the tool finalizes. That’s currently not the case. There is a post for that here.
I guess that you’re facing similar problems for OMPT support in the NVHPC compilers in TAU.

Thanks for the link to your demo tool. I will check it out.
Right now, I use modified versions of the OpenMP API 5.2.1 Examples to check basically the same thing.

Small update. I‘ve installed NVHPC 23.1 on one of my systems and tried to run the same examples again. The issue is still present.

~/Projects/OpenMP/OMPT/bin/C (main*) » nvc --version                                                   

nvc 23.1-0 64-bit target on x86-64 Linux -tp haswell 
NVIDIA Compilers and Tools
Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
------------------------------------------------------------------------------------------------------------------------------
~/Projects/OpenMP/OMPT/bin/C (main*) » nvc -mp=gpu,ompt target.1.c ../ompt_printf.c                    
target.1.c:
../ompt_printf.c:
------------------------------------------------------------------------------------------------------------------------------
~/Projects/OpenMP/OMPT/bin/C (main*) » ./a.out                                                         
[ompt_start_tool] omp_version = 202011 | runtime_version = NVOMP 202011
[ompt_initialize] lookup = 0x7fe46be46af0 | initialDeviceNum = -1, toolData = 0x40c250
implicit_task : always
parallel_begin : always
parallel_end : always
sync_region : always
task_create : always
task_schedule : always
thread_begin : always
thread_end : always
work : always
masked : always
mutex_acquire : always
mutex_acquired : always
mutex_released : always
lock_init : always
lock_destroy : always
nest_lock : always
dispatch : never
flush : always
device_initialize : always
device_finalize : always
device_load : always
device_unload : always
target_data_op : always
target : always
target_map : always
target_submit : always
[ompt_host_thread_begin] thread_type = 1 | thread_data = 0x7fe46ca00948
[ompt_host_implicit_task] endpoint = ompt_scope_begin | parallel_data = 0x7fe46c2048c8 | task_data = 0x7fe46c2048e0 | actual_parallelism = 1 | index = 1 | flags = 1
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_device_target] kind = ompt_target_exit_data | endpoint = ompt_scope_begin | device_num = 0 | task_data = 0x7fe46c2048e0 | target_id = 0 | codeptr_ra = 0x40144d
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_device_target_data_op] target_id = 0 | host_op_id = 0 | optype = ompt_target_data_transfer_to_device | src_addr = 0x7ffef38a7508 | src_device_num = -1 | dest_addr = 0x7fe445afa000 | dest_device_num = 0 | bytes = 400 | codeptr_ra = 0x4014fc
Actual optype = 2
[ompt_device_target_map] target_id = 0 | nitems = 1 | host_addr = 0x7ffef38a72d8 | device_addr = 0x7ffef38a72d0 | bytes = 0x7ffef38a72c8 | mapping_flags = 0x7ffef38a72fc | codeptr_ra = 0x4014fc
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_device_target_data_op] target_id = 0 | host_op_id = 1 | optype = ompt_target_data_transfer_to_device | src_addr = 0x7ffef38a7828 | src_device_num = -1 | dest_addr = 0x7fe445afa200 | dest_device_num = 0 | bytes = 400 | codeptr_ra = 0x40159b
Actual optype = 2
[ompt_device_target_map] target_id = 0 | nitems = 1 | host_addr = 0x7ffef38a72d8 | device_addr = 0x7ffef38a72d0 | bytes = 0x7ffef38a72c8 | mapping_flags = 0x7ffef38a72fc | codeptr_ra = 0x40159b
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_device_target_data_op] target_id = 0 | host_op_id = 2 | optype = ompt_target_data_transfer_to_device | src_addr = 0x7ffef38a7698 | src_device_num = -1 | dest_addr = 0x7fe445afa400 | dest_device_num = 0 | bytes = 400 | codeptr_ra = 0x40163d
Actual optype = 2
[ompt_device_target_map] target_id = 0 | nitems = 1 | host_addr = 0x7ffef38a72d8 | device_addr = 0x7ffef38a72d0 | bytes = 0x7ffef38a72c8 | mapping_flags = 0x7ffef38a72fc | codeptr_ra = 0x40163d
[ompt_device_target_submit] target_id = 1 | host_op_id = 3 | requested_num_teams = -1
[ompt_device_initialize] device_num = 0 | type = NVIDIA GeForce MX550 | device = 0x7fe46c2049e0 | lookup = (nil) | documentation = 
[ompt_device_load] device_num = 0 | filename = (null) | offset_in_line = -1 | vma_in_line = 0xffffffffffffffff | bytes = 0 | host_addr = 0x40c140 | device_addr = 0xffffffffffffffff | module_id = 29153568
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_acquire]
[ompt_host_mutex_acquired]
[ompt_host_mutex_released]
[ompt_host_mutex_released]
[1]    989203 segmentation fault (core dumped)  ./a.out

Yes, we’ve had some issues with the new OMPT so glad you both are exercising it.

We’ve fixed a few issues, notably Kevin’s TPR #32358 in 22.11, and Christian’s TPR #32571 in 23.1.

However, I don’t see any open issue with “map” so this is likely new or possibly a similar problem to Kevin’s where the callback wasn’t being properly registered. I’ve add TPR #32982 and sent to engineering for investigation.

-Mat

Hi Mat,

thanks for your update! It would be great if the bug gets fixed soon.

Like Kevin wrote in his comment, I also noticed that there are cases where callbacks with ompt_scope_endpoint_t aren’t called with an endpoint of ompt_scope_end.
I noticed this while writing a bug report for another compiler vendor. With NVHPC, the target callback is only called once with ompt_scope_begin, but never with ompt_scope_end. I can provide a small demo code if you’re interested.