Heya,
I’ve been looking at the OMPT callback interface of different compilers and tried to compare the given parameters based on the OpenMP Examples. When examining the NVHPC compilers, I noticed that a large number of the examples crashed with a segmentation fault during execution. Disabling the flag -mp=ompt caused the program to run without issues.
The issue seems to be quite similar to the one here, which should be fixed in 22.11.
I’ve been using the following two C files:
target.1.c
#define N 100
void vec_mult()
{
int i;
float p[N], v1[N], v2[N];
#pragma omp target map(p, v1, v2)
for (i = 0; i < N; i++)
p[i] = v1[i] * v2[i];
}
int main()
{
vec_mult();
}
ompt_printf.c
#include <omp-tools.h>
#include <stdio.h>
#include <assert.h>
#define REGISTER_CALLBACK(PREFIX, NAME) \
setCallback(ompt_callback_##NAME, \
(ompt_callback_t)&ompt_##PREFIX##NAME)
void ompt_host_implicit_task(ompt_scope_endpoint_t endpoint,
ompt_data_t *parallel_data,
ompt_data_t *task_data,
unsigned int actual_parallelism,
unsigned int index,
int flags)
{
printf("[ompt_host_implicit_task]\n");
}
void ompt_host_thread_begin(ompt_thread_t thread_type,
ompt_data_t *thread_data)
{
printf("[ompt_host_thread_begin]\n");
}
void ompt_host_thread_end(ompt_data_t *thread_data)
{
printf("[ompt_host_thread_end]\n");
}
void ompt_host_mutex_acquire(ompt_mutex_t kind,
unsigned int hint,
unsigned int impl,
ompt_wait_id_t wait_id,
const void *codeptr_ra)
{
printf("[ompt_host_mutex_acquire]\n");
}
void ompt_host_mutex_acquired(ompt_mutex_t kind,
ompt_wait_id_t wait_id,
const void *codeptr_ra)
{
printf("[ompt_host_mutex_acquired]\n");
}
void ompt_host_mutex_released(ompt_mutex_t kind,
ompt_wait_id_t wait_id,
const void *codeptr_ra)
{
printf("[ompt_host_mutex_released]\n");
}
void ompt_device_initialize(int device_num,
const char *type,
ompt_device_t *device,
ompt_function_lookup_t lookup,
const char *documentation)
{
printf("[ompt_device_initialize]\n");
}
void ompt_device_finalize(int device_num)
{
printf("[ompt_device_finalize]\n");
}
void ompt_device_load(int device_num,
const char *filename,
int64_t offset_in_line,
void *vma_in_file,
size_t bytes,
void *host_addr,
void *device_addr,
uint64_t module_id)
{
printf("[ompt_device_load]\n");
}
void ompt_device_unload(int device_num,
uint64_t module_id)
{
printf("[ompt_device_unload]\n");
}
void ompt_device_target_data_op(ompt_id_t target_id,
ompt_id_t host_op_id,
ompt_target_data_op_t optype,
void *src_addr,
int src_device_num,
void *dest_addr,
int dest_device_num,
size_t bytes,
const void *codeptr_ra)
{
printf("[ompt_device_target_data_op]\n");
}
void ompt_device_target(ompt_target_t kind,
ompt_scope_endpoint_t endpoint,
int device_num,
ompt_data_t *task_data,
ompt_id_t target_id,
const void *codeptr_ra)
{
printf("[ompt_device_target]\n");
}
void ompt_device_target_map(ompt_id_t target_id,
unsigned int nitems,
void **host_addr,
void **device_addr,
size_t *bytes,
unsigned int *mapping_flags,
const void *codeptr_ra)
{
printf("[ompt_device_target_map]\n");
}
void ompt_device_target_submit(ompt_id_t target_id,
ompt_id_t host_op_id,
unsigned int requested_num_teams)
{
printf("[ompt_device_target_submit]\n");
}
static void
register_event_callbacks_host(ompt_set_callback_t setCallback)
{
/* sort alphabetically */
REGISTER_CALLBACK(host_, implicit_task);
REGISTER_CALLBACK(host_, thread_begin);
REGISTER_CALLBACK(host_, thread_end);
REGISTER_CALLBACK(host_, mutex_acquire);
REGISTER_CALLBACK(host_, mutex_acquired);
REGISTER_CALLBACK(host_, mutex_released);
}
static void
register_event_callbacks_device(ompt_set_callback_t setCallback)
{
REGISTER_CALLBACK(, device_initialize);
REGISTER_CALLBACK(, device_finalize);
REGISTER_CALLBACK(, device_load);
REGISTER_CALLBACK(, device_unload);
REGISTER_CALLBACK(device_, target_data_op);
REGISTER_CALLBACK(device_, target);
REGISTER_CALLBACK(device_, target_map);
REGISTER_CALLBACK(device_, target_submit);
}
static int
ompt_initialize(ompt_function_lookup_t lookup,
int initialDeviceNum,
ompt_data_t *toolData)
{
printf("[ompt_initialize] lookup = %p | initialDeviceNum = %d, toolData = %p\n", lookup, initialDeviceNum, toolData);
ompt_set_callback_t set_callback =
(ompt_set_callback_t)lookup("ompt_set_callback");
assert(set_callback != 0);
ompt_get_task_info_t ompt_get_task_info =
(ompt_get_task_info_t)lookup("ompt_get_task_info");
assert(ompt_get_task_info != 0);
ompt_finalize_tool_t ompt_finalize_tool =
(ompt_finalize_tool_t)lookup("ompt_finalize_tool");
assert(ompt_finalize_tool != 0);
register_event_callbacks_host(set_callback);
register_event_callbacks_device(set_callback);
return 1; /* non-zero indicates success */
}
static void
ompt_finalize(ompt_data_t *toolData)
{
printf("[ompt_finalize]\n");
}
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version, /* == _OPENMP */
const char *runtime_version)
{
printf("[ompt_start_tool] omp_version = %d | runtime_version = %s\n",
omp_version, runtime_version);
static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,
&ompt_finalize,
ompt_data_none};
return &ompt_start_tool_result;
}
I then compiled the tool using the following command:
nvc -mp=gpu,ompt -Minfo ./target.1.c ./ompt_printf.c
During the execution, the callbacks [ompt_host_mutex_acquire], [ompt_host_mutex_acquired] and [ompt_host_mutex_released] are shown a lot, way more often than I would expect. The program runs into a SIGSEGV at the end, showing the following content in GDB
Thread 1 "a.out" received signal SIGSEGV, Segmentation fault.
0x00007ffff624875d in ompt_callback_target_map_wrapper (targetData=<optimized out>, nItems=1, hostAddr=0x7fffffffd4a8, deviceAddr=0x7fffffffd4a0, bytes=0x7fffffffd498, mappingFlags=0x7fffffffd4fc, codePtr=0x4017ce <vec_mult()+1294>) at ompt.c:165
165 ompt.c: No such file or directory.
(gdb) bt
#0 0x00007ffff624875d in ompt_callback_target_map_wrapper (targetData=<optimized out>, nItems=1, hostAddr=0x7fffffffd4a8, deviceAddr=0x7fffffffd4a0, bytes=0x7fffffffd498, mappingFlags=0x7fffffffd4fc, codePtr=0x4017ce <vec_mult()+1294>)
at ompt.c:165
#1 0x00007ffff6221ec5 in __nvomp_dataoff (filename=<optimized out>, funcname=<optimized out>, pdevptr=<optimized out>, hostptr=<optimized out>, hostptrptr=<optimized out>, poffset=<optimized out>, dims=<optimized out>,
desc=<optimized out>, elementsize=<optimized out>, hostdescptr=<optimized out>, hostdescsize=<optimized out>, lineno=<optimized out>, name=<optimized out>, flags=<optimized out>, nowait=<optimized out>, devid=<optimized out>)
at nvomp_target.c:2451
#2 0x00000000004017ce in vec_mult () at ./target.1.c:9
#3 0x00000000004019c8 in main () at ./target.1.c:14
I’ve been using the following hardware to test this problem:
- Ryzen 7 3700X, NVIDIA RTX 3070, Arch Linux
- Intel Core i7-1260P, NVIDIA MX550, Ubuntu 22.04