I’m trying to implement a multi-threaded signal processing pipeline, but my code gives a segmentation fault on a Jetson AGX Xavier. The same code runs fine on a Linux desktop with a GeForce GTX 1080 Ti. Here’s a toy example illustrating the issue:
#include <cassert>
#include <chrono>
#include <complex>
#include <cstdint> // uint16_t
#include <cufft.h> // native cuFFT API (cufftHandle, cufftExecC2C, ...)
#include <iostream>
#include <mutex>
#include <openacc.h>
#include <thread>
#include <vector>

using namespace std;
static const size_t SIGNAL_SIZE = 2048;
static const size_t SCAN_SIZE = 1000;
static const size_t FRAME_SIZE = SCAN_SIZE * SIGNAL_SIZE;
static const size_t N_FRAMES = 100;
using complex32 = complex<float>;
void processFrame(const uint16_t *spectra, float *R_A, const float *win,
                  complex32 *C_A, cufftHandle hand) {
#pragma acc data present(spectra[:FRAME_SIZE], R_A[:FRAME_SIZE], \
                         win[:SIGNAL_SIZE], C_A[:FRAME_SIZE])
  {
    // Window each scan line and widen to complex.
#pragma acc parallel loop
    for (size_t i = 0; i < FRAME_SIZE; i++) {
      C_A[i] = spectra[i] * win[i % SIGNAL_SIZE];
    }
    // In-place batched FFT: SCAN_SIZE transforms of length SIGNAL_SIZE.
    cufftResult status =
        cufftExecC2C(hand, reinterpret_cast<cufftComplex *>(C_A),
                     reinterpret_cast<cufftComplex *>(C_A), CUFFT_FORWARD);
    assert(status == CUFFT_SUCCESS);
    // Magnitude of each spectral bin.
#pragma acc parallel loop
    for (size_t i = 0; i < FRAME_SIZE; i++) {
      R_A[i] = abs(C_A[i]);
    }
  }
}
int main(int argc, char const *argv[]) {
  auto spectra = vector<uint16_t>(FRAME_SIZE * N_FRAMES);
  auto R_A = vector<float>(FRAME_SIZE * N_FRAMES);
  auto win = vector<float>(SIGNAL_SIZE);
  static const size_t N_THREADS = 1; // The program crashes even when N_THREADS == 1.
  auto C_A =
      vector<vector<complex32>>(N_THREADS, vector<complex32>(FRAME_SIZE));
  mutex mutC_A[N_THREADS];
  thread *threads[N_FRAMES] = {};
  cufftHandle hand;
  // cufftPlan1d allocates the plan itself; a separate cufftCreate call
  // would only leak a handle.
  cufftResult status = cufftPlan1d(&hand, SIGNAL_SIZE, CUFFT_C2C, SCAN_SIZE);
  assert(status == CUFFT_SUCCESS);
  // Arbitrary window and input values; float arithmetic so the window is
  // not all zeros (1 / (i + 1) truncates to 0 for i > 0).
  for (size_t i = 0; i < SIGNAL_SIZE; i++) {
    win[i] = 1.0f / ((i + 1) * (i + 3));
  }
  for (size_t i = 0; i < FRAME_SIZE; i++) {
    spectra[i] = (i * 7) % 4096;
  }
  auto start = chrono::system_clock::now();
  for (size_t n = 0; n < N_FRAMES; n++) {
    size_t buf = n % N_THREADS;
    {
      lock_guard<mutex> lock(mutC_A[buf]);
      threads[n] =
          new thread(processFrame, &spectra[n * FRAME_SIZE],
                     &R_A[n * FRAME_SIZE], &win[0], &C_A[buf][0], hand);
      // The following works:
      // processFrame(&spectra[n * FRAME_SIZE], &R_A[n * FRAME_SIZE],
      //              &win[0], &C_A[buf][0], hand);
    }
  }
  for (size_t n = 0; n < N_FRAMES; n++) {
    threads[n]->join();
    delete threads[n];
  }
  auto end = chrono::system_clock::now();
  chrono::duration<double> elapsed_seconds = end - start;
  cout << "Processing time per frame "
       << (elapsed_seconds.count() / N_FRAMES) * 1000 << " ms" << endl;
  cufftDestroy(hand);
}
I compile the code with:
nvc++ -acc -fast -gpu=cc72,managed,cuda10.2 -Minfo=accel -Mcudalib=cufft -std=c++17 -o test main.cpp
I’m using HPC SDK 20.7 on a Jetson AGX Xavier with L4T R32 (release), REVISION: 5.2.
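One detail I’m already unsure about: every worker thread goes through the same shared cufftHandle, and as far as I understand cuFFT only guarantees thread safety when different host threads use different plans. A variant I could try (untested sketch, one plan per buffer) would be:
// Untested variant: one cuFFT plan per worker buffer instead of a single
// shared handle, in case concurrent cufftExecC2C calls on one plan race.
cufftHandle hands[N_THREADS];
for (size_t t = 0; t < N_THREADS; t++) {
  cufftResult st = cufftPlan1d(&hands[t], SIGNAL_SIZE, CUFFT_C2C, SCAN_SIZE);
  assert(st == CUFFT_SUCCESS);
}
// ...and pass hands[buf] instead of hand to processFrame.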
On the Jetson, the code crashes even when N_THREADS == 1. On the desktop, I can run multiple threads and see some performance improvement.
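Since it crashes even with a single worker buffer, I also wondered whether host threads created with std::thread need to attach themselves to the device before their first OpenACC construct. A minimal, untested sketch of what I mean, using acc_set_device_num and acc_device_nvidia from openacc.h:
// Untested idea: explicitly bind the calling thread to GPU 0 before the
// first OpenACC region, in case new host threads don't attach automatically.
void processFrame(const uint16_t *spectra, float *R_A, const float *win,
                  complex32 *C_A, cufftHandle hand) {
  acc_set_device_num(0, acc_device_nvidia);
  // ...body unchanged from above...
}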
On the Jetson, cuda-gdb gives the following output:
Thread 97 "test" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7e5f7f5730 (LWP 20199)]
0x0000007fabb4ce94 in std::error_code::default_error_condition() const () from /usr/lib/aarch64-linux-gnu/libstdc++.so.6
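The backtrace ends inside libstdc++, which I don’t know how to interpret. One more thing I haven’t ruled out is a stream mismatch: cufftExecC2C runs on whatever stream the plan carries (the default stream here), while the OpenACC loops run on the runtime’s own queue. An untested sketch of binding the plan to the synchronous OpenACC queue, using acc_get_cuda_stream and acc_async_sync from openacc.h (cudaStream_t needs cuda_runtime.h):
// Untested: put the cuFFT plan on the stream backing the default
// (synchronous) OpenACC queue, so the FFT is ordered with the two loops.
void *q = acc_get_cuda_stream(acc_async_sync);
cufftResult s = cufftSetStream(hand, static_cast<cudaStream_t>(q));
assert(s == CUFFT_SUCCESS);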
Any suggestions?