I tried porting a program from CUDA 10.2 to CUDA 11.4 and the program is roughly twice as slow with the newer release.
Here’s a minimal example showing the issue:
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <complex>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cufftw.h>
#include <iostream>
#include <numeric>
#include <openacc.h>
#include <pthread.h>
#include <sstream>
#include <sys/time.h>
#include <vector>
using namespace std;
static const uint32_t SIGNAL_SIZE = 2048;
static const uint32_t SCAN_SIZE = 1000;
static const uint32_t FRAME_SIZE = SCAN_SIZE * SIGNAL_SIZE;
static const uint32_t N_FRAMES = 100;
void processFrame(const uint16_t *spectra, float *R_A,
const float *win, std::complex<float> *C_A, cufftHandle hand)
{
#pragma acc data present(spectra[ : FRAME_SIZE], R_A[ : FRAME_SIZE], win[ : SIGNAL_SIZE], C_A[ : FRAME_SIZE])
{
#pragma acc parallel loop
for (uint32_t i = 0; i < FRAME_SIZE; i++)
{
C_A[i] = spectra[i] * win[i % SIGNAL_SIZE];
}
cufftResult status =
cufftExecC2C(hand, reinterpret_cast<cufftComplex *>(C_A),
reinterpret_cast<cufftComplex *>(C_A), CUFFT_FORWARD);
assert(status == CUFFT_SUCCESS);
#pragma acc parallel loop
for (uint32_t i = 0; i < FRAME_SIZE; i++)
{
R_A[i] = std::abs(C_A[i]);
}
}
}
int main(int argc, char const *argv[])
{
auto spectra = std::vector<uint16_t>(FRAME_SIZE * N_FRAMES);
auto R_A = std::vector<float>(FRAME_SIZE);
auto C_A = std::vector<std::complex<float>>(FRAME_SIZE);
auto win = std::vector<float>(SIGNAL_SIZE);
cufftHandle hand;
cufftResult status;
status = cufftCreate(&hand);
assert(status == CUFFT_SUCCESS);
status = cufftPlan1d(&hand, SIGNAL_SIZE, CUFFT_C2C, SCAN_SIZE);
assert(status == CUFFT_SUCCESS);
for (uint32_t i = 0; i < SIGNAL_SIZE; i++)
{
win[i] = 1 / (i + 1) * 1 / (i + 3);
}
for (uint32_t i = 0; i < FRAME_SIZE; i++)
{
spectra[i] = (i * 7) % 4096;
}
auto start = std::chrono::system_clock::now();
for (uint32_t n = 0; n < N_FRAMES; n++)
{
processFrame(&spectra[n * FRAME_SIZE], &R_A[0], &win[0], &C_A[0], hand);
}
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);
std::cout << "Processing time per frame "
<< (elapsed_seconds.count() / N_FRAMES) * 1000 << "ms" << std::endl;
cufftDestroy(hand);
}
I compile the program with the following options:
nvc++ -acc -fast -gpu=cc72,managed,cuda11.4 -Minfo=accel -Mcudalib=cufft -std=c++17 -o timing main.cpp
I’m running the code on a Jetson AGX Xavier DevKit, using HPC SDK v20.7 for CUDA 10.2 and v21.9 for CUDA 11.4
With CUDA 10.2 I get an average of 4 ms per frame. With CUDA 11.4 I get an average of 8 ms per frame.
Am I doing something wrong?