Hi All,
Fair warning, my CUDA knowledge is pretty entry-level. I can get things working, and have in the past had a fair amount of success getting some very respectable speedups when porting C/C++ into CUDA.
At the moment, I’m working on a project that will involve processing a video stream. I have a proof-of-concept running on some test data, but it is slow running on CPU (single thread). I’ve therefore started trying to parallelise it using CUDA.
The issue is that it relies heavily on large quantities of adding image frames together, i.e.:
2D_SUM[width][height] = 2D_Im1[width][height] + 2D_Im2[width][height]
So far, all my attempts to get this running at an acceptable speed have been fruitless.
I am hoping someone could take a look at a minimum functional sample I wrote, and let me know why the performance I’m getting is so bad? It includes doing the sums as both 2D and equivalent 1D, running on both CPU and GPU.
I’m running it on a Jetson Xavier NX (384 Cuda Cores).
Times I’m getting on this are:
- CPU 1D Arrays = 554ms
- CPU 2D Arrays = 669ms
- GPU 1D Arrays = 1278ms
- GPU 2D Arrays = 1622ms to 4175ms (not sure if timer is reliable)
i.e. GPU is distinctly worse.
Code below should be copy/paste/compile/run-able.
#include <iostream>
#include <chrono>
#include <math.h>
using std::chrono::time_point;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;
class timer {
public:
timer(void) : start(high_resolution_clock::now()) {}
template<class To>
auto time_past(void) {
auto now = high_resolution_clock::now();
auto d = duration_cast<To>(now - start);
return d;
}
private:
time_point<high_resolution_clock> start;
};
__global__ void Cu_1D_ADD(short * CUMULATIVE_MTX, short * NEW_MTX, ushort FrameHeight, ushort FrameWidth) {
unsigned int Column = blockIdx.x * blockDim.x + threadIdx.x;
if (Column >= FrameWidth) return;
int Start = FrameHeight*Column;
for (int Row = 0; Row < FrameHeight; ++Row) {
CUMULATIVE_MTX[Start + Row] += NEW_MTX[Start + Row];
}
}
__global__ void Cu_2D_ADD(short ** CUMULATIVE_MTX, short ** NEW_MTX, ushort FrameHeight, ushort FrameWidth) {
unsigned int Column = blockIdx.x * blockDim.x + threadIdx.x;
if (Column >= FrameWidth) return;
for (int Row = 0; Row < FrameHeight; ++Row) {
CUMULATIVE_MTX[Column][Row] += NEW_MTX[Column][Row];
}
}
int main () {
ushort FrameWidth = 6400;
ushort FrameHeight = 4800;
int BlockSize = 384;
int NumBlocks = (int)ceil( ((double)FrameWidth) / ((double)BlockSize) );
std::cout << "NumBlocks = " << NumBlocks << "\n\n";
long Check;
// Define 1D Arrays
short* CUMULATIVE_MTX_1D;
cudaMallocManaged(&CUMULATIVE_MTX_1D, FrameWidth*FrameHeight*sizeof(short));
short* NEW_MTX_1D;
cudaMallocManaged(&NEW_MTX_1D, FrameWidth*FrameHeight*sizeof(short));
// Define 2D Arrays
short** CUMULATIVE_MTX_2D;
cudaMallocManaged(&CUMULATIVE_MTX_2D, FrameWidth*sizeof(short *));
cudaMallocManaged(&CUMULATIVE_MTX_2D[0], FrameWidth*FrameHeight*sizeof(short));
for (int col = 1; col < FrameWidth; ++ col) {
CUMULATIVE_MTX_2D[col] = CUMULATIVE_MTX_2D[col-1] + FrameHeight;
}
short** NEW_MTX_2D;
cudaMallocManaged(&NEW_MTX_2D, FrameWidth*sizeof(short *));
cudaMallocManaged(&NEW_MTX_2D[0], FrameWidth*FrameHeight*sizeof(short));
for (int col = 1; col < FrameWidth; ++ col) {
NEW_MTX_2D[col] = NEW_MTX_2D[col-1] + FrameHeight;
}
// Test 1D Runtime on CPU
std::fill(CUMULATIVE_MTX_1D, CUMULATIVE_MTX_1D + FrameWidth*FrameHeight, 5);
std::fill(NEW_MTX_1D, NEW_MTX_1D + FrameWidth*FrameHeight, 2);
timer t_CPU1;
for (int Repeat = 0; Repeat < 10; ++Repeat) {
for (int Px = 0; Px < FrameWidth*FrameHeight; ++Px) {
CUMULATIVE_MTX_1D[Px] += NEW_MTX_1D[Px];
}
}
std::cout << "CPU 1D Time = " << t_CPU1.time_past<milliseconds>().count() << " ms\n";
Check = 0;
for (int Px = 0; Px < FrameWidth*FrameHeight; ++Px) {
if (!(CUMULATIVE_MTX_1D[Px] == 25)) ++Check;
}
std::cout << "CPU 1D [0] = " << CUMULATIVE_MTX_1D[0] << " First Element\n";
std::cout << "CPU 1D Sum = " << Check << " Values Incorrect\n\n";
// Test 2D Runtime on CPU
std::fill(CUMULATIVE_MTX_2D[0], CUMULATIVE_MTX_2D[0] + FrameWidth*FrameHeight, 5);
std::fill(NEW_MTX_2D[0], NEW_MTX_2D[0] + FrameWidth*FrameHeight, 2);
timer t_CPU2;
for (int Repeat = 0; Repeat < 10; ++Repeat) {
for (int Column = 0; Column < FrameWidth; ++Column) {
for (int Row = 0; Row < FrameHeight; ++Row) {
CUMULATIVE_MTX_2D[Column][Row] += NEW_MTX_2D[Column][Row];
}
}
}
std::cout << "CPU Time 2D = " << t_CPU2.time_past<milliseconds>().count() << " ms\n";
Check = 0;
for (int Column = 0; Column < FrameWidth; ++Column) {
for (int Row = 0; Row < FrameHeight; ++Row) {
if (!(CUMULATIVE_MTX_2D[Column][Row] == 25)) ++Check;
}
}
std::cout << "CPU 2D [0] = " << CUMULATIVE_MTX_2D[0][0] << " First Element\n";
std::cout << "CPU 2D Sum = " << Check << " Values Incorrect\n\n";
// Test 1D Runtime on GPU
std::fill(CUMULATIVE_MTX_1D, CUMULATIVE_MTX_1D + FrameWidth*FrameHeight, 5);
std::fill(NEW_MTX_1D, NEW_MTX_1D + FrameWidth*FrameHeight, 2);
timer t_GPU1;
for (int Repeat = 0; Repeat < 10; ++Repeat) {
Cu_1D_ADD<<<NumBlocks,BlockSize>>>(CUMULATIVE_MTX_1D, NEW_MTX_1D, FrameHeight, FrameWidth);
cudaDeviceSynchronize();
}
std::cout << "GPU Time 1D = " << t_GPU1.time_past<milliseconds>().count() << " ms\n";
Check = 0;
for (int Px = 0; Px < FrameWidth*FrameHeight; ++Px) {
if (!(CUMULATIVE_MTX_1D[Px] == 25)) ++Check;
}
std::cout << "GPU 1D [0] = " << CUMULATIVE_MTX_1D[0] << " First Element\n";
std::cout << "GPU 1D Sum = " << Check << " Values Incorrect\n\n";
// Test 2D Runtime on GPU
std::fill(CUMULATIVE_MTX_2D[0], CUMULATIVE_MTX_2D[0] + FrameWidth*FrameHeight, 5);
std::fill(NEW_MTX_2D[0], NEW_MTX_2D[0] + FrameWidth*FrameHeight, 2);
timer t_GPU2;
for (int Repeat = 0; Repeat < 10; ++Repeat) {
Cu_2D_ADD<<<NumBlocks,BlockSize>>>(CUMULATIVE_MTX_2D, NEW_MTX_2D, FrameHeight, FrameWidth);
cudaDeviceSynchronize();
}
std::cout << "GPU Time 2D = " << t_CPU2.time_past<milliseconds>().count() << " ms\n";
Check = 0;
for (int Column = 0; Column < FrameWidth; ++Column) {
for (int Row = 0; Row < FrameHeight; ++Row) {
if (!(CUMULATIVE_MTX_2D[Column][Row] == 25)) ++Check;
}
}
std::cout << "GPU 2D [0] = " << CUMULATIVE_MTX_2D[0][0] << " First Element\n";
std::cout << "GPU 2D Sum = " << Check << " Values Incorrect\n\n";
return 0;
}
Thank you for any assistance, I’m at the point where I don’t really know where to start looking for an answer.