Problems with OpenCV GPU accelerated HOG detect

I am running a TX1 with OpenCV4Tegra 2.4.13 alongside ROS Kinetic on Ubuntu 16.04. I am attempting to get a very primitive people detector setup and am scratching my head at the error I am receiving when the application is run:

OpenCV Error: Assertion failed (padding == Size(0, 0)) in detect, file /hdd/buildbot/slave_jetson_tx_3/35-O4T-L4T-R24/opencv/modules/gpu/src/hog.cpp, line 365
terminate called after throwing an instance of 'cv::Exception'
  what():  /hdd/buildbot/slave_jetson_tx_3/35-O4T-L4T-R24/opencv/modules/gpu/src/hog.cpp:365: error: (-215) padding == Size(0, 0) in function detect

The error seems as if it would be clear enough, however padding should be getting explicitly set to Size(16, 16) and I am unable to track down why it is being seen as equivalent to Size(0,0). I have done simple sanity checks such as creating a variable of type Size and doing the comparison myself before passing it to the function showing that it should not show as equivalent to Size(0,0)

Hacked together source:

using namespace std;

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <stdio.h>
#include <vector>
#include <iostream>
#include "ros/ros.h"
#include "sensor_msgs/Image.h"

cv::Mat image_;
bool new_img_ = false;

void callback(const sensor_msgs::Image::ConstPtr& msg)
{
    if (new_img_) return;
    image_ = cv::Mat((int)msg->width, (int)msg->height, CV_8U, (uint8_t*)&msg->data[0]).clone().reshape(0, msg->height);
    new_img_ = true;
}

int main (int argc, char * argv[])
{
    printf("Using OpenCV %i.%i.%i\n", CV_MAJOR_VERSION, CV_MINOR_VERSION, CV_SUBMINOR_VERSION);
    cv::gpu::HOGDescriptor hog;
    hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());

    ros::init(argc, argv, "peopleDectector_node");
    ros::NodeHandle n;

    ros::Subscriber sub = n.subscribe("/sensors/camera/color_image", 3, callback);

    while (true)
    {
        if (new_img_) {
            vector<cv::Rect> found;

            cv::gpu::GpuMat gpu_img;
            gpu_img.upload(image_);

            // This next line is the source of all the issues, it claims cv::Size(16,16) == Size(0,0)
            hog.detectMultiScale(gpu_img, found, 1.1, cv::Size(8,8), cv::Size(16,16), 1.05, 2);
           
            printf("Found %lu people\n", found.size());
            new_img_ = false;
        }
        ros::spinOnce();
    }
    return 0;
}

I’ve been scratching my head on this one for far too long now. The code works if I transition back to doing the computations on the CPU rather than the GPU which leads me to believe everything is linking appropriately, but to be honest I’m not entirely sure. Thank you in advance for any help!

EDIT:
For extra context, here are the 2 functions pulled from OpenCV. As far as I can tell, padding isn’t even used so the check should be removed.

void cv::gpu::HOGDescriptor::detect(const GpuMat& img, vector<Point>& hits, double hit_threshold, Size win_stride, Size padding)
{
    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);
    CV_Assert(padding == Size(0, 0));

    hits.clear();
    if (detector.empty())
        return;

    computeBlockHistograms(img);

    if (win_stride == Size())
        win_stride = block_stride;
    else
        CV_Assert(win_stride.width % block_stride.width == 0 && win_stride.height % block_stride.height == 0);

    Size wins_per_img = numPartsWithin(img.size(), win_size, win_stride);
    //   labels.create(1, wins_per_img.area(), CV_8U);
    labels = getBuffer(1, wins_per_img.area(), CV_8U, labels_buf);

    hog::classify_hists(win_size.height, win_size.width, block_stride.height, block_stride.width,
                        win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(),
                        detector.ptr<float>(), (float)free_coef, (float)hit_threshold, labels.ptr());

    labels.download(labels_host);
    unsigned char* vec = labels_host.ptr();
    for (int i = 0; i < wins_per_img.area(); i++)
    {
        int y = i / wins_per_img.width;
        int x = i - wins_per_img.width * y;
        if (vec[i])
            hits.push_back(Point(x * win_stride.width, y * win_stride.height));
    }
}

void cv::gpu::HOGDescriptor::detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, double hit_threshold,
                                              Size win_stride, Size padding, double scale0, int group_threshold)
{

    CV_Assert(img.type() == CV_8UC1 || img.type() == CV_8UC4);

    vector<double> level_scale;
    double scale = 1.;
    int levels = 0;

    for (levels = 0; levels < nlevels; levels++)
    {
        level_scale.push_back(scale);
        if (cvRound(img.cols/scale) < win_size.width ||
            cvRound(img.rows/scale) < win_size.height || scale0 <= 1)
            break;
        scale *= scale0;
    }
    levels = std::max(levels, 1);
    level_scale.resize(levels);
    image_scales.resize(levels);

    std::vector<Rect> all_candidates;
    vector<Point> locations;

    for (size_t i = 0; i < level_scale.size(); i++)
    {
        scale = level_scale[i];
        Size sz(cvRound(img.cols / scale), cvRound(img.rows / scale));
        GpuMat smaller_img;

        if (sz == img.size())
            smaller_img = img;
        else
        {
            image_scales[i].create(sz, img.type());
            switch (img.type())
            {
                case CV_8UC1: hog::resize_8UC1(img, image_scales[i]); break;
                case CV_8UC4: hog::resize_8UC4(img, image_scales[i]); break;
            }
            smaller_img = image_scales[i];
        }

        detect(smaller_img, locations, hit_threshold, win_stride, padding);
        Size scaled_win_size(cvRound(win_size.width * scale), cvRound(win_size.height * scale));
        for (size_t j = 0; j < locations.size(); j++)
            all_candidates.push_back(Rect(Point2d((CvPoint)locations[j]) * scale, scaled_win_size));
    }

    found_locations.assign(all_candidates.begin(), all_candidates.end());
    groupRectangles(found_locations, group_threshold, 0.2/*magic number copied from CPU version*/);
}

Could you make sure that gpuMat functions are okay? You could try some simple gpuMat example first to verify it.

Through some desperate attempts at figuring this out I came to the conclusion that I was misinterpreting the error message being delayed. The function detect() requires input Padding to be equivalent to Size(0,0) EVENTHOUGH this value is not ever used internally as far as I can tell. This appears to be remenace from a previous version but ideally this is something that should be removed as it is a blue being passed to a function that Is required to be a specific value even though it appears it has no purpose.