I'm using the OpenTLD library with CUDA 10.2.89 and OpenCV 4.5.2 on a Jetson Xavier NX, following the instructions here, which use CUDA to speed it up (the original code targeted OpenCV 3.0 and CUDA 4, which are very old). However, when I run the tracking section of OpenTLD, I get the following error: terminate called after throwing an instance of
'thrust::system::system_error'
what(): parallel_for failed: cudaErrorInvalidValue: invalid argument
Aborted (core dumped)
I suspect the faulty part is in the following code:
// Record the start tick of this stage (getCPUTick / procInit are declared outside this snippet).
getCPUTick(&procInit);
// Build a GpuMat from the current frame.
cv::cuda::GpuMat gpuImg(img);
// d_inWinIndices is a pointer (it is passed as int* below), so it must be printed with %p.
// Passing it to %d is undefined behaviour and truncates the address on 64-bit targets.
printf("d_inWinIndices : %p - numWindows : %d\n", (void *)d_inWinIndices, numWindows);
// NOTE(review): createIndexArray writes numWindows ints through d_inWinIndices, so the buffer
// must be device memory of at least numWindows * sizeof(int) bytes - confirm the allocation.
createIndexArray(d_inWinIndices, numWindows); // error in here
// Working count of candidate windows; starts with every window.
int numInWins = numWindows;
// Run the two GPU filter stages on the candidate list.
// presumably each filter() shrinks numInWins (cudaVarianceFilter takes it as int&) - verify against the classes.
dynamic_cast<CuVarianceFilter *>(varianceFilter)->filter(gpuImg, d_inWinIndices, numInWins);
dynamic_cast<CuEnsembleClassifier *>(ensembleClassifier)->filter(gpuImg, d_inWinIndices, numInWins);
// Copy the first numInWins indices from the device array into the host array qualifiedWins.
cudaMemcpy(qualifiedWins, d_inWinIndices, numInWins * sizeof(int), cudaMemcpyDeviceToHost);
The function that raises the error is here:
#undef __SSE2__
#include "CUDA.h"
#include <thrust/sequence.h>
#include <thrust/remove.h>
#include <thrust/device_ptr.h>
// #include <opencv2/gpu/gpu.hpp>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
// Fills idxArr with the increasing sequence 0, 1, ..., n - 1 using thrust::sequence.
//
// idxArr : pointer wrapped with thrust::device_pointer_cast, so it has to refer to
//          device memory with room for at least n ints.
// n      : number of elements to write.
//
// A null idxArr or a non-positive n is reported on stdout and skipped instead of being
// handed to Thrust. Errors raised by Thrust itself (thrust::system::system_error) still
// propagate to the caller unchanged.
void createIndexArray(int * idxArr, int n) {
    printf("n : %d\n", n);
    // Reject arguments that cannot describe a valid device range before touching Thrust.
    if (!idxArr || n <= 0) {
        printf("createIndexArray: nothing to do (idxArr = %p, n = %d)\n", (void *)idxArr, n);
        return;
    }
    printf("________function________ : createIndexArray1\n");
    thrust::device_ptr<int> dev_ptr;
    printf("________function________ : createIndexArray2\n");
    dev_ptr = thrust::device_pointer_cast(idxArr);
    printf("________function________ : createIndexArray3\n");
    // NOTE(review): if this still fails with cudaErrorInvalidValue for a valid buffer, check
    // that the binary is built for the GPU's compute capability and that no earlier CUDA
    // call left an error pending - neither can be verified from this function.
    thrust::sequence(dev_ptr, dev_ptr + n);
    printf("________function________ : createIndexArray4\n");
}
// Variance filter kernel: one thread per entry of d_inWinIndices.
//
// Launch layout: 1D grid of 1D blocks; threads whose flat index is >= numInWins do nothing.
//
// integralImg         : indexed as integralImg(row, col) to obtain box sums.
// integralImg_squared : same indexing, used for the sum of squares.
// windows_d           : window table; entry k starts at windows_d[k * TLD_WINDOW_SIZE] and
//                       its first four ints are read as x, y, width, height.
// d_inWinIndices      : in/out; entry idx is overwritten with -1 when the window's variance
//                       is below minVar, otherwise left unchanged.
// numInWins           : number of valid entries in d_inWinIndices.
// minVar              : variance threshold.
//
// The per-thread debug printf was removed: device-side printf is serialized and would run
// once for every launched thread on every call.
__global__ void __cudaVarianceFilter(cv::cuda::PtrStep<int> integralImg, cv::cuda::PtrStep<unsigned long long> integralImg_squared,
                                     int * windows_d, int * d_inWinIndices, int numInWins, float minVar)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    // The grid is rounded up to whole blocks, so trailing threads have no window to process.
    if (idx >= numInWins)
        return;

    const int winIdx = d_inWinIndices[idx];
    const int * win = &windows_d[winIdx * TLD_WINDOW_SIZE];

    // +1 offset on the top-left corner; presumably the integral images carry an extra
    // leading row and column - TODO confirm against the code that builds them.
    const int x1 = win[0] + 1;
    const int y1 = win[1] + 1;
    const int w = win[2];
    const int h = win[3];
    const int x2 = x1 + w - 1;
    const int y2 = y1 + h - 1;
    const float area = w * h;

    // Four-corner lookup: sum over the box, divided by its area, gives the mean.
    const float mX = (integralImg(y2, x2) - integralImg(y1-1, x2) - integralImg(y2, x1-1) + integralImg(y1-1, x1-1)) / area;
    // Same lookup on the squared image gives the mean of squares.
    const unsigned long long l = (integralImg_squared(y2, x2) - integralImg_squared(y1-1, x2) - integralImg_squared(y2, x1-1) + integralImg_squared(y1-1, x1-1));
    const float mX2 = l / area;

    // Var[X] = E[X^2] - E[X]^2
    const float variance = mX2 - mX * mX;
    if (variance < minVar)
        d_inWinIndices[idx] = -1; // mark as rejected; the host wrapper compacts these away
}
// Host wrapper for the variance filter.
//
// Launches __cudaVarianceFilter over the first numInWins entries of d_inWinIndices, waits
// for it to finish, then removes every entry matching is_negative() with thrust::remove_if.
//
// integralImg / integralImg_squared : device images forwarded to the kernel.
// windows_d      : window table forwarded to the kernel.
// d_inWinIndices : in/out index array; compacted in place.
// numInWins      : in/out; on return holds the number of entries that survived the filter.
//                  A non-positive value on entry is treated as "no candidates" and returned as 0.
// minVar         : variance threshold forwarded to the kernel.
void cudaVarianceFilter(cv::cuda::GpuMat integralImg, cv::cuda::GpuMat integralImg_squared,
                        int * windows_d, int * d_inWinIndices, int &numInWins, float minVar)
{
    printf("________function________ : cudaVarianceFilter\n");

    // A zero-sized grid is an invalid launch configuration, so bail out before launching.
    if (numInWins <= 0) {
        numInWins = 0;
        return;
    }

    // Copy the macro into a typed local so the arithmetic below cannot be affected by how
    // VAR_FILT_BLOCK_SIZE is spelled.
    const int threadsPerBlock = VAR_FILT_BLOCK_SIZE;
    // Integer ceil-division; exact for every positive int, unlike the float ceil() form.
    const int numBlocks = (numInWins - 1) / threadsPerBlock + 1;

    dim3 gridSize(numBlocks);
    dim3 blockSize(threadsPerBlock);
    __cudaVarianceFilter<<<gridSize, blockSize>>>(integralImg, integralImg_squared, windows_d, d_inWinIndices, numInWins, minVar);
    cudaCheckErrors(0);

    // Wait for the kernel on the default stream. This replaces a cudaEvent that was created
    // on every call and never destroyed.
    const cudaError_t syncStatus = cudaStreamSynchronize(0);
    if (syncStatus != cudaSuccess)
        printf("cudaVarianceFilter: kernel execution failed: %s\n", cudaGetErrorString(syncStatus));

    // Compact the index array: rejected windows were overwritten with -1 by the kernel.
    thrust::device_ptr<int> idxArr = thrust::device_pointer_cast(d_inWinIndices);
    thrust::device_ptr<int> end = thrust::remove_if(idxArr, idxArr + numInWins, is_negative());
    numInWins = end - idxArr;
}
My English is not good, so please forgive me. If you know of any solution to this problem, please help me. Thank you, and have a nice day!