So I have a class like this:
class InferenceObject
{
public:
    // CUDA stream created with cudaStreamNonBlocking
    cudaStream_t stream;
    // Pointers to GPU memory allocated with cudaMalloc()
    void* bindings[3];
    // Flag set once enqueue() has been called with this stream and these bindings
    bool started = false;
    // Network holds the ICudaEngine with the dims used to allocate the bindings
    void create(Network* network);
    // Deallocates the device memory and destroys the CUDA stream
    void clear();
    // Returns true if (started == true && cudaStreamQuery(stream) == cudaSuccess)
    bool cudaExecutionCompleted();
    InferenceObject();
    InferenceObject(Network* network);
    ~InferenceObject();
};
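The member functions do roughly the following (a sketch, not the exact code; bindingSize(i) is a placeholder for looking up each binding's size from the engine dimensions):

void InferenceObject::create(Network* network)
{
    // Non-blocking stream, as noted above
    CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // One device buffer per binding (one input, two outputs)
    for (int i = 0; i < 3; ++i)
        CHECK(cudaMalloc(&bindings[i], network->bindingSize(i) * sizeof(float)));
}

void InferenceObject::clear()
{
    for (int i = 0; i < 3; ++i)
        CHECK(cudaFree(bindings[i]));
    CHECK(cudaStreamDestroy(stream));
    started = false;
}

bool InferenceObject::cudaExecutionCompleted()
{
    return started && cudaStreamQuery(stream) == cudaSuccess;
}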
I process frames from a video stream and create a separate cudaStream_t and set of void* bindings for each frame, so that I can track when the inference for each individual frame has completed. I store all InferenceObject instances in a std::map.

The problem: when I use std::map<int, InferenceObject>, I get CUDA error 17 (invalid device pointer). When I use std::map<int, InferenceObject*> instead, there are no errors, but the network detects nothing.
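For reference, these are the two map declarations I tried in the detector (only one active at a time):

// Variant 1: objects stored by value -> CUDA error 17 (invalid device pointer)
std::map<int, InferenceObject> fCurrentObjectsDetectionsMap;

// Variant 2: raw pointers -> no CUDA errors, but the network detects nothing
std::map<int, InferenceObject*> fCurrentObjectsDetectionsMap;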
Usage in detector with std::map<int, InferenceObject*>:
int Detector::startObjectsDetection(const cv::Mat& image)
{
    // Generate new object id
    int objectsDetectionId = generateObjectsDetectionId();
    // Create object
    InferenceObject* inferObject = new InferenceObject();
    // Create stream and allocate buffers
    inferObject->create(fNetwork);
    // Convert data
    auto blob = convertImageToBlob(image, fNetwork->fInputDims.d[1], fNetwork->fInputDims.d[2]);
    // Copy HtD
    CHECK(cudaMemcpy(inferObject->bindings[0], (void*)(blob.data()), fNetwork->inputBufferSize() * sizeof(float), cudaMemcpyHostToDevice));
    // Start inference
    fContext->enqueue(1, inferObject->bindings, inferObject->stream, NULL);
    // Set flag
    inferObject->started = true;
    // Add into map
    fCurrentObjectsDetectionsMap.insert(std::make_pair(objectsDetectionId, inferObject));
    // Return generated id
    return objectsDetectionId;
}
and the retrieval function:
std::vector<DetectedObject> Detector::retrieveDetectedObjects(int objectsDetectionId, float detectionThreshold, float intersectionOverUnionThreshold)
{
    // Vector of detected objects
    std::vector<DetectedObject> detectedObjects;
    // Find exact InferenceObject
    auto objectsDetection = fCurrentObjectsDetectionsMap.find(objectsDetectionId);
    // If found one
    if (objectsDetection != fCurrentObjectsDetectionsMap.end())
    {
        // Copy output data buffers DtH
        CHECK(cudaMemcpy(fOutBuffers[0], objectsDetection->second->bindings[1], fNetwork->outputBufferSize(0) * sizeof(float), cudaMemcpyDeviceToHost));
        CHECK(cudaMemcpy(fOutBuffers[1], objectsDetection->second->bindings[2], fNetwork->outputBufferSize(1) * sizeof(float), cudaMemcpyDeviceToHost));
        // Process buffers
        processNetworkOutput(fOutBuffers, detectionThreshold, detectedObjects);
        // Reset flag
        objectsDetection->second->started = false;
        std::cout << "\t\tBefore filtering: " << detectedObjects.size() << "\n";
        // Filter objects by IOU
        filterDetectedObjects(detectedObjects, detectionThreshold, intersectionOverUnionThreshold);
        std::cout << "\t\tAfter filtering: " << detectedObjects.size() << "\n";
        // Free device memory and destroy the stream
        objectsDetection->second->clear();
        // Remove InferenceObject from map
        fCurrentObjectsDetectionsMap.erase(objectsDetection);
    }
    // Return detected objects
    return detectedObjects;
}
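fOutBuffers above are two host-side output buffers owned by the Detector. Roughly (assuming plain heap-allocated float buffers; the exact allocation in my code may differ):

// Host-side output buffers, sized to the two network outputs
float* fOutBuffers[2];
fOutBuffers[0] = new float[fNetwork->outputBufferSize(0)];
fOutBuffers[1] = new float[fNetwork->outputBufferSize(1)];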
When I used only one CUDA stream per detector, everything worked fine. But once I moved the streams into a separate per-frame object, I ran into all of these problems.
Any ideas what could be causing this?
Thanks in advance.