NVJPEG #6 - cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaMemcpyAsync

Currently I am getting this error every so often. my program could run for 2 days Straight taking images and then suddenly Cuda throws this error NVJPEG #6.

When I run the program with compute sanitizer, I get this:
Program hit cudaErrorInvalidValue (error 1) due to “invalid argument” on CUDA API call to cudaMemcpyAsync.

or

Program hit cudaErrorInvalidValue (error 1) due to “invalid argument” on CUDA API call to cudaGetLastError.

the only argument that I have changing per compression is the image buffer. the images are grey scale

This is how I am adding images to a queue.

CIStStreamBufferPtr pIStStreamBuffer(dataStream->RetrieveBuffer(10000))


if (pIStStreamBuffer->GetIStStreamBufferInfo()->IsImagePresent())
{

	Images *images = new Images;
	IStImage* pResultImage = pIStStreamBuffer->GetIStImage();

	memcpy(images->pImageBuffer, pResultImage->GetImageBuffer(), w * h);

	images->jpegFileName = folderName.str();

	threadMtx->lock();
		imageQueue->push(*images);
	threadMtx->unlock();

	delete images;

}

Once I have images in my queue, I then start running those images through nvjpeg. This is in another thread that is waiting for the queue to be filled up before it starts again.

cudaStreamCreate(&localStream);

CHECK_NVJPEG(nvjpegCreateSimple(&nvjpeg_handle));
CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle, &nvjpeg_encoder_state, localStream));
CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle, &nvjpeg_encode_params, localStream));

CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(nvjpeg_encode_params, 90, localStream));
CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(nvjpeg_encode_params, NVJPEG_CSS_444, localStream));

uint8_t* pinnedBuf, * rawBuf;
nvjpegImage_t devRaw;
cudaMallocHost((void**)&pinnedBuf, w * h);
CHECK_CUDA(cudaMalloc((void**)&rawBuf, w * h));

while (true) {
if (!imageQueue->empty()) {

threadMtx->lock();
Images * image = new Images(imageQueue->front());
imageQueue->pop();
	
threadMtx->unlock();

memcpy(pinnedBuf, image->pImageBuffer, w * h); //Pass image buffer here	


devRaw.channel[0] = rawBuf; // no color
devRaw.channel[1] = rawBuf; //no color
devRaw.channel[2] = rawBuf; // no color
devRaw.pitch[0] = w;
devRaw.pitch[1] = w;
devRaw.pitch[2] = w;


CHECK_CUDA(cudaMemcpyAsync(rawBuf, pinnedBuf, w * h, cudaMemcpyHostToDevice, localStream));
	
CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle,
	nvjpeg_encoder_state,
	nvjpeg_encode_params,
	&devRaw,
	NVJPEG_INPUT_BGR,
	w,
	h,
	localStream));
	
	
cudaStreamSynchronize(localStream); // wait for stream's memory to arrive

// fails here 	
CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream( 
	nvjpeg_handle,
	nvjpeg_encoder_state,
	nullptr,
	&length,
	localStream));
	
// fails here 
CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
	nvjpeg_handle,
	nvjpeg_encoder_state,
	pinnedBuf,
	&length,
	localStream));
	
cudaStreamSynchronize(localStream);
					

what are my possible issues? Also, I use a try catch on this so if an image fails it goes to the next image in the queue and that image will fail but after a few images they will all work fine.

I can save the buffer as a .bin file and load it into another program I created to just do NVJPEG compression like the function above and it works with no problems.