Question about CUDA import NvSciBuf as external memory

Hi,
I have developed a program that uses NvSciBuf as an external memory for CUDA. Below is the code that I have written. In this code, inputBufObjBuffer_cpu refers to a CPU pointer that points to inputBufObj, and outputBufObjBuffer_cpu refers to a CPU pointer that points to outputBufObj.

I initialized the kernel input data by setting inputBufObjBuffer_cpu. After the kernel has finished, I expected to be able to access the output data by using outputBufObjBuffer_cpu. However, when I printed the data in it, all values were zero. It was not until I copied outputBufObjBuffer to outputBufObjBuffer_cpu using cudaMemcpy that I was able to get the output data.

As I am new to NvSciBuf, I might have misunderstood some concepts. Could you please help me understand why I cannot access outputBufObjBuffer by using outputBufObjBuffer_cpu?

#include "cudaNvSci.h"
#include <cstdio>
#include <cuda.h>
#include <helper_cuda.h>
#include <helper_image.h>
#include <time.h>
#include <vector>
#define DPRINTF(...) printf(__VA_ARGS__)
void launchMyKernel(float *d_in, float *d_out, int numThreadsPerBlock,
                    int numElements, cudaStream_t stream);
void initialData(float *ip, int size) {
  time_t t;
  srand((unsigned)time(&t));
  for (int i = 0; i < size; i++) {
    ip[i] = (float)(rand() & 0xffff) / 1000.0f;
  }
}
void checkResult(float *hostRef, float *gpuRef, const int N) {
  double epsilon = 1.0E-8;
  for (int i = 0; i < N; i++) {
    // printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i],
    // i);
    if (abs(hostRef[i] - gpuRef[i]) > epsilon) {
      printf("Results don\'t match!\n");
      printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i],
             i);
      return;
    }
  }
  printf("Check result success!\n");
}

void doubleElements(float *in, float *out, int size) {
  for (int i = 0; i < size; i++) {
    out[i] = in[i] * 2;
  }
}
NvSciError createAndSetAttrList(NvSciBufModule module, uint64_t bufSize,
                                NvSciBufAttrList *attrList, CUuuid devUUID) {
  NvSciError sciStatus = NvSciError_Success;

  sciStatus = NvSciBufAttrListCreate(module, attrList);
  if (sciStatus != NvSciError_Success) {
    DPRINTF("Error in creating NvSciBuf attribute list\n");
    return sciStatus;
  }

  bool needCpuAccess = true;
  NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
  uint32_t dimcount = 1;
  printf("[debug] %s:%d bufSize: %ld.\n", __FILE__, __LINE__, bufSize);
  uint64_t sizes[] = {bufSize};
  uint32_t alignment[] = {1};
  uint32_t dataType = NvSciDataType_Float32;
  // uint32_t dataType = NvSciDataType_Int8;
  NvSciBufType type = NvSciBufType_RawBuffer;
  uint64_t baseAddrAlign = 512;

  NvSciBufAttrKeyValuePair setAttrs[] = {
      {NvSciBufRawBufferAttrKey_Size, &sizes, sizeof(sizes)},
      {NvSciBufGeneralAttrKey_Types, &type, sizeof(type)},
      {NvSciBufGeneralAttrKey_NeedCpuAccess, &needCpuAccess,
       sizeof(needCpuAccess)},
      {NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
      {NvSciBufGeneralAttrKey_GpuId, &devUUID, sizeof(devUUID)},
  };
  size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair);

  sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length);
  if (sciStatus != NvSciError_Success) {
    DPRINTF("Error in setting NvSciBuf attribute list\n");
    return sciStatus;
  }

  return sciStatus;
}

int main(int argc, const char **argv) {
  int numOfGPUs = 0;
  std::vector<int> deviceIds;
  checkCudaErrors(cudaGetDeviceCount(&numOfGPUs));

  printf("%d GPUs found\n", numOfGPUs);
  if (!numOfGPUs) {
    exit(EXIT_WAIVED);
  } else {
    for (int devID = 0; devID < numOfGPUs; devID++) {
      int major = 0, minor = 0;
      checkCudaErrors(cudaDeviceGetAttribute(
          &major, cudaDevAttrComputeCapabilityMajor, devID));
      checkCudaErrors(cudaDeviceGetAttribute(
          &minor, cudaDevAttrComputeCapabilityMinor, devID));
      if (major >= 6) {
        deviceIds.push_back(devID);
      }
    }
    if (deviceIds.size() == 0) {
      printf("cudaNvSci requires one or more GPUs of Pascal(SM 6.0) or higher "
             "archs\nWaiving..\n");
      exit(EXIT_WAIVED);
    }
  }
  CUuuid m_devUUID;
  CUresult res = cuDeviceGetUuid_v2(&m_devUUID, 0);

  if (res != CUDA_SUCCESS) {
    fprintf(stderr, "Driver API error = %04d \n", res);
    exit(EXIT_FAILURE);
  }
  int32_t vector_size = 1024;
  NvSciBufModule bufModule = NULL;
  NvSciBufAttrList inputAttrList = NULL;
  NvSciBufAttrList outputAttrList = NULL;
  NvSciBufAttrList reconciledInputAttrList = NULL;
  NvSciBufAttrList reconciledOutputAttrList = NULL;
  NvSciBufAttrList inputConflictList = NULL;
  NvSciBufAttrList outputConflictList = NULL;
  NvSciError sciError = NvSciError_Success;
  sciError = NvSciBufModuleOpen(&bufModule);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in initializing NvSciBufModule\n");
    return 1;
  }
  sciError = createAndSetAttrList(bufModule, vector_size * sizeof(float),
                                  &inputAttrList, m_devUUID);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in creating NvSciBuf attribute list\n");
    return 1;
  }
  sciError = NvSciBufAttrListReconcile(
      &inputAttrList, 1, &reconciledInputAttrList, &inputConflictList);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in reconciling NvSciBuf attribute list\n");
    return 1;
  }

  sciError = createAndSetAttrList(bufModule, vector_size * sizeof(float),
                                  &outputAttrList, m_devUUID);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in creating NvSciBuf attribute list\n");
    return 1;
  }
  sciError = NvSciBufAttrListReconcile(
      &outputAttrList, 1, &reconciledOutputAttrList, &outputConflictList);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in reconciling NvSciBuf attribute list\n");
    return 1;
  }
  NvSciBufObj inputBufObj, outputBufObj;
  sciError = NvSciBufObjAlloc(reconciledInputAttrList, &inputBufObj);
  printf("Address of inputBufObj : %p.\n", (void *)inputBufObj);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in allocating NvSciBuf object\n");
    return 1;
  }

  sciError = NvSciBufObjAlloc(reconciledOutputAttrList, &outputBufObj);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in allocating NvSciBuf object\n");
    return 1;
  }

  void *inputBufObjBuffer = NULL;
  void *outputBufObjBuffer = NULL;
  void *inputBufObjBuffer_cpu;
  void *outputBufObjBuffer_cpu;
  /*************** Import input to cuda **************/

  // Query NvSciBuf Object
  NvSciBufAttrList m_buffAttrListOut[1];
  NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[0]);

  NvSciBufAttrKeyValuePair bufattrs[1];
  memset(bufattrs, 0, sizeof(NvSciBufAttrKeyValuePair));
  bufattrs[0].key = NvSciBufRawBufferAttrKey_Size;
  NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], bufattrs, 1);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in NvSciBufAttrListGetAttrs\n");
    return 1;
  }
  printf("address of bufattrs[0].key : %d\n", bufattrs[0].key);
  printf("address of bufattrs[0].value : %p\n", bufattrs[0].value);
  uint64_t ret_size = *(static_cast<const uint64_t *>(bufattrs[0].value));
  printf("address of bufattrs[0].ret_size : %ld\n", ret_size);
  // NvSciBuf Registration With CUDA
  cudaExternalMemory_t extMemBuffer;
  // Fill up CUDA_EXTERNAL_MEMORY_HANDLE_DESC
  cudaExternalMemoryHandleDesc memHandleDesc;
  memset(&memHandleDesc, 0, sizeof(memHandleDesc));
  memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
  printf("Address of inputBufObj : %p.\n", (void *)inputBufObj);
  memHandleDesc.handle.nvSciBufObject = inputBufObj;
  memHandleDesc.size = ret_size;
  checkCudaErrors(cudaImportExternalMemory(&extMemBuffer, &memHandleDesc));
  // Mapping to CUDA
  cudaExternalMemoryBufferDesc bufferDesc;
  memset(&bufferDesc, 0, sizeof(bufferDesc));
  bufferDesc.offset = 0;
  bufferDesc.size = ret_size;
  cudaExternalMemoryGetMappedBuffer(&inputBufObjBuffer, extMemBuffer,
                                    &bufferDesc);
  /*************** Import output to cuda **************/
  // Query NvSciBuf Object
  NvSciBufObjGetAttrList(outputBufObj, &m_buffAttrListOut[0]);
  memset(bufattrs, 0, sizeof(NvSciBufAttrKeyValuePair));
  bufattrs[0].key = NvSciBufRawBufferAttrKey_Size;
  NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], bufattrs, 1);
  printf("address of bufattrs[0].value : %p\n", bufattrs[0].value);
  ret_size = *(static_cast<const uint64_t *>(bufattrs[0].value));
  // NvSciBuf Registration With CUDA
  cudaExternalMemory_t extMemBuffer_1;
  // Fill up CUDA_EXTERNAL_MEMORY_HANDLE_DESC
  cudaExternalMemoryHandleDesc memHandleDesc_1;
  memset(&memHandleDesc_1, 0, sizeof(memHandleDesc_1));
  memHandleDesc_1.type = cudaExternalMemoryHandleTypeNvSciBuf;
  memHandleDesc_1.handle.nvSciBufObject = outputBufObj;
  memHandleDesc_1.size = ret_size;
  checkCudaErrors(cudaImportExternalMemory(&extMemBuffer_1, &memHandleDesc_1));
  // Mapping to CUDA
  cudaExternalMemoryBufferDesc bufferDesc_1;
  memset(&bufferDesc_1, 0, sizeof(bufferDesc_1));
  bufferDesc_1.offset = 0;
  bufferDesc_1.size = ret_size;
  cudaExternalMemoryGetMappedBuffer(&outputBufObjBuffer, extMemBuffer_1,
                                    &bufferDesc_1);

  sciError = NvSciBufObjGetCpuPtr(inputBufObj, &inputBufObjBuffer_cpu);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in getting NvSciBuf CPU pointer\n");
    return 1;
  }
  memset(inputBufObjBuffer_cpu, 0, vector_size * sizeof(float));
  sciError = NvSciBufObjGetCpuPtr(outputBufObj, &outputBufObjBuffer_cpu);
  if (sciError != NvSciError_Success) {
    DPRINTF("Error in getting NvSciBuf CPU pointer\n");
    return 1;
  }
  memset(outputBufObjBuffer_cpu, 0, vector_size * sizeof(float));
  // Initialize the input data
  initialData(static_cast<float *>(inputBufObjBuffer_cpu), vector_size);
  cudaStream_t stream;
  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
  int threadPerBlock = 1024;
  launchMyKernel(static_cast<float *>(inputBufObjBuffer),
                 static_cast<float *>(outputBufObjBuffer), threadPerBlock,
                 vector_size, stream);
  cudaDeviceSynchronize();
  float *res_cpu = new float[vector_size];
  memset(res_cpu, 0, sizeof(float) * vector_size);
  doubleElements(static_cast<float *>(inputBufObjBuffer_cpu), res_cpu,
                 vector_size);
  checkResult(res_cpu, static_cast<float *>(outputBufObjBuffer_cpu),
              vector_size);
  //  release resources
  delete[] res_cpu;
  cudaStreamDestroy(stream);
  NvSciBufObjFree(inputBufObj);
  NvSciBufObjFree(outputBufObj);
  NvSciBufAttrListFree(reconciledInputAttrList);
  NvSciBufAttrListFree(reconciledOutputAttrList);
  NvSciBufAttrListFree(inputAttrList);
  NvSciBufAttrListFree(outputAttrList);
  NvSciBufAttrListFree(inputConflictList);
  NvSciBufAttrListFree(outputConflictList);
  cudaFree(inputBufObjBuffer);
  cudaFree(outputBufObjBuffer);
  return EXIT_SUCCESS;
}

The reason why is due to the fact that NvSciBuf uses a separate memory space for GPU and CPU. When you launch a CUDA kernel and modify the memory through the GPU, the changes are not directly visible on the CPU side. That’s why you need to copy the output from the GPU memory space to the CPU memory space using cudaMemcpy.

Try using this modified version of the main function that adds cudaMemcpy to synchronize the GPU and CPU memory spaces for the output buffer:

int main(int argc, const char **argv) {
// … (The rest of the code remains unchanged)

cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
int threadPerBlock = 1024;
launchMyKernel(static_cast<float *>(inputBufObjBuffer),
static_cast<float *>(outputBufObjBuffer), threadPerBlock,
vector_size, stream);
cudaDeviceSynchronize();

// Copy the output buffer from GPU to CPU memory
checkCudaErrors(cudaMemcpy(outputBufObjBuffer_cpu, outputBufObjBuffer,
vector_size * sizeof(float),
cudaMemcpyDeviceToHost));

float *res_cpu = new float[vector_size];
memset(res_cpu, 0, sizeof(float) * vector_size);
doubleElements(static_cast<float *>(inputBufObjBuffer_cpu), res_cpu,
vector_size);
checkResult(res_cpu, static_cast<float *>(outputBufObjBuffer_cpu),
vector_size);

// release resources
delete res_cpu;
cudaStreamDestroy(stream);
NvSciBufObjFree(inputBufObj);
NvSciBufObjFree(outputBufObj);
NvSciBufAttrListFree(reconciledInputAttrList);
NvSciBufAttrListFree(reconciledOutputAttrList);
NvSciBufAttrListFree(inputAttrList);
NvSciBufAttrListFree(outputAttrList);
NvSciBufAttrListFree(inputConflictList);
NvSciBufAttrListFree(outputConflictList);
cudaFree(inputBufObjBuffer);
cudaFree(outputBufObjBuffer);
return EXIT_SUCCESS;
}

By adding cudaMemcpy after the kernel execution and device synchronization, you ensure that the GPU output buffer’s content is properly copied to the CPU memory space, and you should be able to see the correct output values when printing the data.

Let me know how it goes…

Hope this helped you!