Hi,
I have developed a program that uses NvSciBuf as an external memory for CUDA. Below is the code that I have written. In this code, inputBufObjBuffer_cpu
refers to a CPU pointer that points to inputBufObj
, and outputBufObjBuffer_cpu
refers to a CPU pointer that points to outputBufObj
.
I initialized the kernel input data by setting inputBufObjBuffer_cpu
. After the kernel has finished, I expected to be able to access the output data by using outputBufObjBuffer_cpu
. However, when I printed the data in it, all values were zero. It was not until I copied outputBufObjBuffer
to outputBufObjBuffer_cpu
using cudaMemcpy that I was able to get the output data.
As I am new to NvSciBuf, I might have misunderstood some concepts. Could you please help me understand why I cannot access outputBufObjBuffer
by using outputBufObjBuffer_cpu
?
#include "cudaNvSci.h"
#include <cstdio>
#include <cuda.h>
#include <helper_cuda.h>
#include <helper_image.h>
#include <time.h>
#include <vector>
#define DPRINTF(...) printf(__VA_ARGS__)
void launchMyKernel(float *d_in, float *d_out, int numThreadsPerBlock,
int numElements, cudaStream_t stream);
void initialData(float *ip, int size) {
time_t t;
srand((unsigned)time(&t));
for (int i = 0; i < size; i++) {
ip[i] = (float)(rand() & 0xffff) / 1000.0f;
}
}
void checkResult(float *hostRef, float *gpuRef, const int N) {
double epsilon = 1.0E-8;
for (int i = 0; i < N; i++) {
// printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i],
// i);
if (abs(hostRef[i] - gpuRef[i]) > epsilon) {
printf("Results don\'t match!\n");
printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i],
i);
return;
}
}
printf("Check result success!\n");
}
void doubleElements(float *in, float *out, int size) {
for (int i = 0; i < size; i++) {
out[i] = in[i] * 2;
}
}
NvSciError createAndSetAttrList(NvSciBufModule module, uint64_t bufSize,
NvSciBufAttrList *attrList, CUuuid devUUID) {
NvSciError sciStatus = NvSciError_Success;
sciStatus = NvSciBufAttrListCreate(module, attrList);
if (sciStatus != NvSciError_Success) {
DPRINTF("Error in creating NvSciBuf attribute list\n");
return sciStatus;
}
bool needCpuAccess = true;
NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite;
uint32_t dimcount = 1;
printf("[debug] %s:%d bufSize: %ld.\n", __FILE__, __LINE__, bufSize);
uint64_t sizes[] = {bufSize};
uint32_t alignment[] = {1};
uint32_t dataType = NvSciDataType_Float32;
// uint32_t dataType = NvSciDataType_Int8;
NvSciBufType type = NvSciBufType_RawBuffer;
uint64_t baseAddrAlign = 512;
NvSciBufAttrKeyValuePair setAttrs[] = {
{NvSciBufRawBufferAttrKey_Size, &sizes, sizeof(sizes)},
{NvSciBufGeneralAttrKey_Types, &type, sizeof(type)},
{NvSciBufGeneralAttrKey_NeedCpuAccess, &needCpuAccess,
sizeof(needCpuAccess)},
{NvSciBufGeneralAttrKey_RequiredPerm, &perm, sizeof(perm)},
{NvSciBufGeneralAttrKey_GpuId, &devUUID, sizeof(devUUID)},
};
size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair);
sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length);
if (sciStatus != NvSciError_Success) {
DPRINTF("Error in setting NvSciBuf attribute list\n");
return sciStatus;
}
return sciStatus;
}
int main(int argc, const char **argv) {
int numOfGPUs = 0;
std::vector<int> deviceIds;
checkCudaErrors(cudaGetDeviceCount(&numOfGPUs));
printf("%d GPUs found\n", numOfGPUs);
if (!numOfGPUs) {
exit(EXIT_WAIVED);
} else {
for (int devID = 0; devID < numOfGPUs; devID++) {
int major = 0, minor = 0;
checkCudaErrors(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, devID));
checkCudaErrors(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, devID));
if (major >= 6) {
deviceIds.push_back(devID);
}
}
if (deviceIds.size() == 0) {
printf("cudaNvSci requires one or more GPUs of Pascal(SM 6.0) or higher "
"archs\nWaiving..\n");
exit(EXIT_WAIVED);
}
}
CUuuid m_devUUID;
CUresult res = cuDeviceGetUuid_v2(&m_devUUID, 0);
if (res != CUDA_SUCCESS) {
fprintf(stderr, "Driver API error = %04d \n", res);
exit(EXIT_FAILURE);
}
int32_t vector_size = 1024;
NvSciBufModule bufModule = NULL;
NvSciBufAttrList inputAttrList = NULL;
NvSciBufAttrList outputAttrList = NULL;
NvSciBufAttrList reconciledInputAttrList = NULL;
NvSciBufAttrList reconciledOutputAttrList = NULL;
NvSciBufAttrList inputConflictList = NULL;
NvSciBufAttrList outputConflictList = NULL;
NvSciError sciError = NvSciError_Success;
sciError = NvSciBufModuleOpen(&bufModule);
if (sciError != NvSciError_Success) {
DPRINTF("Error in initializing NvSciBufModule\n");
return 1;
}
sciError = createAndSetAttrList(bufModule, vector_size * sizeof(float),
&inputAttrList, m_devUUID);
if (sciError != NvSciError_Success) {
DPRINTF("Error in creating NvSciBuf attribute list\n");
return 1;
}
sciError = NvSciBufAttrListReconcile(
&inputAttrList, 1, &reconciledInputAttrList, &inputConflictList);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciBuf attribute list\n");
return 1;
}
sciError = createAndSetAttrList(bufModule, vector_size * sizeof(float),
&outputAttrList, m_devUUID);
if (sciError != NvSciError_Success) {
DPRINTF("Error in creating NvSciBuf attribute list\n");
return 1;
}
sciError = NvSciBufAttrListReconcile(
&outputAttrList, 1, &reconciledOutputAttrList, &outputConflictList);
if (sciError != NvSciError_Success) {
DPRINTF("Error in reconciling NvSciBuf attribute list\n");
return 1;
}
NvSciBufObj inputBufObj, outputBufObj;
sciError = NvSciBufObjAlloc(reconciledInputAttrList, &inputBufObj);
printf("Address of inputBufObj : %p.\n", (void *)inputBufObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciBuf object\n");
return 1;
}
sciError = NvSciBufObjAlloc(reconciledOutputAttrList, &outputBufObj);
if (sciError != NvSciError_Success) {
DPRINTF("Error in allocating NvSciBuf object\n");
return 1;
}
void *inputBufObjBuffer = NULL;
void *outputBufObjBuffer = NULL;
void *inputBufObjBuffer_cpu;
void *outputBufObjBuffer_cpu;
/*************** Import input to cuda **************/
// Query NvSciBuf Object
NvSciBufAttrList m_buffAttrListOut[1];
NvSciBufObjGetAttrList(inputBufObj, &m_buffAttrListOut[0]);
NvSciBufAttrKeyValuePair bufattrs[1];
memset(bufattrs, 0, sizeof(NvSciBufAttrKeyValuePair));
bufattrs[0].key = NvSciBufRawBufferAttrKey_Size;
NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], bufattrs, 1);
if (sciError != NvSciError_Success) {
DPRINTF("Error in NvSciBufAttrListGetAttrs\n");
return 1;
}
printf("address of bufattrs[0].key : %d\n", bufattrs[0].key);
printf("address of bufattrs[0].value : %p\n", bufattrs[0].value);
uint64_t ret_size = *(static_cast<const uint64_t *>(bufattrs[0].value));
printf("address of bufattrs[0].ret_size : %ld\n", ret_size);
// NvSciBuf Registration With CUDA
cudaExternalMemory_t extMemBuffer;
// Fill up CUDA_EXTERNAL_MEMORY_HANDLE_DESC
cudaExternalMemoryHandleDesc memHandleDesc;
memset(&memHandleDesc, 0, sizeof(memHandleDesc));
memHandleDesc.type = cudaExternalMemoryHandleTypeNvSciBuf;
printf("Address of inputBufObj : %p.\n", (void *)inputBufObj);
memHandleDesc.handle.nvSciBufObject = inputBufObj;
memHandleDesc.size = ret_size;
checkCudaErrors(cudaImportExternalMemory(&extMemBuffer, &memHandleDesc));
// Mapping to CUDA
cudaExternalMemoryBufferDesc bufferDesc;
memset(&bufferDesc, 0, sizeof(bufferDesc));
bufferDesc.offset = 0;
bufferDesc.size = ret_size;
cudaExternalMemoryGetMappedBuffer(&inputBufObjBuffer, extMemBuffer,
&bufferDesc);
/*************** Import output to cuda **************/
// Query NvSciBuf Object
NvSciBufObjGetAttrList(outputBufObj, &m_buffAttrListOut[0]);
memset(bufattrs, 0, sizeof(NvSciBufAttrKeyValuePair));
bufattrs[0].key = NvSciBufRawBufferAttrKey_Size;
NvSciBufAttrListGetAttrs(m_buffAttrListOut[0], bufattrs, 1);
printf("address of bufattrs[0].value : %p\n", bufattrs[0].value);
ret_size = *(static_cast<const uint64_t *>(bufattrs[0].value));
// NvSciBuf Registration With CUDA
cudaExternalMemory_t extMemBuffer_1;
// Fill up CUDA_EXTERNAL_MEMORY_HANDLE_DESC
cudaExternalMemoryHandleDesc memHandleDesc_1;
memset(&memHandleDesc_1, 0, sizeof(memHandleDesc_1));
memHandleDesc_1.type = cudaExternalMemoryHandleTypeNvSciBuf;
memHandleDesc_1.handle.nvSciBufObject = outputBufObj;
memHandleDesc_1.size = ret_size;
checkCudaErrors(cudaImportExternalMemory(&extMemBuffer_1, &memHandleDesc_1));
// Mapping to CUDA
cudaExternalMemoryBufferDesc bufferDesc_1;
memset(&bufferDesc_1, 0, sizeof(bufferDesc_1));
bufferDesc_1.offset = 0;
bufferDesc_1.size = ret_size;
cudaExternalMemoryGetMappedBuffer(&outputBufObjBuffer, extMemBuffer_1,
&bufferDesc_1);
sciError = NvSciBufObjGetCpuPtr(inputBufObj, &inputBufObjBuffer_cpu);
if (sciError != NvSciError_Success) {
DPRINTF("Error in getting NvSciBuf CPU pointer\n");
return 1;
}
memset(inputBufObjBuffer_cpu, 0, vector_size * sizeof(float));
sciError = NvSciBufObjGetCpuPtr(outputBufObj, &outputBufObjBuffer_cpu);
if (sciError != NvSciError_Success) {
DPRINTF("Error in getting NvSciBuf CPU pointer\n");
return 1;
}
memset(outputBufObjBuffer_cpu, 0, vector_size * sizeof(float));
// Initialize the input data
initialData(static_cast<float *>(inputBufObjBuffer_cpu), vector_size);
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
int threadPerBlock = 1024;
launchMyKernel(static_cast<float *>(inputBufObjBuffer),
static_cast<float *>(outputBufObjBuffer), threadPerBlock,
vector_size, stream);
cudaDeviceSynchronize();
float *res_cpu = new float[vector_size];
memset(res_cpu, 0, sizeof(float) * vector_size);
doubleElements(static_cast<float *>(inputBufObjBuffer_cpu), res_cpu,
vector_size);
checkResult(res_cpu, static_cast<float *>(outputBufObjBuffer_cpu),
vector_size);
// release resources
delete[] res_cpu;
cudaStreamDestroy(stream);
NvSciBufObjFree(inputBufObj);
NvSciBufObjFree(outputBufObj);
NvSciBufAttrListFree(reconciledInputAttrList);
NvSciBufAttrListFree(reconciledOutputAttrList);
NvSciBufAttrListFree(inputAttrList);
NvSciBufAttrListFree(outputAttrList);
NvSciBufAttrListFree(inputConflictList);
NvSciBufAttrListFree(outputConflictList);
cudaFree(inputBufObjBuffer);
cudaFree(outputBufObjBuffer);
return EXIT_SUCCESS;
}