Hi there.
I’m relatively new to CUDA and have been working my way through a bunch of CUDA tutorials, documentation bits and forum posts over the last weeks. My C++ PoC code launching a kernel that calls a couple of device-only functions doing some geometrical operations works fine.
Right now, I’m trying to optimize data transfers and kernel execution to further improve the programme’s performance. Overlapping data transfers and kernel executions seems to be a good way as we’re talking about hundreds of millions of geometrical objects that can be processed independently. When I profile the programme in Nsight Systems 2024.7.1 though, transfers and kernel executions seem to happen strictly sequentially. I’d appreciate any suggestions on what I’m doing wrong here.
GPU specs:
NVIDIA RTX A2000 Laptop GPU
memory: 4095 MiB
deviceOverlap: 1
concurrentKernels: 1
asyncEngineCount: 5
computeCapability: 8.6
Relevant bits of PoC code:
#include <cstdio>  // fprintf for CUDA error reporting

// Runs the point-in-polygon test on the GPU: classifies cPtRequest query points
// against a polygon with cPtPoly vertices, writing one bool per query point into
// bPtRequestIn. Returns elapsed wall-clock time in milliseconds (GetTickCount).
//
// Preconditions: ptRequest and bPtRequestIn must be pinned host memory
// (cudaMallocHost) — with pageable memory, cudaMemcpyAsync silently degrades to
// a synchronous copy and no transfer/compute overlap is possible.
long calcGPU(PointXY* ptRequest, long cPtRequest, PointXY* ptPoly, long cPtPoly, bool* bPtRequestIn, bool bOverlapTransactions) {
    long begin = GetTickCount();
    // Minimal error reporter: every CUDA runtime call returns cudaError_t, and a
    // sticky error makes all later calls fail mysteriously if ignored.
    auto check = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        }
    };
    PointXY* ptRequestDev = nullptr;
    PointXY* ptPolyDev = nullptr;
    bool* bPtRequestInDev = nullptr;
    // Select device and query its properties. Stack allocation fixes the original
    // leak: the heap-allocated cudaDeviceProp was never deleted.
    check(cudaSetDevice(0), "cudaSetDevice");
    cudaDeviceProp devProps;
    check(cudaGetDeviceProperties(&devProps, 0), "cudaGetDeviceProperties");
    check(cudaMalloc((void**)&ptRequestDev, cPtRequest * sizeof(PointXY)), "cudaMalloc ptRequestDev");
    check(cudaMalloc((void**)&ptPolyDev, cPtPoly * sizeof(PointXY)), "cudaMalloc ptPolyDev");
    check(cudaMalloc((void**)&bPtRequestInDev, cPtRequest * sizeof(bool)), "cudaMalloc bPtRequestInDev");
    // The polygon is read by every chunk, so it is transferred once, up front.
    check(cudaMemcpy(ptPolyDev, ptPoly, cPtPoly * sizeof(PointXY), cudaMemcpyHostToDevice), "H2D polygon");
    int blockSize = 512;
    if (bOverlapTransactions && devProps.deviceOverlap) {
        // *** Transfer data in smaller portions asynchronously and compute in parallel ***
        // NOTE(review): asyncEngineCount bounds how many copies can be in flight, but
        // using it as the chunk COUNT is coarse — more, smaller chunks per stream
        // usually pipeline better; worth experimenting once overlap shows up.
        int iNumGroups = devProps.asyncEngineCount;
        long iChunkSize = (cPtRequest + iNumGroups - 1) / iNumGroups; // round up so no elements are dropped
        cudaStream_t* streams = new cudaStream_t[iNumGroups];
        int iStreamsCreated = 0;
        for (int iIndex = 0; iIndex < iNumGroups; iIndex++) {
            // long offset: with hundreds of millions of elements, int arithmetic
            // against a long element count invites silent overflow.
            long offset = (long)iIndex * iChunkSize;
            long iCurrChunkSize = iChunkSize;
            if (offset + iCurrChunkSize > cPtRequest) {
                iCurrChunkSize = cPtRequest - offset;
            }
            // Guard the tail: when cPtRequest < iNumGroups the last chunks are empty,
            // and a 0-block kernel launch is a cudaErrorInvalidConfiguration.
            if (iCurrChunkSize <= 0) {
                break;
            }
            check(cudaStreamCreate(&streams[iIndex]), "cudaStreamCreate");
            iStreamsCreated++;
            // ptRequest / bPtRequestIn are pinned (cudaMallocHost) — required for
            // these async copies to actually overlap with the kernels.
            check(cudaMemcpyAsync(&ptRequestDev[offset], &ptRequest[offset], iCurrChunkSize * sizeof(PointXY), cudaMemcpyHostToDevice, streams[iIndex]), "H2D chunk");
            ptInPolyKernel<<<(int)((iCurrChunkSize + blockSize - 1) / blockSize), blockSize, 0, streams[iIndex]>>>(ptRequestDev, cPtRequest, ptPolyDev, cPtPoly, bPtRequestInDev, offset);
            check(cudaGetLastError(), "ptInPolyKernel launch"); // launch-config errors only surface here
            check(cudaMemcpyAsync(&bPtRequestIn[offset], &bPtRequestInDev[offset], iCurrChunkSize * sizeof(bool), cudaMemcpyDeviceToHost, streams[iIndex]), "D2H chunk");
        }
        check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        // Clean up streams (only those actually created).
        for (int iIndex = 0; iIndex < iStreamsCreated; iIndex++) {
            cudaStreamDestroy(streams[iIndex]);
        }
        delete[] streams;
    } else {
        // Synchronous fallback: the original code computed NOTHING when overlap was
        // disabled/unsupported — one full transfer, one launch, one copy back.
        int numBlocks = (int)((cPtRequest + blockSize - 1) / blockSize);
        check(cudaMemcpy(ptRequestDev, ptRequest, cPtRequest * sizeof(PointXY), cudaMemcpyHostToDevice), "H2D full");
        ptInPolyKernel<<<numBlocks, blockSize>>>(ptRequestDev, cPtRequest, ptPolyDev, cPtPoly, bPtRequestInDev, 0);
        check(cudaGetLastError(), "ptInPolyKernel launch");
        check(cudaMemcpy(bPtRequestIn, bPtRequestInDev, cPtRequest * sizeof(bool), cudaMemcpyDeviceToHost), "D2H full");
    }
    cudaFree(ptRequestDev);
    cudaFree(ptPolyDev);
    cudaFree(bPtRequestInDev);
    return GetTickCount() - begin;
}
Nsight Systems screenshot: