Compute and data transfer not happening concurrently with CUDA streams on iteration 2

Hi,
I have written a basic program where a chunk of data is loaded into pinned CPU memory and then transferred to the GPU in chunks, asynchronously, with a computation run on each chunk. Each chunk gets its own stream: within iteration i, I issue the host-to-device copy of chunk i on stream i and the kernels for chunk i-1 on stream i-1, so the copy and the compute should overlap.
The issue I am facing is that the transfer of the second chunk and the computation on the first chunk are not happening concurrently; they run sequentially. For all the other chunks, the overlap happens as expected.

Any suggestions on how I can get the transfer of the second chunk and the computation on the first chunk to run concurrently?

I have uploaded two photos (one for 5 iterations and another for 10 iterations).

(This is on a 16 GB V100; tried with nvcc 11.7 and 12.3.)


Code:

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <utility>
#include <vector>
#include "cuda_runtime.h"
using namespace std;

// Evaluate the call once and report the location of any failure.
#define checkCudaErrors(call)                                                \
    do                                                                       \
    {                                                                        \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess)                                             \
        {                                                                    \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Zero-fill the per-chunk mark array; one thread per element.
__global__ void initialize_mark(char *mark, uint32_t num_e)
{
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x; // linear block index in the 2-D grid
    j = j * blockDim.x + threadIdx.x;                 // global element index
    if (j >= num_e)
        return;
    mark[j] = 0;
}

// Identical to initialize_mark, but for the mask array.
__global__ void initialize_mask(char *mask, uint32_t num_e)
{
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x;
    j = j * blockDim.x + threadIdx.x;
    if (j >= num_e)
        return;
    mask[j] = 0;
}

int main()
{
    cudaFree(nullptr); // force CUDA context creation up front
    uint32_t numIterations = 10;
    std::ios_base::sync_with_stdio(false);
    std::cin.tie(0);
    std::cout.tie(0);

    uint32_t numNodes = 1000000;
    uint64_t numEdges = 20000000;

    uint64_t *edgelist;
    checkCudaErrors(cudaHostAlloc(&edgelist, numEdges * sizeof(uint64_t), cudaHostAllocDefault));

    std::random_device rd;
    std::mt19937_64 gen(rd());
    std::uniform_int_distribution<uint64_t> dis;
    for (uint64_t i = 0; i < numEdges; i++)
    {
        edgelist[i] = dis(gen);
    }

    uint32_t num_threads = 512;                   // -> number of threads per block
    uint32_t num_blocks_n = (numNodes / 512) + 1; // -> number of blocks for nodes
    uint32_t num_blocks_e = (numEdges / 512) + 1; // -> number of blocks for edges
    uint32_t nny = (num_blocks_n / 1000) + 1;     // -> y dimension for nodes
    uint32_t nnx = 1000;                          // -> x dimension for nodes
    uint32_t ney = (num_blocks_e / 1000) + 1;     // -> y dimension for edges
    uint32_t nex = 1000;                          // -> x dimension for edges

    dim3 grid_n(nnx, nny);        // -> grid for nodes
    dim3 grid_e(nex, ney);        // -> grid for edges
    dim3 threads(num_threads, 1); // -> threads per block

    uint64_t numEdgesIteration = (numEdges + numIterations - 1) / numIterations; // -> number of edges per iteration

    char *d_mark;
    char *mask;
    uint64_t *d_edgeList1;
    uint64_t *d_edgeList2;

    checkCudaErrors(cudaMalloc(&d_mark, (numEdgesIteration) * sizeof(char)));
    checkCudaErrors(cudaMalloc(&mask, (numEdgesIteration) * sizeof(char)));

    checkCudaErrors(cudaMalloc(&d_edgeList1, (numEdgesIteration) * sizeof(uint64_t)));
    checkCudaErrors(cudaMalloc(&d_edgeList2, (numEdgesIteration) * sizeof(uint64_t)));

    std::vector<cudaStream_t> cudaStreamArr(numIterations); // one stream per chunk
    for (uint32_t i = 0; i < numIterations; i++)
    {
        checkCudaErrors(cudaStreamCreate(&cudaStreamArr[i]));
    }
    uint64_t currentNumEdges;
    // numIterations + 1 passes: pass i stages chunk i (H2D copy) and runs the
    // kernels on chunk i - 1, so copy and compute should overlap across streams.
    for (uint32_t i = 0; i <= numIterations; i++)
    {
        if (i < numIterations)
        {
            // Number of edges in chunk i (the last chunk may be short).
            uint64_t chunkEdges = min(numEdges, (i + 1) * numEdgesIteration) - i * numEdgesIteration;
            if (chunkEdges > 0)
            {
                checkCudaErrors(cudaMemcpyAsync(d_edgeList2,
                                                edgelist + i * numEdgesIteration,
                                                chunkEdges * sizeof(uint64_t),
                                                cudaMemcpyHostToDevice,
                                                cudaStreamArr[i]));
            }
        }

        if (i > 0)
        {
            currentNumEdges = min(numEdges, i * numEdgesIteration) - (i - 1) * numEdgesIteration;

            // Grid is sized for the full edge list; the guard in the kernels
            // trims the excess threads.
            initialize_mark<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(d_mark, currentNumEdges);

            initialize_mask<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(mask, currentNumEdges);
        }
        checkCudaErrors(cudaDeviceSynchronize()); // join all streams at the end of each pass
        swap(d_edgeList1, d_edgeList2);
    }
    // Cleanup (not strictly needed for the repro).
    for (uint32_t i = 0; i < numIterations; i++)
        checkCudaErrors(cudaStreamDestroy(cudaStreamArr[i]));
    checkCudaErrors(cudaFree(d_mark));
    checkCudaErrors(cudaFree(mask));
    checkCudaErrors(cudaFree(d_edgeList1));
    checkCudaErrors(cudaFree(d_edgeList2));
    checkCudaErrors(cudaFreeHost(edgelist));
    return 0;
}
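
(Aside: the per-pass cudaDeviceSynchronize above deliberately joins every stream. A finer-grained variant, sketched below with the same names as the code above, would synchronize only the two streams active in a pass; my question is about the iteration-2 serialization either way.)

        // Sketch: wait only on the streams touched in pass i, so the host
        // loop does not serialize against unrelated streams.
        if (i < numIterations)
            checkCudaErrors(cudaStreamSynchronize(cudaStreamArr[i]));     // copy of chunk i
        if (i > 0)
            checkCudaErrors(cudaStreamSynchronize(cudaStreamArr[i - 1])); // kernels on chunk i - 1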

Cross-posting: gpu - Compute and Data transfer not happening concurrently in cuda Streams on Iteration 2 - Stack Overflow

It may be this: Persistent Kernel does not work properly on some GPUs - #5 by Robert_Crovella
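
If lazy module loading (CUDA 11.7 and later; see the linked thread) is indeed the culprit, two things worth trying: run with the environment variable CUDA_MODULE_LOADING=EAGER set, or warm each kernel up once before the pipelined loop so the one-time load cost is paid outside the measured region. A minimal sketch of the warm-up, reusing the names from the code above (the launch configuration is just illustrative):

    // Warm-up: launch each kernel once so its module is loaded before the
    // timed pipeline starts. num_e = 0 makes the bounds guard return
    // immediately, so no data is touched.
    initialize_mark<<<1, 1>>>(d_mark, 0);
    initialize_mask<<<1, 1>>>(mask, 0);
    checkCudaErrors(cudaDeviceSynchronize());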

