Interaction between Green Contexts, MPS, and GPU resource allocation for parallel kernel execution

Hi everyone,

I am working with Green Contexts combined with MPS (Multi-Process Service) to demonstrate and analyze parallel CUDA kernel execution with resource guarantees.

What surprises me is the following: when running multiple kernels in parallel—one using Green Contexts (GC) and the others without any resource control policy—I can clearly see resource guarantees for the GC kernel when MPS is enabled. However, when I repeat the experiment and assign 100% of the GPU SM resources to a single kernel using Green Contexts, the inference time for that kernel is worse than that of the kernels running without any resource allocation at all. This happens even though the entire GPU resource is assigned to that one kernel.

My question is:
How do Green Contexts actually interact with MPS when mixing GC-managed and non-managed kernels?
I would like to understand if the behavior I observe is due to a conflict or interaction between Green Contexts’ resource allocation and MPS virtualization.

Here are the inference times (in milliseconds) for the kernels in both scenarios—with and without MPS. In all cases, the GC kernel is assigned all SMs:

  • Kernel Without MPS (ms) // With MPS (ms)
  • Kernel 1 15859.6 // 5088.07
  • Kernel 2 13007.6 // 4534.94
  • Kernel 3 11910.2 // 5336.46
  • Kernel GC 4458.17 // 5363.42
  • Kernel 5 9116.33 // 5267.52
  • Kernel 6 9957.15 // 5266.71
  • Kernel 7 12589.4 // 5007.74
  • Kernel 8 15168.6 // 5095.82

I attach the code that I have been using for testing. Thank you very much.

#include <iostream>
#include <cuda_runtime.h>
#include <cmath>
#include <cuda.h>
#include <vector>
#include <iomanip>   // for std::put_time
#include <chrono>    // for std::chrono
#include <ctime>     // for std::time_t
#include <nvToolsExt.h>

#define GREEN_TEXT "\033[32m"
#define RESET_TEXT "\033[0m"

// GPU kernel with heavy computations: each element is run through many
// iterations of multiply-add, sinf and sqrtf so the kernel is compute-bound
// and long-running, which makes SM-partitioning effects visible.
//
// Grid-stride loop: correctness no longer depends on the launch covering n
// with exactly one thread per element, so any <<<blocks, threads>>>
// configuration (including a single block for debugging) still processes
// all n elements. For the launches in this file (n == gridDim*blockDim)
// the behavior is identical to the one-thread-per-element version.
__global__ void heavyKernel(float *data, int n, int iterations) {
    int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
        float x = data[idx];
        for (int i = 0; i < iterations; ++i) {
            x = x * 1.0000001f + 0.0000001f; // slight drift so the loop can't fold away
            x = sinf(x);
            x = sqrtf(fabsf(x));             // keep x bounded and non-negative
        }
        data[idx] = x;
    }
}

// Macro for CUDA Runtime API error checking and success message.
// On failure, prints the error and returns it from the enclosing function
// (so it must be used inside a function whose return type accepts cudaError_t).
#define CUDA_RT(call)                                                   \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error in file '%s' at line %i: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_));      \
            return err_;                                                \
        } else {                                                        \
            printf("CUDA Runtime call at %s:%d succeeded.\n", __FILE__, __LINE__); \
        }                                                               \
    } while (0)

// Macro for CUDA Driver API error checking and success message.
// On failure, prints the numeric CUresult and returns it from the enclosing
// function (use only where returning a CUresult/int is acceptable).
#define CUDA_DRV(call)                                                  \
    do {                                                                \
        CUresult status_ = (call);                                      \
        if (status_ != CUDA_SUCCESS) {                                  \
            fprintf(stderr, "CUDA error in file '%s' at line %i: %i\n", \
                    __FILE__, __LINE__, status_);                       \
            return status_;                                             \
        } else {                                                        \
            printf("CUDA Driver call at %s:%d succeeded.\n", __FILE__, __LINE__); \
        }                                                               \
    } while (0)

int main() {
    // Green-context experiment: the device SMs are split into one group of
    // `minCount` SMs (plus the remainder); one kernel runs on a stream bound
    // to that group while `num_kernels` other kernels run on ordinary streams.
    CUdevResource input;
    CUdevResource resources[2];
    CUdevResourceDesc desc[2];
    CUgreenCtx gctx[2];
    CUstream streamA;

    const int num_kernels = 8;
    unsigned int nbGroups = 1; // number of groups to create
    unsigned int minCount = 3; // minimum SM count for green context

    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        std::cerr << "Error getting device count: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    const int iterations = 10000000; // per-element work; makes each kernel run for seconds
    const int total_runs = 10;

    const int threadsPerBlock = 256;
    const int blocksPerGrid = 24;  // 24 x 256 = 6144 threads
    const int n = threadsPerBlock * blocksPerGrid; // one element per thread

    // Host/device buffers for the green-context kernel.
    float *h_data_cpu = new float[n];
    for (int i = 0; i < n; ++i) {
        h_data_cpu[i] = static_cast<float>(i) / n;
    }

    float *d_dataA;
    cudaMalloc(&d_dataA, n * sizeof(float));
    cudaMemcpy(d_dataA, h_data_cpu, n * sizeof(float), cudaMemcpyHostToDevice);

    printf("Initializing device...\n");
    CUDA_RT(cudaInitDevice(0, 0, 0));

    printf("Obtaining SM resources from device...\n");
    CUDA_DRV(cuDeviceGetDevResource((CUdevice)0, &input, CU_DEV_RESOURCE_TYPE_SM));
    printf("Total number of SMs: %u\n", input.sm.smCount);

    printf("Splitting resources: (%u SMs) for the first green context.\n", minCount);

    CUDA_DRV(
        cuDevSmResourceSplitByCount(
            &resources[0],   // first group
            &nbGroups,
            &input,
            &resources[1],   // remainder
            0,
            minCount
        )
    );
    printf("Resources split.\n");

    printf("Generating descriptors\n");
    CUDA_DRV(cuDevResourceGenerateDesc(&desc[0], &resources[0], 1));

    printf("Creating green contexts...\n");
    CUDA_DRV(cuGreenCtxCreate(&gctx[0], desc[0], (CUdevice)0, CU_GREEN_CTX_DEFAULT_STREAM));
    printf("Green context A created.\n");

    printf("Creating and associating streams to the green context\n");
    CUDA_DRV(cuGreenCtxStreamCreate(&streamA, gctx[0], CU_STREAM_NON_BLOCKING, 0));
    printf("Streams successfully created and associated\n");

    // Data, streams and events for the normal (non-green-context) kernels.
    float *h_data[num_kernels];
    float *d_data[num_kernels];
    cudaStream_t streams[num_kernels];
    cudaEvent_t start_events[num_kernels], stop_events[num_kernels];

    // Per-kernel, per-run timings. NOTE: the original declared this as
    // `times` but read it back as `tiempos`, which does not compile; one
    // name is now used throughout.
    float times[num_kernels][total_runs];

    for (int k = 0; k < num_kernels; ++k) {
        h_data[k] = new float[n];
        for (int i = 0; i < n; ++i) {
            h_data[k][i] = static_cast<float>(i) / n;
        }
        cudaMalloc(&d_data[k], n * sizeof(float));
        cudaMemcpy(d_data[k], h_data[k], n * sizeof(float), cudaMemcpyHostToDevice);
        cudaStreamCreate(&streams[k]);
        cudaEventCreate(&start_events[k]);
        cudaEventCreate(&stop_events[k]);
    }

    // Timing-overhead check: the kernel launch is commented out, so the
    // reported time is only the event record/sync overhead on streams[0].
    for (int run = 0; run < total_runs; ++run) {
        auto now_before = std::chrono::system_clock::now();
        std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);
        std::cout << "Test Kernel Start: "
                  << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                  << " (SMs: " << input.sm.smCount - minCount << ")"
                  << std::endl;

        cudaEventRecord(start_events[0], streams[0]);
        // heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[0]>>>(d_data[0], n, iterations);
        cudaEventRecord(stop_events[0], streams[0]);
        cudaStreamSynchronize(streams[0]);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start_events[0], stop_events[0]);

        auto now_after = std::chrono::system_clock::now();
        std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

        std::cout << "Kernel End (sequential): "
                  << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                  << std::endl;

        std::cout << "Kernel Time (individual): " << ms << " ms\n";
    }

    // Warm-up launch on every stream (green-context stream + normal streams).
    heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
    for (int k = 0; k < num_kernels; ++k) {
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, 10); // few iterations
    }
    cudaDeviceSynchronize();

    // Second timing-overhead pass, identical to the first.
    for (int run = 0; run < total_runs; ++run) {
        auto now_before = std::chrono::system_clock::now();
        std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);
        std::cout << "Test Kernel start: "
                  << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                  << " (SMs: " << input.sm.smCount - minCount << ")"
                  << std::endl;

        cudaEventRecord(start_events[0], streams[0]);
        // heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[0]>>>(d_data[0], n, iterations);
        cudaEventRecord(stop_events[0], streams[0]);
        cudaStreamSynchronize(streams[0]);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start_events[0], stop_events[0]);

        auto now_after = std::chrono::system_clock::now();
        std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

        std::cout << "Kernel end (sequential): "
                  << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                  << std::endl;

        std::cout << "Kernel time (individual): " << ms << " ms\n";
    }

    // Second warm-up pass, identical to the first.
    heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
    for (int k = 0; k < num_kernels; ++k) {
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, 10); // few iterations
    }
    cudaDeviceSynchronize();

    // Main experiment: in each run, kernel k==3 goes to the green-context
    // stream (streamA) and the rest to their own ordinary streams; all run
    // concurrently and are timed with per-stream events.
    for (int run = 0; run < total_runs; ++run) {
        std::cout << "---- Iteration " << run << " ----\n";

        bool use_stream_A = true;
        // NOTE(review): this extra launch runs on the legacy default stream
        // (which synchronizes with other streams) and writes d_dataA, the
        // same buffer the green-context kernel uses below — confirm this
        // interference is intentional.
        heavyKernel<<<1, threadsPerBlock>>>(d_dataA, n, iterations);

        for (int k = 0; k < num_kernels; ++k) {
            std::string label = "Kernel_" + std::to_string(run) + "_S" + std::to_string(k);
            nvtxRangeId_t range = nvtxRangeStartA(label.c_str());

            auto now_before = std::chrono::system_clock::now();
            std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);

            if (k == 3 && use_stream_A) {
                std::cout << GREEN_TEXT
                          << "Kernel start (green context, SMs: " << minCount << "): "
                          << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                          << RESET_TEXT << std::endl;

                cudaEventRecord(start_events[k], streamA);
                heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
                cudaEventRecord(stop_events[k], streamA);

                use_stream_A = false;
            } else {
                std::cout << "Kernel " << k << " start: "
                          << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                          << " (SMs: " << input.sm.smCount - minCount << ")"
                          << std::endl;

                cudaEventRecord(start_events[k], streams[k]);
                heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, iterations);
                cudaEventRecord(stop_events[k], streams[k]);
            }

            nvtxRangeEnd(range);
        }

        // Synchronize and measure times.
        for (int k = 0; k < num_kernels; ++k) {
            cudaStreamSynchronize(streams[k]);
            cudaStreamSynchronize(streamA);
            float ms = 0.0f;
            cudaEventElapsedTime(&ms, start_events[k], stop_events[k]);

            auto now_after = std::chrono::system_clock::now();
            std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

            // The green-context kernel was launched at k == 3, so label its
            // completion there (the original checked k == 0 and mislabeled
            // the output).
            if (k == 3 && !use_stream_A) {
                std::cout << GREEN_TEXT
                          << "Kernel end (green context, SMs: " << minCount << "): "
                          << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                          << RESET_TEXT << std::endl;
            } else {
                std::cout << "Kernel " << k << " end: "
                          << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                          << " (SMs: " << input.sm.smCount - minCount << ")"
                          << std::endl;
            }

            std::cout << "Kernel " << k << " time: " << ms << " ms\n";
            times[k][run] = ms;

            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                std::cerr << "CUDA error (kernel " << k << "): " << cudaGetErrorString(err) << std::endl;
                return 1;
            }
        }
    }

    // Display averages across all runs.
    for (int k = 0; k < num_kernels; ++k) {
        float sum = 0.0f;
        for (int run = 0; run < total_runs; ++run) {
            sum += times[k][run];
        }
        std::cout << "Average time Kernel " << k << ": " << (sum / total_runs) << " ms\n";
    }

    // Free memory and destroy resources.
    for (int k = 0; k < num_kernels; ++k) {
        cudaFree(d_data[k]);
        delete[] h_data[k];
        cudaEventDestroy(start_events[k]);
        cudaEventDestroy(stop_events[k]);
        cudaStreamDestroy(streams[k]);
    }

    cudaFree(d_dataA);
    delete[] h_data_cpu;

    cuStreamDestroy(streamA);
    cuGreenCtxDestroy(gctx[0]);

    return 0;