Interaction between Green Contexts, MPS, and GPU resource allocation for parallel kernel execution

Hi everyone,

I am working with Green Contexts combined with MPS (Multi-Process Service) to demonstrate and analyze parallel CUDA kernel execution with resource guarantees.

What surprises me is the following: when running multiple kernels in parallel—one using Green Contexts (GC) and the others without any resource control policy—I can clearly see resource guarantees for the GC kernel when MPS is enabled. However, when I repeat the experiment and assign 100% of the GPU SM resources to a single kernel using Green Contexts, the inference time for that kernel is worse than that of the kernels running without any resource allocation at all. This happens even though the entire GPU resource is assigned to that one kernel.

My question is:
How do Green Contexts actually interact with MPS when mixing GC-managed and non-managed kernels?
I would like to understand if the behavior I observe is due to a conflict or interaction between Green Contexts’ resource allocation and MPS virtualization.

Here are the inference times (in milliseconds) for the kernels in both scenarios—with and without MPS. In all cases, the GC kernel is assigned all SMs:

  • Kernel Without MPS (ms) // With MPS (ms)
  • Kernel 1 15859.6 // 5088.07
  • Kernel 2 13007.6 // 4534.94
  • Kernel 3 11910.2 // 5336.46
  • Kernel GC 4458.17 // 5363.42
  • Kernel 5 9116.33 // 5267.52
  • Kernel 6 9957.15 // 5266.71
  • Kernel 7 12589.4 // 5007.74
  • Kernel 8 15168.6 // 5095.82

I attach the code that I have been using for testing. Thank you very much.

#include <iostream>
#include <cuda_runtime.h>
#include <cmath>
#include <cuda.h>
#include <vector>
#include <iomanip>   // for std::put_time
#include <chrono>    // for std::chrono
#include <ctime>     // for std::time_t
#include <nvToolsExt.h>

#define GREEN_TEXT "\033[32m"
#define RESET_TEXT "\033[0m"

// GPU kernel with heavy computations: each element is run through many
// iterations of multiply-add, sinf and sqrtf so the kernel is compute-bound
// and long-running, which makes SM-partitioning effects visible.
//
// Grid-stride loop: correctness no longer depends on the launch covering n
// with exactly one thread per element, so any <<<blocks, threads>>>
// configuration (including a single block for debugging) still processes
// all n elements. For the launches in this file (n == gridDim*blockDim)
// the behavior is identical to the one-thread-per-element version.
__global__ void heavyKernel(float *data, int n, int iterations) {
    int stride = gridDim.x * blockDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride) {
        float x = data[idx];
        for (int i = 0; i < iterations; ++i) {
            x = x * 1.0000001f + 0.0000001f; // slight drift so the loop can't fold away
            x = sinf(x);
            x = sqrtf(fabsf(x));             // keep x bounded and non-negative
        }
        data[idx] = x;
    }
}

// Macro for CUDA Runtime API error checking and success message.
// On failure, prints the error and returns it from the enclosing function
// (so it must be used inside a function whose return type accepts cudaError_t).
#define CUDA_RT(call)                                                   \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error in file '%s' at line %i: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_));      \
            return err_;                                                \
        } else {                                                        \
            printf("CUDA Runtime call at %s:%d succeeded.\n", __FILE__, __LINE__); \
        }                                                               \
    } while (0)

// Macro for CUDA Driver API error checking and success message.
// On failure, prints the numeric CUresult and returns it from the enclosing
// function (use only where returning a CUresult/int is acceptable).
#define CUDA_DRV(call)                                                  \
    do {                                                                \
        CUresult status_ = (call);                                      \
        if (status_ != CUDA_SUCCESS) {                                  \
            fprintf(stderr, "CUDA error in file '%s' at line %i: %i\n", \
                    __FILE__, __LINE__, status_);                       \
            return status_;                                             \
        } else {                                                        \
            printf("CUDA Driver call at %s:%d succeeded.\n", __FILE__, __LINE__); \
        }                                                               \
    } while (0)

int main() {
    // Green-context experiment: the device SMs are split into one group of
    // `minCount` SMs (plus the remainder); one kernel runs on a stream bound
    // to that group while `num_kernels` other kernels run on ordinary streams.
    CUdevResource input;
    CUdevResource resources[2];
    CUdevResourceDesc desc[2];
    CUgreenCtx gctx[2];
    CUstream streamA;

    const int num_kernels = 8;
    unsigned int nbGroups = 1; // number of groups to create
    unsigned int minCount = 3; // minimum SM count for green context

    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        std::cerr << "Error getting device count: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    const int iterations = 10000000; // per-element work; makes each kernel run for seconds
    const int total_runs = 10;

    const int threadsPerBlock = 256;
    const int blocksPerGrid = 24;  // 24 x 256 = 6144 threads
    const int n = threadsPerBlock * blocksPerGrid; // one element per thread

    // Host/device buffers for the green-context kernel.
    float *h_data_cpu = new float[n];
    for (int i = 0; i < n; ++i) {
        h_data_cpu[i] = static_cast<float>(i) / n;
    }

    float *d_dataA;
    cudaMalloc(&d_dataA, n * sizeof(float));
    cudaMemcpy(d_dataA, h_data_cpu, n * sizeof(float), cudaMemcpyHostToDevice);

    printf("Initializing device...\n");
    CUDA_RT(cudaInitDevice(0, 0, 0));

    printf("Obtaining SM resources from device...\n");
    CUDA_DRV(cuDeviceGetDevResource((CUdevice)0, &input, CU_DEV_RESOURCE_TYPE_SM));
    printf("Total number of SMs: %u\n", input.sm.smCount);

    printf("Splitting resources: (%u SMs) for the first green context.\n", minCount);

    CUDA_DRV(
        cuDevSmResourceSplitByCount(
            &resources[0],   // first group
            &nbGroups,
            &input,
            &resources[1],   // remainder
            0,
            minCount
        )
    );
    printf("Resources split.\n");

    printf("Generating descriptors\n");
    CUDA_DRV(cuDevResourceGenerateDesc(&desc[0], &resources[0], 1));

    printf("Creating green contexts...\n");
    CUDA_DRV(cuGreenCtxCreate(&gctx[0], desc[0], (CUdevice)0, CU_GREEN_CTX_DEFAULT_STREAM));
    printf("Green context A created.\n");

    printf("Creating and associating streams to the green context\n");
    CUDA_DRV(cuGreenCtxStreamCreate(&streamA, gctx[0], CU_STREAM_NON_BLOCKING, 0));
    printf("Streams successfully created and associated\n");

    // Data, streams and events for the normal (non-green-context) kernels.
    float *h_data[num_kernels];
    float *d_data[num_kernels];
    cudaStream_t streams[num_kernels];
    cudaEvent_t start_events[num_kernels], stop_events[num_kernels];

    // Per-kernel, per-run timings. NOTE: the original declared this as
    // `times` but read it back as `tiempos`, which does not compile; one
    // name is now used throughout.
    float times[num_kernels][total_runs];

    for (int k = 0; k < num_kernels; ++k) {
        h_data[k] = new float[n];
        for (int i = 0; i < n; ++i) {
            h_data[k][i] = static_cast<float>(i) / n;
        }
        cudaMalloc(&d_data[k], n * sizeof(float));
        cudaMemcpy(d_data[k], h_data[k], n * sizeof(float), cudaMemcpyHostToDevice);
        cudaStreamCreate(&streams[k]);
        cudaEventCreate(&start_events[k]);
        cudaEventCreate(&stop_events[k]);
    }

    // Timing-overhead check: the kernel launch is commented out, so the
    // reported time is only the event record/sync overhead on streams[0].
    for (int run = 0; run < total_runs; ++run) {
        auto now_before = std::chrono::system_clock::now();
        std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);
        std::cout << "Test Kernel Start: "
                  << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                  << " (SMs: " << input.sm.smCount - minCount << ")"
                  << std::endl;

        cudaEventRecord(start_events[0], streams[0]);
        // heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[0]>>>(d_data[0], n, iterations);
        cudaEventRecord(stop_events[0], streams[0]);
        cudaStreamSynchronize(streams[0]);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start_events[0], stop_events[0]);

        auto now_after = std::chrono::system_clock::now();
        std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

        std::cout << "Kernel End (sequential): "
                  << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                  << std::endl;

        std::cout << "Kernel Time (individual): " << ms << " ms\n";
    }

    // Warm-up launch on every stream (green-context stream + normal streams).
    heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
    for (int k = 0; k < num_kernels; ++k) {
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, 10); // few iterations
    }
    cudaDeviceSynchronize();

    // Second timing-overhead pass, identical to the first.
    for (int run = 0; run < total_runs; ++run) {
        auto now_before = std::chrono::system_clock::now();
        std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);
        std::cout << "Test Kernel start: "
                  << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                  << " (SMs: " << input.sm.smCount - minCount << ")"
                  << std::endl;

        cudaEventRecord(start_events[0], streams[0]);
        // heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[0]>>>(d_data[0], n, iterations);
        cudaEventRecord(stop_events[0], streams[0]);
        cudaStreamSynchronize(streams[0]);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start_events[0], stop_events[0]);

        auto now_after = std::chrono::system_clock::now();
        std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

        std::cout << "Kernel end (sequential): "
                  << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                  << std::endl;

        std::cout << "Kernel time (individual): " << ms << " ms\n";
    }

    // Second warm-up pass, identical to the first.
    heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
    for (int k = 0; k < num_kernels; ++k) {
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, 10); // few iterations
    }
    cudaDeviceSynchronize();

    // Main experiment: in each run, kernel k==3 goes to the green-context
    // stream (streamA) and the rest to their own ordinary streams; all run
    // concurrently and are timed with per-stream events.
    for (int run = 0; run < total_runs; ++run) {
        std::cout << "---- Iteration " << run << " ----\n";

        bool use_stream_A = true;
        // NOTE(review): this extra launch runs on the legacy default stream
        // (which synchronizes with other streams) and writes d_dataA, the
        // same buffer the green-context kernel uses below — confirm this
        // interference is intentional.
        heavyKernel<<<1, threadsPerBlock>>>(d_dataA, n, iterations);

        for (int k = 0; k < num_kernels; ++k) {
            std::string label = "Kernel_" + std::to_string(run) + "_S" + std::to_string(k);
            nvtxRangeId_t range = nvtxRangeStartA(label.c_str());

            auto now_before = std::chrono::system_clock::now();
            std::time_t now_c_before = std::chrono::system_clock::to_time_t(now_before);

            if (k == 3 && use_stream_A) {
                std::cout << GREEN_TEXT
                          << "Kernel start (green context, SMs: " << minCount << "): "
                          << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                          << RESET_TEXT << std::endl;

                cudaEventRecord(start_events[k], streamA);
                heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_dataA, n, iterations);
                cudaEventRecord(stop_events[k], streamA);

                use_stream_A = false;
            } else {
                std::cout << "Kernel " << k << " start: "
                          << std::put_time(std::localtime(&now_c_before), "%H:%M:%S")
                          << " (SMs: " << input.sm.smCount - minCount << ")"
                          << std::endl;

                cudaEventRecord(start_events[k], streams[k]);
                heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, streams[k]>>>(d_data[k], n, iterations);
                cudaEventRecord(stop_events[k], streams[k]);
            }

            nvtxRangeEnd(range);
        }

        // Synchronize and measure times.
        for (int k = 0; k < num_kernels; ++k) {
            cudaStreamSynchronize(streams[k]);
            cudaStreamSynchronize(streamA);
            float ms = 0.0f;
            cudaEventElapsedTime(&ms, start_events[k], stop_events[k]);

            auto now_after = std::chrono::system_clock::now();
            std::time_t now_c_after = std::chrono::system_clock::to_time_t(now_after);

            // The green-context kernel was launched at k == 3, so label its
            // completion there (the original checked k == 0 and mislabeled
            // the output).
            if (k == 3 && !use_stream_A) {
                std::cout << GREEN_TEXT
                          << "Kernel end (green context, SMs: " << minCount << "): "
                          << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                          << RESET_TEXT << std::endl;
            } else {
                std::cout << "Kernel " << k << " end: "
                          << std::put_time(std::localtime(&now_c_after), "%H:%M:%S")
                          << " (SMs: " << input.sm.smCount - minCount << ")"
                          << std::endl;
            }

            std::cout << "Kernel " << k << " time: " << ms << " ms\n";
            times[k][run] = ms;

            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                std::cerr << "CUDA error (kernel " << k << "): " << cudaGetErrorString(err) << std::endl;
                return 1;
            }
        }
    }

    // Display averages across all runs.
    for (int k = 0; k < num_kernels; ++k) {
        float sum = 0.0f;
        for (int run = 0; run < total_runs; ++run) {
            sum += times[k][run];
        }
        std::cout << "Average time Kernel " << k << ": " << (sum / total_runs) << " ms\n";
    }

    // Free memory and destroy resources.
    for (int k = 0; k < num_kernels; ++k) {
        cudaFree(d_data[k]);
        delete[] h_data[k];
        cudaEventDestroy(start_events[k]);
        cudaEventDestroy(stop_events[k]);
        cudaStreamDestroy(streams[k]);
    }

    cudaFree(d_dataA);
    delete[] h_data_cpu;

    cuStreamDestroy(streamA);
    cuGreenCtxDestroy(gctx[0]);

    return 0;