Hi everyone,
I’m experimenting with CUDA Green Contexts to assign a specific number of SMs to each context and run different kernels in parallel. However, I’m observing that regardless of how many SMs I allocate to each Green Context (via cuDevSmResourceSplitByCount), the execution time for the kernel remains essentially the same in both contexts.
Below is a simplified version of my code. I’m launching a computationally heavy kernel using two different Green Contexts, each associated with its own stream. I allocate only minCount = 1 SM to one of the contexts, while the other context gets the remaining SMs. I expected the kernel in the smaller context to take noticeably longer, but both timings are almost identical.
Is there something I’m misunderstanding about how Green Contexts work, or is there an additional step required to enforce the SM limits?
Any insights would be greatly appreciated!
Thanks in advance!
🖥️ System Info:
- GPU: NVIDIA Orin (nvgpu)
- CUDA Version: 12.6
- Driver Version: 540.4.0
- OS: Ubuntu 20.04
- Compiler:
nvcc from CUDA 12.6
Executed code
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>
#include <cuda.h>
#include <vector>
// Artificial compute-heavy kernel used as a timing workload.
// Each thread owns one element of `data` and applies the same
// transcendental chain `iterations` times. Launch layout: 1D grid,
// 1D blocks; threads past `n` do nothing (bounds guard below).
__global__ void heavyKernel(float *data, int n, int iterations) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float v = data[idx];
        for (int iter = 0; iter < iterations; ++iter) {
            v = v * 1.0000001f + 0.0000001f;
            v = sinf(v);
            v = sqrtf(fabsf(v));
        }
        data[idx] = v;
    }
}
// CUDA_RT: wraps a CUDA Runtime API call. On failure it prints the error
// string with file/line and does `return _err;` — i.e. it returns the
// cudaError_t FROM THE ENCLOSING FUNCTION, so it may only be used inside
// a function whose return type accepts a cudaError_t (here: main, via
// implicit conversion to int). On success it prints a trace line for
// every call, which is intentionally verbose for step-by-step debugging.
// NOTE: comments are kept outside the macro body on purpose — a `//`
// comment before a trailing `\` would swallow the line continuation.
#define CUDA_RT(call) \
do { \
cudaError_t _err = (call); \
if ( cudaSuccess != _err ) { \
fprintf(stderr, "CUDA error in file '%s' at line %i: %s\n", \
__FILE__, __LINE__, cudaGetErrorString(_err)); \
return _err; \
} else { \
printf("CUDA Runtime call at %s:%d succeeded.\n", __FILE__, __LINE__); \
} \
} while (0)
// CUDA_DRV: wraps a CUDA Driver API call; mirrors CUDA_RT above but for
// CUresult. On failure it prints the raw numeric status with file/line
// and returns it from the enclosing function. NOTE(review): printing the
// integer status is hard to read — cuGetErrorString(_status, &str) would
// give a human-readable message; left unchanged here since this edit only
// adds documentation. Verbose success prints are intentional tracing.
#define CUDA_DRV(call) \
do { \
CUresult _status = (call); \
if ( CUDA_SUCCESS != _status) { \
fprintf(stderr, "CUDA error in file '%s' at line %i: %i\n", \
__FILE__, __LINE__, _status); \
return _status; \
} else { \
printf("CUDA Driver call at %s:%d succeeded.\n", __FILE__, __LINE__); \
} \
} while (0)
// Experiment driver:
//   1) time heavyKernel alone on the whole device (baseline),
//   2) split the device SMs into a 1-SM group and a remainder group,
//      create one green context + stream per group, and time the same
//      kernel launched concurrently on both streams.
//
// BUG FIXED vs. the original: the original freed d_data/h_data and the
// baseline events right after the baseline experiment, so the green-context
// kernels launched later read ALREADY-FREED device memory. All cleanup now
// happens at the end.
//
// NOTE(review): in step (2) both kernels are launched back-to-back and run
// concurrently, so each one's event-measured span includes contention with
// the other. To see the effect of the SM split cleanly, also time each
// stream's kernel in isolation. Additionally, green-context SM partitioning
// behavior on integrated GPUs (Orin/nvgpu) may differ from discrete GPUs —
// verify against the Green Contexts section of the CUDA driver API docs.
int main() {
    // ---- Workload parameters --------------------------------------------
    const int n = 1 << 20;             // number of float elements (~1M)
    const int iterations = 100000;     // per-thread loop count in heavyKernel
    const int total_runs = 10;         // timed repetitions per experiment
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceil-div

    // ---- Green-context bookkeeping --------------------------------------
    CUdevResource input;               // all SMs of device 0
    CUdevResource resources[2];        // [0] = split-off group, [1] = remainder
    CUdevResourceDesc desc[2];
    CUgreenCtx gctx[2];
    CUstream streamA, streamB;
    unsigned int nbGroups = 1;         // request exactly one group from the split
    unsigned int minCount = 1;         // minimum SM count for the first group

    int deviceCount = 0;
    CUDA_RT(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "No CUDA device available.\n");
        return 1;
    }

    // ---- Host/device buffers (kept alive for BOTH experiments) ----------
    float *h_data = new float[n];
    for (int i = 0; i < n; ++i) {
        h_data[i] = static_cast<float>(i) / n;
    }
    float *d_data = nullptr;
    CUDA_RT(cudaMalloc(&d_data, n * sizeof(float)));
    CUDA_RT(cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice));

    // ---- Experiment 1: baseline, whole device ---------------------------
    float tiempos[total_runs];
    cudaEvent_t start, stop;
    CUDA_RT(cudaEventCreate(&start));
    CUDA_RT(cudaEventCreate(&stop));

    // Warm-up launch so the first timed run does not pay one-time costs.
    heavyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, n, iterations);
    CUDA_RT(cudaGetLastError());

    for (int i = 0; i < total_runs; ++i) {
        std::cout << "Launching kernel " << i << "...\n";
        CUDA_RT(cudaEventRecord(start));
        heavyKernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, n, iterations);
        CUDA_RT(cudaEventRecord(stop));
        CUDA_RT(cudaEventSynchronize(stop));
        CUDA_RT(cudaGetLastError());     // catches launch-config/runtime errors
        float ms = 0.0f;
        CUDA_RT(cudaEventElapsedTime(&ms, start, stop));
        tiempos[i] = ms;
    }
    CUDA_RT(cudaMemcpy(h_data, d_data, n * sizeof(float), cudaMemcpyDeviceToHost));
    float sum = 0.0f;
    for (int i = 0; i < total_runs; ++i) sum += tiempos[i];
    std::cout << "Average kernel time: " << (sum / total_runs) << " ms\n";

    // ---- Experiment 2: green contexts -----------------------------------
    printf("Initializing device...\n");
    CUDA_RT(cudaInitDevice(0, 0, 0));
    printf("Getting SM resources from device...\n");
    CUDA_DRV(cuDeviceGetDevResource((CUdevice)0, &input, CU_DEV_RESOURCE_TYPE_SM));
    printf("Total number of SMs: %u\n", input.sm.smCount);
    printf("Dividing resources: (%u SMs) for the first green context.\n", minCount);
    CUDA_DRV(
        cuDevSmResourceSplitByCount(
            &resources[0],   // output: the split-off group(s)
            &nbGroups,       // in/out: number of groups requested/created
            &input,          // input resource: all SMs of the device
            &resources[1],   // output: remaining SMs not placed in a group
            0,               // flags
            minCount         // minimum SMs per created group
            // NOTE(review): the driver may round the group size up to the
            // hardware's minimum partition granularity — print
            // resources[0].sm.smCount to confirm the split actually gave 1 SM.
        )
    );
    printf("Resources divided.\n");
    printf("Generating descriptors\n");
    CUDA_DRV(cuDevResourceGenerateDesc(&desc[0], &resources[0], 1));
    printf("Creating green contexts...\n");
    CUDA_DRV(cuGreenCtxCreate(&gctx[0], desc[0], (CUdevice)0, CU_GREEN_CTX_DEFAULT_STREAM));
    printf("Green context A created.\n");
    CUDA_DRV(cuDevResourceGenerateDesc(&desc[1], &resources[1], 1));
    CUDA_DRV(cuGreenCtxCreate(&gctx[1], desc[1], (CUdevice)0, CU_GREEN_CTX_DEFAULT_STREAM));
    printf("Green context B created.\n");
    printf("Creating and associating the streams to the GC\n");
    CUDA_DRV(cuGreenCtxStreamCreate(&streamA, gctx[0], CU_STREAM_NON_BLOCKING, 0));
    CUDA_DRV(cuGreenCtxStreamCreate(&streamB, gctx[1], CU_STREAM_NON_BLOCKING, 0));
    printf("Successfully done\n");

    float tiemposA[total_runs];
    float tiemposB[total_runs];
    cudaEvent_t startA, stopA, startB, stopB;
    CUDA_RT(cudaEventCreate(&startA));
    CUDA_RT(cudaEventCreate(&stopA));
    CUDA_RT(cudaEventCreate(&startB));
    CUDA_RT(cudaEventCreate(&stopB));

    for (int i = 0; i < total_runs; i++) {
        printf("Launching kernel %d...\n", i);
        // Kernel in streamA (green context with minCount SMs).
        CUDA_RT(cudaEventRecord(startA, (cudaStream_t)streamA));
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamA>>>(d_data, n, iterations);
        CUDA_RT(cudaEventRecord(stopA, (cudaStream_t)streamA));
        // Kernel in streamB (green context with the remaining SMs).
        // NOTE(review): A and B run concurrently, so each measurement
        // includes contention with the other kernel.
        CUDA_RT(cudaEventRecord(startB, (cudaStream_t)streamB));
        heavyKernel<<<blocksPerGrid, threadsPerBlock, 0, (cudaStream_t)streamB>>>(d_data, n, iterations);
        CUDA_RT(cudaEventRecord(stopB, (cudaStream_t)streamB));
        // Wait for both streams before reading the timings.
        CUDA_RT(cudaEventSynchronize(stopA));
        CUDA_RT(cudaEventSynchronize(stopB));
        float msA = 0.0f, msB = 0.0f;
        CUDA_RT(cudaEventElapsedTime(&msA, startA, stopA));
        CUDA_RT(cudaEventElapsedTime(&msB, startB, stopB));
        tiemposA[i] = msA;
        tiemposB[i] = msB;
        CUDA_RT(cudaGetLastError());
    }

    // Print averages.
    float sumA = 0.0f, sumB = 0.0f;
    for (int i = 0; i < total_runs; ++i) {
        sumA += tiemposA[i];
        sumB += tiemposB[i];
    }
    std::cout << "Average time for kernel A: " << (sumA / total_runs) << " ms\n";
    std::cout << "Average time for kernel B: " << (sumB / total_runs) << " ms\n";

    // ---- Cleanup (moved here from before experiment 2: the original code
    // freed d_data/h_data before the green-context launches used them) -----
    CUDA_RT(cudaEventDestroy(start));
    CUDA_RT(cudaEventDestroy(stop));
    CUDA_RT(cudaEventDestroy(startA));
    CUDA_RT(cudaEventDestroy(stopA));
    CUDA_RT(cudaEventDestroy(startB));
    CUDA_RT(cudaEventDestroy(stopB));
    CUDA_DRV(cuStreamDestroy(streamA));
    CUDA_DRV(cuStreamDestroy(streamB));
    CUDA_DRV(cuGreenCtxDestroy(gctx[0]));
    CUDA_DRV(cuGreenCtxDestroy(gctx[1]));
    CUDA_RT(cudaFree(d_data));
    delete[] h_data;
    return 0;
}
Execution results
Average kernel time: 4482.25 ms
Initializing device…
CUDA Runtime call at expetimento_ingles.cu:124 succeeded.
Getting SM resources from device…
CUDA Driver call at expetimento_ingles.cu:128 succeeded.
Total number of SMs: 8
Splitting resources: (1 SMs) for the first green context.
CUDA Driver call at expetimento_ingles.cu:133 succeeded.
Resources split.
Generating descriptors
CUDA Driver call at expetimento_ingles.cu:147 succeeded.
Creating green contexts…
CUDA Driver call at expetimento_ingles.cu:151 succeeded.
Green context A created.
CUDA Driver call at expetimento_ingles.cu:154 succeeded.
CUDA Driver call at expetimento_ingles.cu:155 succeeded.
Green context B created.
Creating and binding streams to GCs
CUDA Driver call at expetimento_ingles.cu:160 succeeded.
CUDA Driver call at expetimento_ingles.cu:161 succeeded.
Completed successfully
Average kernel time (Context A): 8962.44 ms
Average kernel time (Context B): 8962.4 ms