Stream created by Green Context is not compatible with cudaMemcpyAsync

  float* tensor_device;
  float* tensor_host;
  printf("Testing the compatibility of CUDA streams with green context and CUDA Async Memcopy\n");
  CUDA_RT(cudaMalloc(&tensor_device, 1000 * 512 * sizeof(float)));
  CUDA_RT(cudaMallocHost(&tensor_host, 1000 * 512 * sizeof(float)));
  CUDA_RT(
      cudaMemcpy(tensor_device, tensor_host, 1000 * 512 * sizeof(float), cudaMemcpyHostToDevice));

  CUDA_RT(cudaMemcpyAsync(tensor_device, tensor_host, 1000 * 512 * sizeof(float),
                          cudaMemcpyHostToDevice, (cudaStream_t)streamA));

  CUDA_RT(cudaStreamSynchronize((cudaStream_t)streamA));

The above code generates a invalid device context error when calling cudaMemcpyAsync.

Could you post a more complete example including how streamA was created or any green context calls, which make the difference, please?

+1 ^^

The driver API (where green contexts are relevant) defines contexts which are not interoperable with the runtime API, unless the driver API is using the context that the runtime API uses by default (the so-called “primary” context). More specifically, CUDA entities that have a “context” or “device” dependency (including streams) are not usable/relevant except on the context (or device in the runtime API world) that they have an association to. Therefore your observation is probably expected. This isn’t really unique or specific to green contexts, but pertains to any stream created in the driver API when the the current context is not the primary context. In the driver API, typically the current context is not the primary context unless you do something explicitly like cuDevicePrimaryCtxRetain, or similar.

I think you are wrong. Green context is a lightweight context. The streams created though green contexts should be able to use directly without switching primary context. You can see how to use it here
Fortunately, after I upgrade my CUDA from 12.6 to 12.8, this issue has been fixed.

#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>
#include <cassert>
#include <cstdint>
#include <sys/time.h>

#define CUDA_RT(call)                                                                 \
  do {                                                                                \
    cudaError_t _err = (call);                                                        \
    if (cudaSuccess != _err) {                                                        \
      fprintf(stderr, "CUDA error in file '%s' at line %i: %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(_err));                                              \
      return _err;                                                                    \
    }                                                                                 \
  } while (0)

#define CUDA_DRV(call)                                                                          \
  do {                                                                                          \
    CUresult _status = (call);                                                                  \
    if (CUDA_SUCCESS != _status) {                                                              \
      fprintf(stderr, "CUDA error in file '%s' at line %i: %i\n", __FILE__, __LINE__, _status); \
      return _status;                                                                           \
    }                                                                                           \
  } while (0)


int main() {
  CUgreenCtx gctx[2];
  CUdevResourceDesc desc[2];
  CUdevResource input;
  CUdevResource resources[2];
  CUstream streamA;
  CUstream streamB;

  unsigned int nbGroups = 1;
  unsigned int minCount = 0;

  // Initialize device 0
  CUDA_RT(cudaInitDevice(0, 0, 0));

  // Query input SMs
  CUDA_DRV(cuDeviceGetDevResource((CUdevice)0, &input, CU_DEV_RESOURCE_TYPE_SM));
  // We want 3/4 the device for our green context
  minCount = (unsigned int)((float)input.sm.smCount * 0.4f);

  // Split my resources
  CUDA_DRV(cuDevSmResourceSplitByCount(&resources[0], &nbGroups, &input, &resources[1], 0,
                                       input.sm.smCount));
  CUDA_DRV(cuDevResourceGenerateDesc(&desc[0], &resources[0], 1));
  CUDA_DRV(cuGreenCtxCreate(&gctx[0], desc[0], (CUdevice)0, CU_GREEN_CTX_DEFAULT_STREAM));
  CUDA_DRV(cuGreenCtxStreamCreate(&streamA, gctx[0], CU_STREAM_NON_BLOCKING, 0));

  float* tensor_device;
  float* tensor_host;
  printf("Testing the compatibility of CUDA streams with green context and CUDA Async Memcopy\n");
  CUDA_RT(cudaMalloc(&tensor_device, 1000 * 512 * sizeof(float)));
  CUDA_RT(cudaMallocHost(&tensor_host, 1000 * 512 * sizeof(float)));
  CUDA_RT(
      cudaMemcpy(tensor_device, tensor_host, 1000 * 512 * sizeof(float), cudaMemcpyHostToDevice));

  CUDA_RT(cudaMemcpyAsync(tensor_device, tensor_host, 1000 * 512 * sizeof(float),
                          cudaMemcpyHostToDevice, (cudaStream_t)streamA));

  CUDA_RT(cudaStreamSynchronize((cudaStream_t)streamA));

  return (0);
}

The above is an complete example. The issue seems to be fixed in CUDA-12-8. Previously I was using CUDA-12-6.