Error propagation between different threads using different contexts

I want to use different threads to launch cuda kernel in single device. Each thread will bind its own cuda context like this:

// Control kernel: does no work and no illegal accesses, so it should
// complete without any device-side error.
__global__ void k2() {
    // correct kernel
}

// Faulty kernel: deliberately writes through a NULL pointer to trigger an
// illegal-address fault (error 700) on the device.
__global__ void k1() {
    // wrong kernel
    float *err= NULL;
    err[1] = 1;   // intentional illegal write through NULL — do not "fix"
}

// Thread entry (pseudo-code): creates and binds a driver-API context for this
// thread, then uses runtime-API calls. NOTE(review, per the answer below):
// the runtime API operates on the device's primary context, not on `ctx`.
void *kernel1(void* arg) {
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);              // per-thread driver-API context
    cuCtxSetCurrent(ctx);
    cudaMalloc();                         // runtime API (placeholder arguments)
    k1<<<>>>();                           // launches the faulting kernel (placeholder config)
    int ret = cudaDeviceSynchronize();    // async kernel fault surfaces here
    cout << ret << endl;
}

// Thread entry (pseudo-code) for the correct kernel k2; mirrors kernel1 so
// the only difference between the two threads is which kernel they launch.
void *kernel2(void* arg) {
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);              // per-thread driver-API context
    cuCtxSetCurrent(ctx);
    cudaMalloc();                         // runtime API (placeholder arguments)
    k2<<<>>>();                           // launches the correct kernel (placeholder config)
    int ret = cudaDeviceSynchronize();
    cout << ret << endl;
}


// Runs kernel1 and kernel2 concurrently, one pthread each, and waits for both.
int main() {
    pthread_t p1, p2;
    pthread_create(&p1, NULL, kernel1, NULL);
    pthread_create(&p2, NULL, kernel2, NULL);

    pthread_join(p1, NULL);
    pthread_join(p2, NULL);

    return 0;
}

The output is that both kernels return error 700, and the results of both kernels are wrong.

So why, when two threads use different CUDA contexts on a single device, does the error still propagate from the faulty kernel to the others? And if I use multiple threads and want to locate which kernel went wrong, how can I do that?

The runtime API, which you are using, creates/uses its own context (the so-called primary context), not the one(s) you create via the driver API.

If you use strictly the driver API (e.g. for allocations and to launch kernels), it should be possible to isolate errors to specific created contexts.

It is possible to use a runtime-API context (the primary context) in driver API code, via cuDevicePrimaryCtxRetain(). The reverse (getting the runtime API to use a context you create via the driver API) is not possible, AFAIK.

So use cuMemAlloc instead of cudaMalloc, launch kernels with cuLaunchKernel, etc.

// Instrumented variant (pseudo-code): prints the context set via the driver
// API and the current context queried after cudaMalloc(), to check whether
// the runtime API switched contexts. NOTE(review): the two being equal does
// not prove the runtime used `ctx` — cuCtxGetCurrent only reports the
// driver-API current context, not the one the runtime operated on.
void *kernel1(void* arg) {
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);
    cuCtxSetCurrent(ctx);
    printf("kernel1 set context %p\n", ctx);
    cudaMalloc();                         // runtime API (placeholder arguments)
    CUcontext c;
    cuCtxGetCurrent(&c);                  // query driver-API current context
    printf("kernel1 get context %p\n", c);
    k1<<<>>>();                           // faulting kernel (placeholder config)
    int ret = cudaDeviceSynchronize();
    cout << ret << endl;
}

// Instrumented variant (pseudo-code) for the correct kernel k2; same
// context-query instrumentation as kernel1 for comparison.
void *kernel2(void* arg) {
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);
    cuCtxSetCurrent(ctx);
    printf("kernel2 set context %p\n", ctx);
    cudaMalloc();                         // runtime API (placeholder arguments)
    CUcontext c;
    cuCtxGetCurrent(&c);                  // query driver-API current context
    printf("kernel2 get context %p\n", c);
    k2<<<>>>();                           // correct kernel (placeholder config)
    int ret = cudaDeviceSynchronize();
    cout << ret << endl;
}
  1. I get the current context after cudaMalloc(); it is equal to the one set at the beginning.
  2. I get an error from dmesg. Is it related to this error? If this error resets the GPU hardware so that every process or thread on this GPU fails, it would all make sense.
NVRM: Xid (PCI:0000:65:01): 31, pid=4046245, name=m, channel 0x00000009, intr 00000000. MMU Fault: ENGINE GRAPHICS GPC2 GPCCLIENT_T1_2 faulted @ 0x0_00000000. Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE

That doesn’t matter. You’re querying the driver API context you created. It’s not the one being used by your kernel launch or cudaMalloc operation(s).

Yes, when you dereference an invalid pointer, or otherwise do something illegal in device code, it will corrupt the context and also (generally) issue an Xid error.

Just corrupts the context? That would mean the error in thread k1 should not propagate to the other thread.

I have changed the code to use the driver API.

success.cu

// Naive dense matrix multiply: each thread computes one element of the
// size x size product C = A * B (row-major). Launch with a 2D grid of 2D
// blocks covering at least size x size threads; out-of-range threads exit.
extern "C" __global__ void matrixMultiplication(float* A, float* B, float* C, int size) {
    const int r = blockDim.y * blockIdx.y + threadIdx.y;
    const int c = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: the grid may overshoot the matrix edge.
    if (r >= size || c >= size)
        return;

    float acc = 0.0f;
    for (int i = 0; i < size; ++i)
        acc += A[r * size + i] * B[i * size + c];
    C[r * size + c] = acc;
}

error.cu

// Deliberately faulty variant of matrixMultiplication: after computing the
// correct result, each in-range thread also writes through a NULL pointer,
// provoking an MMU fault (the Xid 31 seen in dmesg) that corrupts the
// context the kernel runs in. The fault is the point of the demo — keep it.
extern "C" __global__ void matrixMultiplication_err(float* A, float* B, float* C, int size) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float *err = NULL;   // NULL on purpose — see the write below
    
    if (row < size && col < size) {
        float sum = 0.0f;
        for (int k = 0; k < size; k++) {
            sum += A[row * size + k] * B[k * size + col];
        }
        C[row * size + col] = sum;
        // error
        err[1] = sum;    // intentional illegal write (address NULL + 4 bytes)
    }
}

mat.cpp

#include <cstdio>
#include <iostream>
#include <pthread.h>
#include <unistd.h>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;

void *kernel1(void* arg) {
    cuInit(0);
    CUresult result;
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);
    cuCtxSetCurrent(ctx);
    printf("kernel1 set context %p\n", ctx);

    CUmodule module;
    CUfunction kernel;
    cuModuleLoad(&module, "matrixMultiplication_err.ptx");
    cuModuleGetFunction(&kernel, module, "matrixMultiplication_err");

    long long int size = 4096 * 4;
    int numThreads = 16;
    int numBlocks = (size + numThreads - 1) / numThreads;
    cout << "threads: " << numThreads << ",blocks: " << numBlocks << endl;

    CUstream stream;
    cuStreamCreate(&stream, 0);

    float* d_A, *d_B, *d_C;
    cuMemAlloc((CUdeviceptr*)&d_A, size * size * sizeof(float));
    cuMemAlloc((CUdeviceptr*)&d_B, size * size * sizeof(float));
    cuMemAlloc((CUdeviceptr*)&d_C, size * size * sizeof(float));

    CUcontext c;
    cuCtxGetCurrent(&c);
    printf("kernel1 get context %p\n", c);

    // 初始化输入矩阵 A 和 B
    float* h_A = new float[size * size];
    float* h_B = new float[size * size];
    for (int i = 0; i < size * size; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }
    
    cuMemcpyHtoD((CUdeviceptr)d_A, h_A, size * size * sizeof(float));
    cuMemcpyHtoD((CUdeviceptr)d_B, h_B, size * size * sizeof(float));

    void *args[] = {&d_A, &d_B, &d_C, &size};
    cuLaunchKernel(kernel, numBlocks, numBlocks, 1, numThreads, numThreads, 1, 0, stream, args, NULL);
    cuCtxSynchronize();

    float* h_C = new float[size * size];
    cuMemcpyDtoH(h_C, (CUdeviceptr)d_C, size * size * sizeof(float));
    
    std::cout << "Result:" << std::endl;
    for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 10; j++) {
            std::cout << h_C[i * size + j] << " ";
        }
        std::cout << std::endl;
    }

    cuMemFree((CUdeviceptr)d_A);
    cuMemFree((CUdeviceptr)d_B);
    cuMemFree((CUdeviceptr)d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    return NULL;
}

void *kernel2(void* arg) {
    cuInit(0);
    CUresult result;
    CUcontext ctx;
    cuCtxCreate(&ctx, 0, 0);
    cuCtxSetCurrent(ctx);
    printf("kernel2 set context %p\n", ctx);

    CUmodule module;
    CUfunction kernel;
    cuModuleLoad(&module, "matrixMultiplication.ptx");
    cuModuleGetFunction(&kernel, module, "matrixMultiplication");

    long long int size = 4096 * 4;
    int numThreads = 16;
    int numBlocks = (size + numThreads - 1) / numThreads;
    cout << "threads: " << numThreads << ",blocks: " << numBlocks << endl;

    CUstream stream;
    cuStreamCreate(&stream, 0);

    float* d_A, *d_B, *d_C;
    cuMemAlloc((CUdeviceptr*)&d_A, size * size * sizeof(float));
    cuMemAlloc((CUdeviceptr*)&d_B, size * size * sizeof(float));
    cuMemAlloc((CUdeviceptr*)&d_C, size * size * sizeof(float));

    CUcontext c;
    cuCtxGetCurrent(&c);
    printf("kernel2 get context %p\n", c);

    float* h_A = new float[size * size];
    float* h_B = new float[size * size];
    for (int i = 0; i < size * size; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }
    
    cuMemcpyHtoD((CUdeviceptr)d_A, h_A, size * size * sizeof(float));
    cuMemcpyHtoD((CUdeviceptr)d_B, h_B, size * size * sizeof(float));

    void *args[] = {&d_A, &d_B, &d_C, &size};
    cuLaunchKernel(kernel, numBlocks, numBlocks, 1, numThreads, numThreads, 1, 0, stream, args, NULL);
    cuCtxSynchronize();

    float* h_C = new float[size * size];
    cuMemcpyDtoH(h_C, (CUdeviceptr)d_C, size * size * sizeof(float));

    std::cout << "Result:" << std::endl;
    for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 10; j++) {
            std::cout << h_C[i * size + j] << " ";
        }
        std::cout << std::endl;
    }

    cuMemFree((CUdeviceptr)d_A);
    cuMemFree((CUdeviceptr)d_B);
    cuMemFree((CUdeviceptr)d_C);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;

    return NULL;
}

// Spawns one worker thread per kernel and waits for both. Each worker owns
// its own CUDA context, so in principle each should report its own status.
// Fix vs. the original: pthread_create return values are checked — joining
// a pthread_t that was never successfully created is undefined behavior.
int main() {
    pthread_t p1, p2;

    if (pthread_create(&p1, NULL, kernel1, NULL) != 0) {
        std::cerr << "failed to create thread for kernel1" << std::endl;
        return 1;
    }
    if (pthread_create(&p2, NULL, kernel2, NULL) != 0) {
        std::cerr << "failed to create thread for kernel2" << std::endl;
        pthread_join(p1, NULL);   // don't leak the first thread
        return 1;
    }

    pthread_join(p1, NULL);
    pthread_join(p2, NULL);

    return 0;
}

I still get the same error: both thread1 and thread2 return the wrong result, and dmesg reports NVRM: Xid (PCI:0000:65:01): 31, pid=230563, name=m, channel 0x00000009, intr 00000000. MMU Fault: ENGINE GRAPHICS GPC2 GPCCLIENT_T1_1 faulted @ 0x0_00000000. Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE.
Does that mean this error not only corrupts the context but also resets the GPU hardware, so that all processes or threads running on this device will fail?

I haven’t looked at this closely, but based on your report I would imagine that process isolation might also be needed for error control, even in the driver API. I had previously created this example for the runtime API, to demonstrate it.