I have changed the code to use the CUDA driver API.
success.cu
// Naive square matrix multiply: C = A * B, all matrices size x size,
// row-major. One thread computes one output element; launch with a 2D
// grid of 2D blocks covering at least size x size threads (out-of-range
// threads are guarded out).
//
// Flat offsets are widened to size_t before multiplying: the original
// `row * size + k` is evaluated in int and silently overflows once
// size exceeds ~46340 (size*size > 2^31).
extern "C" __global__ void matrixMultiplication(float* A, float* B, float* C, int size) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < size && col < size) {
        float sum = 0.0f;
        for (int k = 0; k < size; k++) {
            sum += A[(size_t)row * size + k] * B[(size_t)k * size + col];
        }
        C[(size_t)row * size + col] = sum;
    }
}
error.cu
// Deliberately faulting variant of matrixMultiplication: after storing the
// correct C = A * B result, every in-range thread also writes through a NULL
// device pointer. This provokes an MMU fault on the GPU (reported by the
// kernel driver as Xid 31, FAULT_PDE / ACCESS_TYPE_VIRT_WRITE) and is used
// here to study how a fault in one context affects other contexts/threads.
// Do NOT "fix" the err write — it is the point of this kernel.
extern "C" __global__ void matrixMultiplication_err(float* A, float* B, float* C, int size) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// Intentionally NULL: never allocated, so any write through it faults.
float *err = NULL;
if (row < size && col < size) {
float sum = 0.0f;
for (int k = 0; k < size; k++) {
sum += A[row * size + k] * B[k * size + col];
}
C[row * size + col] = sum;
// error: intentional write through the NULL pointer to trigger the MMU fault
err[1] = sum;
}
}
mat.cpp
#include <cstdio>
#include <iostream>
#include <vector>

#include <pthread.h>
#include <unistd.h>

#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;
void *kernel1(void* arg) {
cuInit(0);
CUresult result;
CUcontext ctx;
cuCtxCreate(&ctx, 0, 0);
cuCtxSetCurrent(ctx);
printf("kernel1 set context %p\n", ctx);
CUmodule module;
CUfunction kernel;
cuModuleLoad(&module, "matrixMultiplication_err.ptx");
cuModuleGetFunction(&kernel, module, "matrixMultiplication_err");
long long int size = 4096 * 4;
int numThreads = 16;
int numBlocks = (size + numThreads - 1) / numThreads;
cout << "threads: " << numThreads << ",blocks: " << numBlocks << endl;
CUstream stream;
cuStreamCreate(&stream, 0);
float* d_A, *d_B, *d_C;
cuMemAlloc((CUdeviceptr*)&d_A, size * size * sizeof(float));
cuMemAlloc((CUdeviceptr*)&d_B, size * size * sizeof(float));
cuMemAlloc((CUdeviceptr*)&d_C, size * size * sizeof(float));
CUcontext c;
cuCtxGetCurrent(&c);
printf("kernel1 get context %p\n", c);
// 初始化输入矩阵 A 和 B
float* h_A = new float[size * size];
float* h_B = new float[size * size];
for (int i = 0; i < size * size; i++) {
h_A[i] = 1.0f;
h_B[i] = 2.0f;
}
cuMemcpyHtoD((CUdeviceptr)d_A, h_A, size * size * sizeof(float));
cuMemcpyHtoD((CUdeviceptr)d_B, h_B, size * size * sizeof(float));
void *args[] = {&d_A, &d_B, &d_C, &size};
cuLaunchKernel(kernel, numBlocks, numBlocks, 1, numThreads, numThreads, 1, 0, stream, args, NULL);
cuCtxSynchronize();
float* h_C = new float[size * size];
cuMemcpyDtoH(h_C, (CUdeviceptr)d_C, size * size * sizeof(float));
std::cout << "Result:" << std::endl;
for (int i = 0; i < 10; i++) {
for (int j = 0; j < 10; j++) {
std::cout << h_C[i * size + j] << " ";
}
std::cout << std::endl;
}
cuMemFree((CUdeviceptr)d_A);
cuMemFree((CUdeviceptr)d_B);
cuMemFree((CUdeviceptr)d_C);
delete[] h_A;
delete[] h_B;
delete[] h_C;
return NULL;
}
void *kernel2(void* arg) {
cuInit(0);
CUresult result;
CUcontext ctx;
cuCtxCreate(&ctx, 0, 0);
cuCtxSetCurrent(ctx);
printf("kernel2 set context %p\n", ctx);
CUmodule module;
CUfunction kernel;
cuModuleLoad(&module, "matrixMultiplication.ptx");
cuModuleGetFunction(&kernel, module, "matrixMultiplication");
long long int size = 4096 * 4;
int numThreads = 16;
int numBlocks = (size + numThreads - 1) / numThreads;
cout << "threads: " << numThreads << ",blocks: " << numBlocks << endl;
CUstream stream;
cuStreamCreate(&stream, 0);
float* d_A, *d_B, *d_C;
cuMemAlloc((CUdeviceptr*)&d_A, size * size * sizeof(float));
cuMemAlloc((CUdeviceptr*)&d_B, size * size * sizeof(float));
cuMemAlloc((CUdeviceptr*)&d_C, size * size * sizeof(float));
CUcontext c;
cuCtxGetCurrent(&c);
printf("kernel2 get context %p\n", c);
float* h_A = new float[size * size];
float* h_B = new float[size * size];
for (int i = 0; i < size * size; i++) {
h_A[i] = 1.0f;
h_B[i] = 2.0f;
}
cuMemcpyHtoD((CUdeviceptr)d_A, h_A, size * size * sizeof(float));
cuMemcpyHtoD((CUdeviceptr)d_B, h_B, size * size * sizeof(float));
void *args[] = {&d_A, &d_B, &d_C, &size};
cuLaunchKernel(kernel, numBlocks, numBlocks, 1, numThreads, numThreads, 1, 0, stream, args, NULL);
cuCtxSynchronize();
float* h_C = new float[size * size];
cuMemcpyDtoH(h_C, (CUdeviceptr)d_C, size * size * sizeof(float));
std::cout << "Result:" << std::endl;
for (int i = 0; i < 10; i++) {
for (int j = 0; j < 10; j++) {
std::cout << h_C[i * size + j] << " ";
}
std::cout << std::endl;
}
cuMemFree((CUdeviceptr)d_A);
cuMemFree((CUdeviceptr)d_B);
cuMemFree((CUdeviceptr)d_C);
delete[] h_A;
delete[] h_B;
delete[] h_C;
return NULL;
}
// Launch both workers concurrently — kernel1 runs the deliberately
// faulting kernel, kernel2 the correct one, each in its own CUDA
// context — then wait for both to finish.
// Returns 0 on success, 1 if a thread could not be created.
int main() {
    pthread_t p1, p2;
    // pthread_create returns an errno-style code (not via errno), so
    // report and bail instead of silently joining a never-started thread.
    int rc = pthread_create(&p1, NULL, kernel1, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_create(kernel1) failed: %d\n", rc);
        return 1;
    }
    rc = pthread_create(&p2, NULL, kernel2, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_create(kernel2) failed: %d\n", rc);
        pthread_join(p1, NULL);  // don't leak the first thread
        return 1;
    }
    pthread_join(p1, NULL);
    pthread_join(p2, NULL);
    return 0;
}
I still get the same error: both thread 1 and thread 2 return incorrect results, and dmesg reports: NVRM: Xid (PCI:0000:65:01): 31, pid=230563, name=m, channel 0x00000009, intr 00000000. MMU Fault: ENGINE GRAPHICS GPC2 GPCCLIENT_T1_1 faulted @ 0x0_00000000. Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE.
Does that mean this error not only corrupts the context but also resets the GPU hardware, so that all processes and threads running on this device will fail?