Hello,
I am currently trying to get a simple matrix multiplication working using cuBLAS. For simplicity I tried to use the following code:
// This program calculates matrix multiplication (SGEMM) using cuBLAS
// By: Nick from CoffeeBeforeArch
#include <cublas_v2.h>
#include <curand.h>
#include <cassert>
#include <cmath>
#include <ctime>
#include <iostream>
#include <vector>
// Verify our result on the CPU
// Indexing must account for the CUBLAS operating on column-major data
void verify_solution(float *a, float *b, float *c, int M, int N, int K) {
// Tolerance for our result (floats are imperfect)
float epsilon = 0.001f;
// For every row...
for (int row = 0; row < M; row++) {
// For every column
for (int col = 0; col < N; col++) {
// For every element in the row-col pair...
float temp = 0;
for (int i = 0; i < K; i++) {
temp += a[row + M * i] * b[col * K + i];
}
// Check to see if the difference falls within our tolerance
assert(fabs(c[col * M + row] - temp) <= epsilon);
}
}
}
int main() {
// Dimensions for our matrices
// MxK * KxN = MxN
const int M = 1 << 9;
const int N = 1 << 8;
const int K = 1 << 7;
// Pre-calculate the size (in bytes) of our matrices
const size_t bytes_a = M * K * sizeof(float);
const size_t bytes_b = K * N * sizeof(float);
const size_t bytes_c = M * N * sizeof(float);
// Vectors for the host data
std::vector<float> h_a(M * K);
std::vector<float> h_b(K * N);
std::vector<float> h_c(M * N);
// Allocate device memory
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes_a);
cudaMalloc(&d_b, bytes_b);
cudaMalloc(&d_c, bytes_c);
// Pseudo random number generator
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
// Set the seed
curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long)clock());
// Fill the matrix with random numbers on the device
curandGenerateUniform(prng, d_a, M * K);
curandGenerateUniform(prng, d_b, K * M);
// cuBLAS handle
cublasHandle_t handle;
cublasCreate(&handle); // FIXME the program fails here
// Scalaing factors
float alpha = 1.0f;
float beta = 0.0f;
// Calculate: c = (alpha*a) * b + (beta*c)
// MxN = MxK * KxN
// Signature: handle, operation, operation, M, N, K, alpha, A, lda, B, ldb,
// beta, C, ldc
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, &alpha, d_a, M, d_b, K,
&beta, d_c, M);
// Copy back the three matrices
cudaMemcpy(h_a.data(), d_a, bytes_a, cudaMemcpyDeviceToHost);
cudaMemcpy(h_b.data(), d_b, bytes_b, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c.data(), d_c, bytes_c, cudaMemcpyDeviceToHost);
// Verify solution
verify_solution(h_a.data(), h_b.data(), h_c.data(), M, N, K);
std::cout << "COMPLETED SUCCESSFULLY\n";
// Free our memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
This code is taken from here. Further info on my system / compiler can be seen below.
bash-4.2$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
bash-4.2$ nvidia-smi
Mon Jun 13 13:18:34 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A100-SXM... On | 00000000:0B:00.0 Off | 0 |
| N/A 52C P0 62W / 400W | 0MiB / 40536MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
I compiled the code using nvcc -arch=sm_80 cublas.cu -lcublas -lcurand -o cublas
without problems but when executing the result it only prints Killed
after a few seconds. Using cuda-gdb
I found that the error seems to occur when cublasCreate(&handle)
is executed but I am at a loss as to why this is happening. Is the way that the handle is created wrong / incomplete or is it a problem with my system?