Hello everyone,
I’m learning the cuBLAS API so I coded a basic square matrix multiplication to test it. But I always get an error which code is 11 (CUBLAS_STATUS_MAPPING_ERROR). I’ve tried these things in all the possible combinations:
- Stop X11 and lightdm services
- Use cudaMemcpy instead of cublasSetMatrix
- Use Page-lock Host memory with cudaMallocHost
- Run cudaMalloc before cublasCreate
I always get that error. If I use cudaMemcpy to copy the matrices into GPU memory, I got the kernel launched (or so says nvprof) but the result is 0 (all the elements of the result matrix are 0).
If I run the “simpleCUBLAS” sample from the official NVIDIA’s sample, it runs perfect.
I’m mainly testing it in a Jetson TX1 with CUDA 8.0 installed but I have tried it also in a GeForce 1050 Ti and the results are the same.
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>
#define DEFAULT_MATRIX_DIM 128
void myCudaCheckError(cudaError_t err) {
if(err != cudaSuccess) {
fprintf(stderr, "Could not allocate memory\nError: %d\n", err);
exit(EXIT_FAILURE);
}
}
int main(int argc, char **argv) {
/* Init cuBLAS context */
cublasStatus_t cublas_status;
cublasHandle_t cublas_handle;
if ((cublas_status = cublasCreate(&cublas_handle)) != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "Could not create cuBLAS context\n");
return EXIT_FAILURE;
}
/* Matrix initialization */
float *A, *B, *C;
float *dev_A, *dev_B, *dev_C;
int dim = DEFAULT_MATRIX_DIM;
int element_count = dim * dim;
size_t matrix_size = element_count * sizeof(float);
/* Allocate memory on the host */
myCudaCheckError(cudaMallocHost(&A, matrix_size));
myCudaCheckError(cudaMallocHost(&B, matrix_size));
myCudaCheckError(cudaMallocHost(&C, matrix_size));
/* Allocate memory on the device */
myCudaCheckError(cudaMalloc((void **)&dev_A, matrix_size));
myCudaCheckError(cudaMalloc((void **)&dev_B, matrix_size));
myCudaCheckError(cudaMalloc((void **)&dev_C, matrix_size));
/* Initialize the matrices */
for(int i = 0; i <= element_count; i++) {
A[i] = 1.0;
B[i] = 1.0;
}
/* Set matrices on device */
//cudaMemcpy(dev_A, A, matrix_size, cudaMemcpyHostToDevice);
//cudaMemcpy(dev_B, B, matrix_size, cudaMemcpyHostToDevice);
cublas_status = cublasSetMatrix(dim, dim, matrix_size, A, dim, dev_A, dim);
if(cublas_status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "cublasSetMatrix error: %d\n", cublas_status);
exit(EXIT_FAILURE);
}
cublas_status = cublasSetMatrix(dim, dim, matrix_size, A, dim, dev_A, dim);
if(cublas_status != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "cublasSetMatrix error: %d\n", cublas_status);
exit(EXIT_FAILURE);
}
/* C = alpha * op(A) * op(B) + beta * C */
const float alpha = 1.0f;
const float beta = 1.0f;
cublas_status = cublasSgemm(
cublas_handle, // handle to the cuBLAS library context
CUBLAS_OP_N, // operation op(A)
CUBLAS_OP_N, // operation op(B)
dim, // m : #rows of op(A) and C
dim, // n : #columns of B) and C
dim, // k : #columns of A and #rows B
&alpha, // alpha : scalar used for multiplication
A, // A : array of dimensions lda x n with lda >= max(1,m)
dim, // lda : leading dimension of a 2-dimensional array used to store the matrix A
B, // B : array of dimensions ldb x n with ldb >= max(1,k)
dim, // ldb : leading dimension of a 2-dimensional array used to store the matrix B
&beta, // beta : scalar used for multiplication
C, // C : array of dimensions ldc x n with ldc >= max(1,m)
dim // ldc : leading dimension of a 2-dimensional array used to store the matrix C
);
cublas_status = cublasGetMatrix(dim, dim, sizeof(float), dev_C, dim, C, dim);
printf("cublas_status: %d\nC[13]: %f\n", cublas_status, C[13]);
/* Release cuBLAS context */
if ((cublas_status = cublasDestroy(cublas_handle)) != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "Could not release cuBLAS context\n");
return EXIT_FAILURE;
}
/* Free resources */
cudaFree(A);
cudaFree(B);
cudaFree(C);
cudaFree(dev_A);
cudaFree(dev_B);
cudaFree(dev_C);
return EXIT_SUCCESS;
}
What am I doing wrong?