Hello,
I try to made a multiplication matrix batch on A(m,k) et B(k,n) where m = 5 k = 5 and n = 1.
I have this code:
#include <stdio.h>
#include <string>
#include <cuda_runtime.h>
#include <cublas_v2.h>
//#include <conio.h>
#define CUDA_CALL(res, str) { if (res != cudaSuccess) { printf("CUDA Error : %s : %s %d : ERR %s\n", str, __FILE__, __LINE__, cudaGetErrorName(res)); } }
#define CUBLAS_CALL(res, str) { if (res != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS Error : %s : %s %d : ERR %d\n", str, __FILE__, __LINE__, int(res)); } }
static cudaEvent_t cu_TimerStart;
static cudaEvent_t cu_TimerStop;
void d_CUDATimerStart(void)
{
CUDA_CALL(cudaEventCreate(&cu_TimerStart), "Failed to create start event!");
CUDA_CALL(cudaEventCreate(&cu_TimerStop), "Failed to create stop event!");
CUDA_CALL(cudaEventRecord(cu_TimerStart), "Failed to record start event!");
}
float d_CUDATimerStop(void)
{
CUDA_CALL(cudaEventRecord(cu_TimerStop), "Failed to record stop event!");
CUDA_CALL(cudaEventSynchronize(cu_TimerStop), "Failed to synch stop event!");
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, cu_TimerStart, cu_TimerStop), "Failed to elapse events!");
CUDA_CALL(cudaEventDestroy(cu_TimerStart), "Failed to destroy start event!");
CUDA_CALL(cudaEventDestroy(cu_TimerStop), "Failed to destroy stop event!");
return ms;
}
void d_GetInv(float* A, float*B, float *Ainv, float *X, int n)
{
cublasHandle_t cu_cublasHandle;
CUBLAS_CALL(cublasCreate(&cu_cublasHandle), "Failed to initialize cuBLAS!");
float** adA;
float** adB;
float** adAInv;
float** adX;
float* dA;
float* dAInv;
float* dX;
int* dAUPivots;
int* dAUInfo;
float *dB;
float *alpha;
float *beta;
float *dalpha;
float *dbeta;
size_t szA = n * n * sizeof(float);
size_t szB = n * sizeof(float);
CUDA_CALL(cudaMalloc(&adA, sizeof(float*)), "Failed to allocate adA!");
CUDA_CALL(cudaMalloc(&adAInv, sizeof(float*)), "Failed to allocate adA!");
CUDA_CALL(cudaMalloc(&adX,10* sizeof(float*)), "Failed to allocate adC!");
CUDA_CALL(cudaMalloc(&dA, szA), "Failed to allocate dA!");
CUDA_CALL(cudaMalloc(&dAInv, szA), "Failed to allocate dAinv!");
CUDA_CALL(cudaMalloc(&dX, szB*50), "Failed to allocate dX!");
CUDA_CALL(cudaMalloc(&dAUPivots, n * sizeof(int)), "Failed to allocate dAUPivots!");
CUDA_CALL(cudaMalloc(&dAUInfo, sizeof(int)), "Failed to allocate dAUInfo!");
CUDA_CALL(cudaMemcpy(dA, A, szA, cudaMemcpyHostToDevice), "Failed to copy to dA!");
CUDA_CALL(cudaMemcpy(&adA[0], &(dA), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adA!");
CUDA_CALL(cudaMemcpy(&adAInv[0], &(dAInv), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adAInv!");
CUDA_CALL(cudaMemcpy(&adX[0], &(dX), 1*sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to adX!");
CUDA_CALL(cudaMalloc(&adB, sizeof(float*)), "Failed to allocate adB!");
CUDA_CALL(cudaMalloc(&dB, szB), "Failed to allocate dB!");
CUDA_CALL(cudaMemcpy(dB, B, szB, cudaMemcpyHostToDevice), "Failed to copy to dA!");
CUDA_CALL(cudaMemcpy(&adB[0], &(dB), sizeof(float*), cudaMemcpyHostToDevice), "Failed to copy to adB!");
alpha =(float*)malloc(10*sizeof(float));
beta =(float*)malloc(10*sizeof(float));
alpha[0]= 1;
beta[0] = 0;
CUDA_CALL(cudaMalloc(&dalpha, 1*sizeof(float)), "Failed to allocate dalpha!");
CUDA_CALL(cudaMalloc(&dbeta, 1*sizeof(float)), "Failed to allocate dbeta!");
CUDA_CALL(cudaMemcpy(dalpha, alpha, sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to dalpha!");
CUDA_CALL(cudaMemcpy(dbeta, beta, sizeof(float), cudaMemcpyHostToDevice), "Failed to copy to dbeta!");
d_CUDATimerStart();
CUBLAS_CALL(cublasSgetrfBatched(cu_cublasHandle, n, adA, n, dAUPivots, dAUInfo, 1), "Failed to perform LU decomp operation!");
CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
CUBLAS_CALL(cublasSgetriBatched(cu_cublasHandle, n, (const float **)adA, n, dAUPivots, adAInv, n, dAUInfo, 1), "Failed to perform Inverse operation!");
CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
CUBLAS_CALL(cublasSgemmBatched(cu_cublasHandle,CUBLAS_OP_N,CUBLAS_OP_N,n,1,n,dalpha,(const float **)adA,n,(const float **)adB,1,dbeta,(adX),n,1),"Failed to multiply matrix\n");
CUDA_CALL(cudaDeviceSynchronize(), "Failed to synchronize after kernel call!");
float timed = d_CUDATimerStop();
Ainv = (float*)malloc(szA);
CUDA_CALL(cudaMemcpy(Ainv, dAInv, szA, cudaMemcpyDeviceToHost), "Failed to copy to res!");
printf("Ainv: \n");
for(int j = 0; j < n; j++)
{
for(int i = 0; i < n ; i++)
{
printf(" %f",Ainv[i+j*n]);
}
printf(" \n");
}
printf("cublas inverse in: %.5f ms.\n", timed);
CUDA_CALL(cudaFree(adA), "Failed to free adA!");
CUDA_CALL(cudaFree(adAInv), "Failed to free adC!");
CUDA_CALL(cudaFree(dA), "Failed to free dA!");
CUDA_CALL(cudaFree(dAInv), "Failed to free dC!");
CUDA_CALL(cudaFree(dAUPivots), "Failed to free dAUPivots!");
CUDA_CALL(cudaFree(dAUInfo), "Failed to free dAUInfo!");
CUBLAS_CALL(cublasDestroy(cu_cublasHandle), "Failed to destroy cuBLAS!");
}
The code return me that error:
** On entry to SGEMM parameter number 10 had an illegal value
I don’t understand the problem seems to be in the adB matrix but it is constructed the same has the matrix A.
I don’t know if it is a problem the size of the matrix or the contruction of the matrix.
Can someone help me about this?
Best regard.