I simple use cublasSgemm to do matrix multiplication.
some information:
Device0:“GeForce GTX 750 Ti”
CUDA driver version: 11.0
compute capability: 5.0
I build with compute_50,sm_50 in windows 10 visual studio.
if SIZE <=44, result is equal to the cpu version, but
if SIZE >= 45, result is not equal to cpu version with small difference:
i=1488, c[i]=67681848.000000, g[i]=67681840.000000
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include
#include
#include
#include
template
struct Matrix
{
int size() const
{
return m_rows * m_cols;
}
T* m_array;
int m_rows;
int m_cols;
size_t m_pitch;
};
template
__host__ void mulMatHost(const Matrix& a, const Matrix& b, Matrix& c)
{
assert(a.m_cols == b.m_rows);
c.m_rows = a.m_rows;
c.m_cols = b.m_cols;
for (int i = 0; i < a.m_rows; i++)
{
for (int j = 0; j < b.m_cols; j++)
{
double value = 0;
for (int k = 0; k < a.m_cols; k++)
{
int idxA = i * a.m_cols + k;
int idxB = k * b.m_cols + j;
value += a.m_array[idxA] * b.m_array[idxB];
}
int idxC = i * c.m_cols + j;
c.m_array[idxC] = (T)value;
}
}
}
template
__host__ void mulMatCublas(const Matrix& a, const Matrix& b, Matrix& c)
{
c.m_rows = a.m_rows;
c.m_cols = b.m_cols;
Matrix dev_a, dev_b, dev_c;
cudaError_t status = cudaSuccess;
dev_a.m_array = 0;
dev_b.m_array = 0;
dev_c.m_array = 0;
status = cudaMalloc((void**)& dev_a.m_array, a.size() * sizeof(T));
if (status != cudaSuccess)
{
printf("cudaMalloc failed \n");
goto Error;
}
status = cudaMalloc((void**)& dev_b.m_array, b.size() * sizeof(T));
if (status != cudaSuccess)
{
printf("cudaMalloc failed \n");
goto Error;
}
status = cudaMalloc((void**)& dev_c.m_array, c.size() * sizeof(T));
if (status != cudaSuccess)
{
printf("cudaMalloc failed \n");
goto Error;
}
status = cudaMemcpy(dev_a.m_array, a.m_array, a.size() * sizeof(T), cudaMemcpyHostToDevice);
status = cudaMemcpy(dev_b.m_array, b.m_array, b.size() * sizeof(T), cudaMemcpyHostToDevice);
if (status != cudaSuccess)
{
printf("cudaMemcpy host to device failed:%s\n", cudaGetErrorString(status));
goto Error;
}
cublasHandle_t handle;
const float alpha = 1;
const float beta = 0;
cublasStatus_t cublasStatus;
cublasStatus = cublasCreate(&handle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate failed:%d\n", cublasStatus);
goto Error;
}
cublasStatus = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, b.m_cols, a.m_rows, b.m_rows, &alpha, dev_b.m_array, b.m_cols, dev_a.m_array, a.m_cols, &beta, dev_c.m_array, b.m_cols);
if (cublasStatus != CUBLAS_STATUS_SUCCESS)
{
printf("cublasSgemm failed:%d\n", cublasStatus);
goto Error;
}
cudaDeviceSynchronize();
status = cudaGetLastError();
if (status != cudaSuccess)
{
printf("cubasSgemm failed:%s\n", cudaGetErrorString(status));
goto Error;
}
status = cudaMemcpy(c.m_array, dev_c.m_array, c.size() * sizeof(T), cudaMemcpyDeviceToHost);
if (status != cudaSuccess)
{
printf("cudaMemcpy device to host failed\n");
goto Error;
}
Error:
cublasDestroy(handle);
cudaFree(dev_a.m_array);
cudaFree(dev_b.m_array);
cudaFree(dev_c.m_array);
}
int main()
{
const int SIZE = 44;
const int ROWS_A = SIZE;
const int COLS_A = SIZE;
const int SIZE_A = ROWS_A * COLS_A;
const int ROWS_B = SIZE;
const int COLS_B = SIZE;
const int SIZE_B = ROWS_B * COLS_B;
Matrix a, b, c, g;
a.m_rows = ROWS_A;
a.m_cols = COLS_A;
b.m_rows = ROWS_B;
b.m_cols = COLS_B;
a.m_array = new float[SIZE_A];
b.m_array = new float[SIZE_B];
c.m_array = new float[ROWS_A * COLS_B];
g.m_array = new float[ROWS_A * COLS_B];
for (int i = 0; i < a.size(); i++)
{
a.m_array[i] = i;
}
for (int i = 0; i < b.size(); i++)
{
b.m_array[i] = i;
}
for (int i = 0; i < ROWS_A * COLS_B; i++)
{
c.m_array[i] = 0;
g.m_array[i] = 0;
}
mulMatHost(a, b, c);
mulMatCublas(a, b, g);
for (int i = 0; i < c.size(); i++)
{
if (fabs(c.m_array[i] - g.m_array[i]) > 1e-6)
{
printf("i=%d, c[i]=%f, g[i]=%f\n", i, c.m_array[i], g.m_array[i]);
exit(EXIT_FAILURE);
}
}
printf("result is equal!\n");
delete[]a.m_array;
delete[]b.m_array;
delete[]c.m_array;
delete[]g.m_array;
return 0;
}