cublasSgemm gives incorrect result with big matrix

I simple use cublasSgemm to do matrix multiplication.
some information:
Device0:“GeForce GTX 750 Ti”
CUDA driver version: 11.0
compute capability: 5.0
I build with compute_50,sm_50 in windows 10 visual studio.
if SIZE <=44, result is equal to the cpu version, but
if SIZE >= 45, result is not equal to cpu version with small difference:
i=1488, c[i]=67681848.000000, g[i]=67681840.000000



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include 
#include 
#include 
#include 


template
struct Matrix
{
	int size() const
	{
		return m_rows * m_cols;
	}

	T* m_array;
	int m_rows;
	int m_cols;
	size_t m_pitch;
};

template
__host__ void mulMatHost(const Matrix& a, const Matrix& b, Matrix& c)
{
	assert(a.m_cols == b.m_rows);
	c.m_rows = a.m_rows;
	c.m_cols = b.m_cols;
	for (int i = 0; i < a.m_rows; i++)
	{
		for (int j = 0; j < b.m_cols; j++)
		{
			double value = 0;
			for (int k = 0; k < a.m_cols; k++)
			{
				int idxA = i * a.m_cols + k;
				int idxB = k * b.m_cols + j;
				value += a.m_array[idxA] * b.m_array[idxB];
			}
			int idxC = i * c.m_cols + j;
			c.m_array[idxC] = (T)value;
		}
	}
}

template
__host__ void mulMatCublas(const Matrix& a, const Matrix& b, Matrix& c)
{
	c.m_rows = a.m_rows;
	c.m_cols = b.m_cols;
	Matrix dev_a, dev_b, dev_c;
	cudaError_t status = cudaSuccess;

	dev_a.m_array = 0;
	dev_b.m_array = 0;
	dev_c.m_array = 0;

	status = cudaMalloc((void**)& dev_a.m_array, a.size() * sizeof(T));
	if (status != cudaSuccess)
	{
		printf("cudaMalloc failed \n");
		goto Error;
	}

	status = cudaMalloc((void**)& dev_b.m_array, b.size() * sizeof(T));
	if (status != cudaSuccess)
	{
		printf("cudaMalloc failed \n");
		goto Error;
	}

	status = cudaMalloc((void**)& dev_c.m_array, c.size() * sizeof(T));
	if (status != cudaSuccess)
	{
		printf("cudaMalloc failed \n");
		goto Error;
	}

	status = cudaMemcpy(dev_a.m_array, a.m_array, a.size() * sizeof(T), cudaMemcpyHostToDevice);
	status = cudaMemcpy(dev_b.m_array, b.m_array, b.size() * sizeof(T), cudaMemcpyHostToDevice);
	if (status != cudaSuccess)
	{
		printf("cudaMemcpy host to device failed:%s\n", cudaGetErrorString(status));
		goto Error;
	}


	cublasHandle_t handle;
	const float alpha = 1;
	const float beta = 0;

	cublasStatus_t cublasStatus;
	cublasStatus = cublasCreate(&handle);
	if (cublasStatus != CUBLAS_STATUS_SUCCESS)
	{
		printf("cublasCreate failed:%d\n", cublasStatus);
		goto Error;
	}

	cublasStatus = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, b.m_cols, a.m_rows, b.m_rows, &alpha, dev_b.m_array, b.m_cols, dev_a.m_array, a.m_cols, &beta, dev_c.m_array, b.m_cols);
	if (cublasStatus != CUBLAS_STATUS_SUCCESS)
	{
		printf("cublasSgemm failed:%d\n", cublasStatus);
		goto Error;
	}

	cudaDeviceSynchronize();
	status = cudaGetLastError();
	if (status != cudaSuccess)
	{
		printf("cubasSgemm failed:%s\n", cudaGetErrorString(status));
		goto Error;
	}

	status = cudaMemcpy(c.m_array, dev_c.m_array, c.size() * sizeof(T), cudaMemcpyDeviceToHost);
	if (status != cudaSuccess)
	{
		printf("cudaMemcpy device to host failed\n");
		goto Error;
	}

Error:
	cublasDestroy(handle);
	cudaFree(dev_a.m_array);
	cudaFree(dev_b.m_array);
	cudaFree(dev_c.m_array);
}

int main()
{

	const int SIZE = 44;
	const int ROWS_A = SIZE;
	const int COLS_A = SIZE;
	const int SIZE_A = ROWS_A * COLS_A;

	const int ROWS_B = SIZE;
	const int COLS_B = SIZE;
	const int SIZE_B = ROWS_B * COLS_B;

	Matrix a, b, c, g;
	a.m_rows = ROWS_A;
	a.m_cols = COLS_A;
	b.m_rows = ROWS_B;
	b.m_cols = COLS_B;
	a.m_array = new float[SIZE_A];
	b.m_array = new float[SIZE_B];
	c.m_array = new float[ROWS_A * COLS_B];
	g.m_array = new float[ROWS_A * COLS_B];
	for (int i = 0; i < a.size(); i++)
	{
		a.m_array[i] = i;
	}

	for (int i = 0; i < b.size(); i++)
	{
		b.m_array[i] = i;
	}
	for (int i = 0; i < ROWS_A * COLS_B; i++)
	{
		c.m_array[i] = 0;
		g.m_array[i] = 0;
	}

	mulMatHost(a, b, c);
	mulMatCublas(a, b, g);

	for (int i = 0; i < c.size(); i++)
	{
		if (fabs(c.m_array[i] - g.m_array[i]) > 1e-6)
		{
			printf("i=%d, c[i]=%f, g[i]=%f\n", i, c.m_array[i], g.m_array[i]);
			exit(EXIT_FAILURE);
		}
	}

	printf("result is equal!\n");

	delete[]a.m_array;
	delete[]b.m_array;
	delete[]c.m_array;
	delete[]g.m_array;

	return 0;
}