Why does the matrix multiplication fail when the matrix is very big?

It runs well when n < 1000, but it fails when n > 3000.
Bug: when n > 3000, err = cudaMemcpy(c, cuda_c, sizeof(int) * n * n, cudaMemcpyDeviceToHost) returns 4.
I don't know what is happening. Please help me.
Thank you!

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>   
#include <stdlib.h>   
#include <time.h>  
#include <stdio.h>
#include <math.h>
int n = 3000;// side length of the square n*n matrices used by main()
using namespace std;
// Integer matrix product c = a * b for square n x n matrices stored row-major
// in flat arrays. Each thread computes at most one element of c: the x
// dimension of the launch selects the column and the y dimension the row.
// Threads whose (row, column) falls outside the matrix do nothing, so the
// grid may be larger than n in either dimension.
__global__ void aMatrix(const  int *a, const  int *b,  int n,int  *c) {
	const int x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
	const int y = blockIdx.y * blockDim.y + threadIdx.y;  // output row

	// Guard against threads past the matrix edge.
	if (x >= n || y >= n) {
		return;
	}

	const int *rowOfA = a + y * n;
	int acc = 0;
	for (int k = 0; k < n; ++k) {
		acc += rowOfA[k] * b[k * n + x];
	}
	c[y * n + x] = acc;
}
// CPU reference for the integer matrix product c = a * b.
//
// a, b : input n x n matrices, row-major in flat arrays of n*n ints.
// n    : matrix side length; nothing is written when n <= 0.
// c    : output n x n matrix, row-major; every element is overwritten.
//
// The accumulator is reset for every output element. Previously it was
// declared once outside all loops, so each c[row][col] also contained the sums
// of every element computed before it.
void aMatrixCpu(const int *a, const int *b, int n, int  *c) {
	for (int row = 0; row < n; row++) {
		for (int col = 0; col < n; col++) {
			int sum = 0;
			for (int i = 0; i < n; i++) {
				sum += a[row * n + i] * b[i*n + col];
			}
			c[row * n + col] = sum;
		}
	}
}
// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t err, const char *what) {
	if (err == cudaSuccess) {
		return true;
	}
	cout << what << ": error " << err << " (" << cudaGetErrorString(err) << ")" << endl;
	return false;
}

// Multiplies two random n x n integer matrices on the GPU and on the CPU and
// prints the elapsed time of each phase.
//
// Returns 0 on success and 1 if any host allocation or CUDA call fails. All
// host and device buffers are released on every exit path.
int main() {
	clock_t start, end, gpuEnd, cpuEnd;
	start = clock();// start of the initialisation phase

	if (n <= 0) {
		cout << "Matrix dimension n must be positive" << endl;
		return 1;
	}

	// Compute the byte count in size_t so n*n cannot overflow int.
	const size_t bytes = sizeof(int) * (size_t)n * (size_t)n;

	int *a = (int*)malloc(bytes);
	int *b = (int*)malloc(bytes);
	int *c = (int*)malloc(bytes);
	int *cuda_a = NULL, *cuda_b = NULL, *cuda_c = NULL;
	int status = 1;

	// Single-pass block: any failure breaks out to the shared cleanup below.
	do {
		if (a == NULL || b == NULL || c == NULL) {
			cout << "Failed to allocate host memory" << endl;
			break;
		}

		// Fill both inputs with random values in [0, 20).
		for (int i = 0; i < n; i++)
		{
			for (int j = 0; j < n; j++)
			{
				a[i*n + j] = (rand() % 20);
			}
		}
		for (int i = 0; i < n; i++)
		{
			for (int j = 0; j < n; j++)
			{
				b[i*n + j] = (rand() % 20);
			}
		}

		if (!cudaCallOk(cudaMalloc((void**)&cuda_a, bytes), "Failed to allocate device memory for a")) break;
		if (!cudaCallOk(cudaMalloc((void**)&cuda_b, bytes), "Failed to allocate device memory for b")) break;
		if (!cudaCallOk(cudaMalloc((void**)&cuda_c, bytes), "Failed to allocate device memory for c")) break;
		cout << "malloc success" << endl;

		if (!cudaCallOk(cudaMemcpy(cuda_a, a, bytes, cudaMemcpyHostToDevice), "Failed to copy the a data from host")) break;
		if (!cudaCallOk(cudaMemcpy(cuda_b, b, bytes, cudaMemcpyHostToDevice), "Failed to copy the b data from host")) break;
		// cuda_c is not uploaded: the kernel writes every element of it, and
		// the host buffer c holds no meaningful data at this point.

		cout << "init succeess" << endl;
		end = clock();// end of the initialisation phase
		double endtime = (double)(end - start) / CLOCKS_PER_SEC;
		cout << "Init time:" << endtime * 1000 << "ms" << endl;	// in milliseconds

		// 16x16 threads per block; grid size is the ceiling of n / 16 per axis.
		dim3 threadsPerBlock(16, 16);
		cout << (n + threadsPerBlock.x - 1) / threadsPerBlock.x << endl;
		dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x, (n + threadsPerBlock.y - 1) / threadsPerBlock.y);

		aMatrix <<<numBlocks, threadsPerBlock, 0>>> (cuda_a, cuda_b, n, cuda_c);
		// A launch does not return a status; bad configurations show up here.
		if (!cudaCallOk(cudaGetLastError(), "Failed to launch the kernel")) break;
		// Execution errors are reported by the next synchronising call. If the
		// kernel runs long enough to be killed by a display watchdog, this is
		// presumably where that failure appears instead of at the later copy.
		if (!cudaCallOk(cudaDeviceSynchronize(), "Kernel execution failed")) break;

		if (!cudaCallOk(cudaMemcpy(c, cuda_c, bytes, cudaMemcpyDeviceToHost), "Failed to copy the  data from device")) break;
		cout << a[0] << "," << b[0] << "," << c[0] << endl;

		gpuEnd = clock();// end of the GPU phase
		endtime = (double)(gpuEnd - end) / CLOCKS_PER_SEC;
		cout << "GPU Total time:" << endtime * 1000 << "ms" << endl;	// in milliseconds

		// CPU reference run; overwrites the GPU result held in c.
		aMatrixCpu(a, b, n, c);
		cout << a[0] << "," << b[0] << "," << c[0] << endl;

		cpuEnd = clock();// end of the CPU phase
		endtime = (double)(cpuEnd - gpuEnd) / CLOCKS_PER_SEC;
		cout << "CPU Total time:" << endtime * 1000 << "ms" << endl;	// in milliseconds
		getchar();
		status = 0;
	} while (false);

	// cudaFree and free both accept null pointers, so this is safe on every path.
	cudaFree(cuda_a);
	cudaFree(cuda_b);
	cudaFree(cuda_c);
	free(a);
	free(b);
	free(c);
	return status;
}

You may be hitting a WDDM TDR timeout, or the equivalent on Linux.

Your code runs without any error for me. However my GPU is not hosting a display.

https://docs.nvidia.com/gameworks/content/developertools/desktop/timeout_detection_recovery.htm

Thank you! But I find that the code runs well on my Linux machine, yet it fails when it runs on my Windows machine.

Does your Linux system run a GUI on the GPU you are using for computation?

The GUI watchdog timer limit is a function of the operating system. Its purpose is to prevent users from experiencing a “frozen” machine. A GPU can either execute a compute kernel or execute graphics commands for the GUI. A CUDA kernel therefore will block GUI activity while it is running. If the kernel run time exceeds the GUI watchdog timer limit, the OS assumes that the display system “hangs” and performs a soft reset of the GPU. The CUDA context is destroyed in the process.

Under Windows, the default limit of the watchdog timer is around 2 seconds; on some Linux systems the default limit is higher, e.g. 5 seconds. There are usually ways for users to configure GUI watchdog timer limits. These are OS specific, so if you intend to increase the limit, search the internet for how to change the limit on your operating system. Microsoft documents the registry keys for configuring the TDR timeout on Windows here:

https://docs.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys

Yes, it works! Thank you very much!