I implemented a program for matrix multiplication using CUDA. Everything works fine when the matrix size is small, but when the matrix size reaches the order of 1000*1000, the desktop will freeze for 3~5 seconds when the program finish. This process does not show any error message.

With some searching I temporarily solved the problem by increasing the windows TDR limit from 2 seconds to 4 seconds. But I don’t think this is the real reason, since the running time of the program is much less than 2s (not reach the TDR limit)

So, what is the real cause of this problem?

my graphics card is NVIDIA GeForce RTX 3070 Laptop GPU (for both display and cuda computing)

my cuda version is 11.4

my os is windows11

My cuda code is as follows

```
template<typename T>
__global__ void mat_mul_kernel(const T *a, const T *b, T *c, size_t n) {
__shared__ T as[tiled_2d][tiled_2d];
__shared__ T bs[tiled_2d][tiled_2d];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int row = bx * tiled_2d + tx;
int col = by * tiled_2d + ty;
int N = (n + tiled_2d - 1) / tiled_2d;
float value = 0;
for (int i = 0; i < N; i++) {
int aR = row; int aC = i * tiled_2d + ty;
int bR = i * tiled_2d + tx; int bC = col;
if (aR < n && aC < n && bR < n && bC < n) {
as[tx][ty] = a[aR * n + aC];
bs[ty][tx] = b[bR * n + bC];
} else {
as[tx][ty] = 0;
bs[ty][tx] = 0;
}
__syncthreads();
for (int k = 0; k < tiled_2d; k++) {
value += as[tx][k] * bs[ty][k];
}
__syncthreads();
}
if (row < n && col < n)
c[row * n + col] = value;
}
template<typename T>
void mat_mul(const T *a, const T *b, T *c, size_t n) {
T *d_a, *d_b, *d_c;
size_t ds = n * n * sizeof(T); // data size
cudaMalloc((void **)&d_a, n * n);
cudaMemcpy(d_a, a, ds, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_b, n * n);
cudaMemcpy(d_b, b, ds, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_c, n * n);
dim3 dimGrid((unsigned int)ceil(n / tiled_2d), (unsigned int)ceil(n / tiled_2d), 1);
dim3 dimBLock(tiled_2d, tiled_2d, 1);
mat_mul_kernel<T><<<dimGrid, dimBLock>>>(d_a, d_b, d_c, n);
// kernel
cudaMemcpy(c, d_c, ds, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
```