an illegal memory access was encountered

I have search this question but I feel no useful information for me. The following is the code where it got error information:

checkCudaErrors(cudaMemcpy(dev_X, X->data, m*n * sizeof(float), cudaMemcpyHostToDevice));
		start2 = clock();
		dim3 Grid(40, m);
		dim3 Block(1024, 1);
		order2_kernel << <Grid, Block >> > (dev_X, dev_Xsub, m, n, k);
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));
		cudaDeviceSynchronize();
		stop2 = clock();
		float perTime = stop2 - start2;
		totalTime = totalTime + perTime;

		dim3 sumGrid(40, M1);
		dim3 sumBlock(1024, 1);
		int sharedSize = sumBlock.x * sizeof(float);
		sumReduction_kernel << <sumGrid, sumBlock, sharedSize, 0 >> > (dev_Xmean, dev_Xsub, M1, n);
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));  //here got an error (an illegal memory access was encountered)
		
		sub1_kernel << <sumGrid, sumBlock >> > (dev_XFinal, dev_Xsub, dev_Xmean, M1, n);
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));  //here got an error (an illegal memory access was encountered)

A very very strange place is that when I use a smaller matrix (120×32400) to do a test, there is no error occured and it can output a right result. However, when the size of matrix is 224×40000, then there will be an error and the all following kernel will wrong. Finally, if I change the order2_kernel to a similar kernel, everything will be OK!

The following is the code of order2_kernel:

__global__ void order2_kernel(float *p, float *res, int m, int n, int k)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
	int back_index = 2*k * n + tid;
	int ford_index = tid;
	int midl_index = k * n + tid;
#pragma unroll
	for (int i = 0; i < (m - 2 * k); i++)
	{
		res[i * n + tid] = p[back_index] + p[ford_index] - 2 * p[midl_index];
		if (res[i * n + tid] < 0) {
			res[i * n + tid] = fabs(res[i * n + tid]);
		}
		back_index += n;
		ford_index += n;
		midl_index += n;
	}
}

In the initialization settings, the m is the rows of every input matrix, and I set 1024 threads in a single block.
So I want to know what is the problem in my code, it’s so weird.

Any suggestion is appreciate,
many many thanks~