I have search this question but I feel no useful information for me. The following is the code where it got error information:
checkCudaErrors(cudaMemcpy(dev_X, X->data, m*n * sizeof(float), cudaMemcpyHostToDevice));
start2 = clock();
dim3 Grid(40, m);
dim3 Block(1024, 1);
order2_kernel << <Grid, Block >> > (dev_X, dev_Xsub, m, n, k);
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
cudaDeviceSynchronize();
stop2 = clock();
float perTime = stop2 - start2;
totalTime = totalTime + perTime;
dim3 sumGrid(40, M1);
dim3 sumBlock(1024, 1);
int sharedSize = sumBlock.x * sizeof(float);
sumReduction_kernel << <sumGrid, sumBlock, sharedSize, 0 >> > (dev_Xmean, dev_Xsub, M1, n);
printf("%s\n", cudaGetErrorString(cudaGetLastError())); //here got an error (an illegal memory access was encountered)
sub1_kernel << <sumGrid, sumBlock >> > (dev_XFinal, dev_Xsub, dev_Xmean, M1, n);
printf("%s\n", cudaGetErrorString(cudaGetLastError())); //here got an error (an illegal memory access was encountered)
A very very strange place is that when I use a smaller matrix (120×32400) to do a test, there is no error occured and it can output a right result. However, when the size of matrix is 224×40000, then there will be an error and the all following kernel will wrong. Finally, if I change the order2_kernel to a similar kernel, everything will be OK!
The following is the code of order2_kernel:
__global__ void order2_kernel(float *p, float *res, int m, int n, int k)
{
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
int back_index = 2*k * n + tid;
int ford_index = tid;
int midl_index = k * n + tid;
#pragma unroll
for (int i = 0; i < (m - 2 * k); i++)
{
res[i * n + tid] = p[back_index] + p[ford_index] - 2 * p[midl_index];
if (res[i * n + tid] < 0) {
res[i * n + tid] = fabs(res[i * n + tid]);
}
back_index += n;
ford_index += n;
midl_index += n;
}
}
In the initialization settings, the m is the rows of every input matrix, and I set 1024 threads in a single block.
So I want to know what is the problem in my code, it’s so weird.
Any suggestion is appreciate,
many many thanks~