Hello, I’m a inexperienced person in using CUDA C.

One question I have is that for the matrix multiplication code with the following structure, why he can parallelize the computation with IO, i.e. use double buffer for data prefetching. In my thinking with C++, the for loop in it is that it should execute the code sequentially, I don’t know why it is parallelized. I know that the FFMA and Load/Store Unit in the GPU can execute instructions in parallel, but corresponding to the cuda core functions, I’m having a hard time understanding it, thank you!

```
__global__ void MatrixMulKernel_Shared_Prefetch(float *A, float *B, float *C, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns)
{
__shared__ float sharedM[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float sharedN[BLOCK_SIZE][BLOCK_SIZE];
float sum = 0.0;
float gl_a, gl_b;
int row = blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col < numAColumns && row < numARows)
sharedM[threadIdx.y][threadIdx.x] = A[row*numAColumns + threadIdx.x];
else
sharedM[threadIdx.y][threadIdx.x] = 0.0;
if (row < numBRows && col < numBColumns)
sharedN[threadIdx.y][threadIdx.x] = B[threadIdx.y*numBColumns + col];
else
sharedN[threadIdx.y][threadIdx.x] = 0.0;
__syncthreads();
for (int i = 1; i < (int)(ceil((float)numAColumns / blockDim.x)); i++)
{
if (i*blockDim.x + threadIdx.x < numAColumns && row < numARows)
gl_a = A[row*numAColumns + i*blockDim.x + threadIdx.x];
else
gl_a = 0.0;
if (i*blockDim.y + threadIdx.y < numBRows && col < numBColumns)
gl_b = B[(i*blockDim.y + threadIdx.y)*numBColumns + col];
else
gl_b = 0.0;
int stride_i = (i + 1) % 2; // compute flag
int stride_j = i % 2; // IO flag
for (int j = 0; j < blockDim.x; j++)
sum += sharedM[stride_i*(BLOCK_SIZE/2) + threadIdx.y][j] * sharedN[stride_i*(BLOCK_SIZE/2) + j][threadIdx.x];
sharedM[stride_j*(BLOCK_SIZE/2) + threadIdx.y][threadIdx.x] = gl_a;
sharedN[stride_j*(BLOCK_SIZE/2) + threadIdx.y][threadIdx.x] = gl_b;
__syncthreads();
}
for (int j = 0; j < blockDim.x; j++)
sum += sharedM[1*(BLOCK_SIZE/2) + threadIdx.y][j] * sharedN[1*(BLOCK_SIZE/2) + j][threadIdx.x];
__syncthreads();
if (row < numCRows && col < numCColumns)
C[row*numCColumns + col] = sum;
}
```