Hey guys.
I am going deeper into CUDA, and I read online that the matrix multiplication example with shared memory has bank conflicts, and that one cheap way of solving them is to pad the shared memory array with an extra column, i.e.
Bs[BLOCK_SIZE][BLOCK_SIZE + 1]
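(For context, here is the access pattern the padding trick is supposed to fix, as I understand it. This is just an illustrative sketch, not code from my kernel below, and it assumes 16 banks of 4-byte words; Ts, Tp, tx, and k are made-up names:)

    __shared__ int Ts[BLOCK_SIZE][BLOCK_SIZE];     // row stride = 16 ints
    __shared__ int Tp[BLOCK_SIZE][BLOCK_SIZE + 1]; // row stride = 17 ints

    // Column-wise read: thread tx touches int address tx * 16 + k, so all
    // 16 threads of a half-warp land in bank k --> 16-way bank conflict.
    int v = Ts[tx][k];

    // With the padding the address is tx * 17 + k, i.e. bank (tx + k) % 16,
    // so the 16 threads hit 16 different banks --> no conflict.
    int w = Tp[tx][k];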
I did that and expected better results, BUT I got worse results.
Below is my code. It compiles in a single .cu file, but you need to add your own timer (I sketched an event-based one at the end of this post). Could anyone confirm my results and tell me why I am getting worse results when everyone online claims otherwise?
(I used #pragma unroll 1 so that the compiler does not unroll the loops, for a fairer comparison.)
My times:
Bs[BLOCK_SIZE][BLOCK_SIZE]     : 1414 milliseconds
Bs[BLOCK_SIZE][BLOCK_SIZE + 1] : 1446 milliseconds
If anyone could explain what I'm doing wrong and confirm my results, I would really appreciate it.
Thanks a lot!
#include <iostream>
//#include "cuda_time.h"
using namespace std;
#define BLOCK_SIZE 16
#define N 2048
__global__ void kernel2(int* A, int* B, int* C, int size)
{
    __shared__ int As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int Bs[BLOCK_SIZE][BLOCK_SIZE + 1]; // <--- TRY WITH BLOCK_SIZE, and BLOCK_SIZE + 1.

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int row = by * BLOCK_SIZE + ty;
    int col = bx * BLOCK_SIZE + tx;
    int value = 0;

    #pragma unroll 1
    for(int m = 0; m < size / BLOCK_SIZE; m++)
    {
        // Load one BLOCK_SIZE x BLOCK_SIZE tile of A and of B into shared memory.
        As[ty][tx] = A[row * size + (m * BLOCK_SIZE + tx)];
        Bs[ty][tx] = B[col + (m * BLOCK_SIZE + ty) * size];
        __syncthreads();

        // Accumulate the partial dot product for this tile.
        #pragma unroll 1
        for(int k = 0; k < BLOCK_SIZE; k++)
        {
            value += As[ty][k] * Bs[k][tx];
        }
        __syncthreads();
    }

    C[row * size + col] = value;
}
void go()
{
    int* A = new int[N * N];
    int* B = new int[N * N];
    int* C = new int[N * N];
    for(int i = 0; i < N * N; i++)
    {
        A[i] = i;
        B[i] = i;
        C[i] = i;
    }

    int* a;
    int* b;
    int* c;
    cudaMalloc((void**)&a, N * N * sizeof(int));
    cudaMalloc((void**)&b, N * N * sizeof(int));
    cudaMalloc((void**)&c, N * N * sizeof(int));
    cudaMemcpy(a, A, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(b, B, N * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(c, C, N * N * sizeof(int), cudaMemcpyHostToDevice);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(N / BLOCK_SIZE, N / BLOCK_SIZE);

    //cuda_time ct; <---- INSERT YOUR OWN TIMER CODE (event-based sketch at the end of this post)
    //ct.start();
    kernel2<<<grid,threads>>>(a, b, c, N);
    cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
    //ct.stop();
    //cout << "TIME IS:\t" << ct.get_time() << endl;

    // Free device and host memory.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    delete[] A;
    delete[] B;
    delete[] C;
}
int main()
{
    go();
    cout << "EXIT" << endl;
    cin.get();
    return 0;
}