I have tried to implement the well known Matrix Multiplication using shared memory and tiles. However, i have 2 problems.
1)The code outputs 0 milliseconds on input 4096 x 4096. The screen crashes and then i get the info that NVIDIA screen has been recovered. (It works fine on 2048 x 2048).
2)Unrolling the loop that is included in the code actually SLOWS IT DOWN rather then speed it up. Can anyone explain why that happens?
here is the code:
#include <iostream>
#include <fstream>
#include "cuda_time.h"
using namespace std;
#define BLOCK_SIZE 32
void print(int* a, int size)
{
for(int i=0; i < size * size; i++)
{
if(i % size == 0)
{
cout << endl;
}
cout << a[i] << "\t";
}
//wait
cin.get();
}
__global__ void MM2()
{
}
__global__ void MM1(float* A, float* B, float* C, int size)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
int iterations = size/BLOCK_SIZE;
int indexA = size * BLOCK_SIZE * by + ty * size + tx;
int indexB = bx * BLOCK_SIZE + ty * size + tx;
float Csub = 0;
//#pragma unroll
for(int i=0; i < iterations; i++)
{
//load the data
As[ty][tx] = A[indexA];
Bs[ty][tx] = B[indexB];
indexA += BLOCK_SIZE;
indexB += size * BLOCK_SIZE;
//wait
__syncthreads();
//#pragma unroll
Csub += As[ty][0] * Bs[0][tx];
Csub += As[ty][1] * Bs[1][tx];
Csub += As[ty][2] * Bs[2][tx];
Csub += As[ty][3] * Bs[3][tx];
Csub += As[ty][4] * Bs[4][tx];
Csub += As[ty][5] * Bs[5][tx];
Csub += As[ty][6] * Bs[6][tx];
Csub += As[ty][7] * Bs[7][tx];
Csub += As[ty][8] * Bs[8][tx];
Csub += As[ty][9] * Bs[9][tx];
Csub += As[ty][10] * Bs[10][tx];
Csub += As[ty][11] * Bs[11][tx];
Csub += As[ty][12] * Bs[12][tx];
Csub += As[ty][13] * Bs[13][tx];
Csub += As[ty][14] * Bs[14][tx];
Csub += As[ty][15] * Bs[15][tx];
Csub += As[ty][16] * Bs[16][tx];
Csub += As[ty][17] * Bs[17][tx];
Csub += As[ty][18] * Bs[18][tx];
Csub += As[ty][19] * Bs[19][tx];
Csub += As[ty][20] * Bs[20][tx];
Csub += As[ty][21] * Bs[21][tx];
Csub += As[ty][22] * Bs[22][tx];
Csub += As[ty][23] * Bs[23][tx];
Csub += As[ty][24] * Bs[24][tx];
Csub += As[ty][25] * Bs[25][tx];
Csub += As[ty][26] * Bs[26][tx];
Csub += As[ty][27] * Bs[27][tx];
Csub += As[ty][28] * Bs[28][tx];
Csub += As[ty][29] * Bs[29][tx];
Csub += As[ty][30] * Bs[30][tx];
Csub += As[ty][31] * Bs[31][tx];
/*for(int k=0; k < BLOCK_SIZE; k++)
{
Csub += As[ty][k] * Bs[k][tx];
}*/
__syncthreads();
}
int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[indexC + size * ty + tx] = Csub;
}
void run()
{
const int N = 4096;
float* A = new float[N * N];
float* B = new float[N * N];
float* C = new float[N * N];
cout << "Reading both files" << endl;
ifstream in;
in.open("C:/CUDA/file_4096.txt");
for(int i=0; i < N * N; i++)
{
in >> A[i];
}
in.close();
ifstream in2;
in2.open("C:/CUDA/file_4096.txt");
for(int i=0; i < N * N; i++)
{
in2 >> B[i];
}
in2.close();
//print(B,N);
float* a;
float* b;
float* c;
cudaMalloc( (void**)&a, N * N * sizeof(float));
cudaMalloc( (void**)&b, N * N * sizeof(float));
cudaMalloc( (void**)&c, N * N * sizeof(float));
cuda_time ct;
ct.start();
cudaMemcpy(a,A, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b,B, N * N * sizeof(float), cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
MM1<<<grid,threads>>>(a,b,c,N);
cudaMemcpy(C,c,N * N * sizeof(float), cudaMemcpyDeviceToHost);
ct.stop();
cout << "TIME IS:\t" << ct.get_time() << endl;
// print(C,N);
}
int main()
{
run();
cout << "EXITING THE APPLICATION" << endl;
cin.get();
return 0;
}
I took your code, and modified it slightly to incorporate three versions of the kernel: MM0() is similar to MM1(), but with the manual unroll you wrote removed, and the inner for-loop uncommented; MM1() is exactly the same as what you stated above; MM2() is similar to MM0(), but with the two “#pragma unroll” uncommented. However, for MM2(), I used “#pragma unroll 20” instead of “#pragma unroll” for the outer for-loop because the compiler does not know how many times to unroll the loop because it is a dynamically-computed value. I examined the PTX, and each kernel has very different code generated. MM0() ran in ~1.3s, MM1() in ~0.69s, and MM2() in 0.67s. The code ran on a GTX470, but could not run on a 9800GT, which may explain your first problem. You did not check the error returns, and does occur for the 4096x4096 problem size on the old 9800GT GPU.
Ken
Here’s the modified code:
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
using namespace std;
#define BLOCK_SIZE 32
class cuda_time
{
private:
static bool initialized;
// Returns the overhead of the timer in ticks
static LONGLONG GetOverhead()
{
cuda_time t;
t.start();
t.stop();
return t.m_stop.QuadPart - t.m_start.QuadPart;
}
LARGE_INTEGER m_start;
LARGE_INTEGER m_stop;
static LARGE_INTEGER m_freq;
static LONGLONG m_overhead;
public:
cuda_time()
{
if (initialized == false)
{
initialized = true;
m_freq = (QueryPerformanceFrequency(&cuda_time::m_freq), cuda_time::m_freq);
m_overhead = GetOverhead();
}
}
void start()
{
QueryPerformanceCounter(&m_start);
}
double last;
double stop()
{
QueryPerformanceCounter(&m_stop);
last = (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
return last;
}
double get_time()
{
return last;
}
};
bool cuda_time::initialized = false;
LARGE_INTEGER cuda_time::m_freq;
LONGLONG cuda_time::m_overhead;
void print(int* a, int size)
{
for(int i=0; i < size * size; i++)
{
if(i % size == 0)
{
cout << endl;
}
cout << a[i] << "\t";
}
//wait
cin.get();
}
__global__ void MM2(float* A, float* B, float* C, int size)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
int iterations = size/BLOCK_SIZE;
int indexA = size * BLOCK_SIZE * by + ty * size + tx;
int indexB = bx * BLOCK_SIZE + ty * size + tx;
float Csub = 0;
#pragma unroll 20
for(int i=0; i < iterations; i++)
{
//load the data
As[ty][tx] = A[indexA];
Bs[ty][tx] = B[indexB];
indexA += BLOCK_SIZE;
indexB += size * BLOCK_SIZE;
//wait
__syncthreads();
#pragma unroll
for(int k=0; k < BLOCK_SIZE; k++)
{
Csub += As[ty][k] * Bs[k][tx];
}
__syncthreads();
}
int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[indexC + size * ty + tx] = Csub;
}
__global__ void MM1(float* A, float* B, float* C, int size)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
int iterations = size/BLOCK_SIZE;
int indexA = size * BLOCK_SIZE * by + ty * size + tx;
int indexB = bx * BLOCK_SIZE + ty * size + tx;
float Csub = 0;
//#pragma unroll
for(int i=0; i < iterations; i++)
{
//load the data
As[ty][tx] = A[indexA];
Bs[ty][tx] = B[indexB];
indexA += BLOCK_SIZE;
indexB += size * BLOCK_SIZE;
//wait
__syncthreads();
//#pragma unroll
Csub += As[ty][0] * Bs[0][tx];
Csub += As[ty][1] * Bs[1][tx];
Csub += As[ty][2] * Bs[2][tx];
Csub += As[ty][3] * Bs[3][tx];
Csub += As[ty][4] * Bs[4][tx];
Csub += As[ty][5] * Bs[5][tx];
Csub += As[ty][6] * Bs[6][tx];
Csub += As[ty][7] * Bs[7][tx];
Csub += As[ty][8] * Bs[8][tx];
Csub += As[ty][9] * Bs[9][tx];
Csub += As[ty][10] * Bs[10][tx];
Csub += As[ty][11] * Bs[11][tx];
Csub += As[ty][12] * Bs[12][tx];
Csub += As[ty][13] * Bs[13][tx];
Csub += As[ty][14] * Bs[14][tx];
Csub += As[ty][15] * Bs[15][tx];
Csub += As[ty][16] * Bs[16][tx];
Csub += As[ty][17] * Bs[17][tx];
Csub += As[ty][18] * Bs[18][tx];
Csub += As[ty][19] * Bs[19][tx];
Csub += As[ty][20] * Bs[20][tx];
Csub += As[ty][21] * Bs[21][tx];
Csub += As[ty][22] * Bs[22][tx];
Csub += As[ty][23] * Bs[23][tx];
Csub += As[ty][24] * Bs[24][tx];
Csub += As[ty][25] * Bs[25][tx];
Csub += As[ty][26] * Bs[26][tx];
Csub += As[ty][27] * Bs[27][tx];
Csub += As[ty][28] * Bs[28][tx];
Csub += As[ty][29] * Bs[29][tx];
Csub += As[ty][30] * Bs[30][tx];
Csub += As[ty][31] * Bs[31][tx];
/*for(int k=0; k < BLOCK_SIZE; k++)
{
Csub += As[ty][k] * Bs[k][tx];
}*/
__syncthreads();
}
int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[indexC + size * ty + tx] = Csub;
}
__global__ void MM0(float* A, float* B, float* C, int size)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
int iterations = size/BLOCK_SIZE;
int indexA = size * BLOCK_SIZE * by + ty * size + tx;
int indexB = bx * BLOCK_SIZE + ty * size + tx;
float Csub = 0;
//#pragma unroll
for(int i=0; i < iterations; i++)
{
//load the data
As[ty][tx] = A[indexA];
Bs[ty][tx] = B[indexB];
indexA += BLOCK_SIZE;
indexB += size * BLOCK_SIZE;
//wait
__syncthreads();
//#pragma unroll
for(int k=0; k < BLOCK_SIZE; k++)
{
Csub += As[ty][k] * Bs[k][tx];
}
__syncthreads();
}
int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[indexC + size * ty + tx] = Csub;
}
void run()
{
const int N = 4096;
float* A = new float[N * N];
float* B = new float[N * N];
float* C0 = new float[N * N];
float* C1 = new float[N * N];
float* C2 = new float[N * N];
for (int i = 0; i < N*N; ++i) A[i] = 1;
for (int i = 0; i < N*N; ++i) B[i] = 1;
float* a;
float* b;
float* c0;
float* c1;
float* c2;
cudaMalloc( (void**)&a, N * N * sizeof(float));
cudaMalloc( (void**)&b, N * N * sizeof(float));
cudaMalloc( (void**)&c0, N * N * sizeof(float));
cudaMalloc( (void**)&c1, N * N * sizeof(float));
cudaMalloc( (void**)&c2, N * N * sizeof(float));
cudaMemcpy(a,A, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(b,B, N * N * sizeof(float), cudaMemcpyHostToDevice);
for (int i = 0; i < 3; ++i)
{
cudaMemcpy(c0, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
cuda_time ct;
ct.start();
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
MM0<<<grid,threads>>>(a,b,c0,N);
cudaDeviceSynchronize();
cudaError_t e = cudaGetLastError();
if (e)
std::cout << cudaGetErrorString(e) << "\n";
ct.stop();
cout << "TIME IS:\t" << ct.get_time() << endl;
cudaMemcpy(C0,c0,N * N * sizeof(float), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 3; ++i)
{
cudaMemcpy(c1, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
cuda_time ct;
ct.start();
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
MM1<<<grid,threads>>>(a,b,c1,N);
cudaDeviceSynchronize();
cudaError_t e = cudaGetLastError();
if (e)
std::cout << cudaGetErrorString(e) << "\n";
ct.stop();
cout << "TIME IS:\t" << ct.get_time() << endl;
cudaMemcpy(C1,c1,N * N * sizeof(float), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < 3; ++i)
{
cudaMemcpy(c2, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
cuda_time ct;
ct.start();
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
MM2<<<grid,threads>>>(a,b,c2,N);
cudaDeviceSynchronize();
cudaError_t e = cudaGetLastError();
if (e)
std::cout << cudaGetErrorString(e) << "\n";
ct.stop();
cout << "TIME IS:\t" << ct.get_time() << endl;
cudaMemcpy(C2,c2,N * N * sizeof(float), cudaMemcpyDeviceToHost);
}
for (int i = 0; i < N*N; ++i)
{
if (C0[i] != C1[i])
{
cout << "C1 " << i << "\n";
break;
}
}
for (int i = 0; i < N*N; ++i)
{
if (C0[i] != C2[i])
{
cout << "C2 " << i << "\n";
break;
}
}
}
int main()
{
run();
cout << "EXITING THE APPLICATION" << endl;
cin.get();
return 0;
}
Hey man.
First of all, thanks a lot for putting so much effort into it. I ran your code and im getting the following results for N = 2048:
M0: 1.18 s
M1: 1.18 s
M2: 1.08 s
So indeed the M2 is the fastest but not by much compared to other kernels. Could you tell me whether your results are for 2048 and how many cores you ve got.
For N=2048, the outputted times (in milliseconds) are now this:
TIME IS: 163.138
TIME IS: 163.114
TIME IS: 163.145
TIME IS: 87.1972
TIME IS: 87.2011
TIME IS: 87.2696
TIME IS: 85.7001
TIME IS: 85.6913
TIME IS: 85.6856
EXITING THE APPLICATION
Note, those times are for code generated using compute_10,sm_10 (and run on a GTX470). For compute_20/sm_20, the times are more in line with what you’re seeing:
TIME IS: 87.6322
TIME IS: 87.5662
TIME IS: 87.6813
TIME IS: 87.568
TIME IS: 87.5997
TIME IS: 87.5888
TIME IS: 85.5228
TIME IS: 85.5257
TIME IS: 85.5341
EXITING THE APPLICATION
So, the times are code gen target dependent, as well as on the physical GPU used.
Also, for the 9800GT, the reason that didn’t work was actually because the number of blocks was 32x32 = 1024. That GPU can’t handle anything over 512. (See Appendix A.1 and F.2 of the CUDA Programming Guide.) Adjusting for that (changing BLOCK_SIZE, and the manual unroll with #if’s), the code works.
From the CUDA Programming Guide, the GTX470 has 14 MP, and 448 “CUDA cores”.