Hey guys. I have the following code, which was improved by one of the members of this forum. It works fine when N (the size of the matrix) is 2048, but when
N = 4096, I get the error message:
"The launch timed out and was terminated"
The code below compiles fine as a single file, so could you run it and tell me whether you experience the same problem, and what kind of improvement I can make to solve it?
Thanks!
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>
using namespace std;
#define BLOCK_SIZE 32
// Host-side stopwatch built on the Win32 performance counter.
//
// Usage: call start(), do the work, call stop(); stop() returns the elapsed
// time in milliseconds and get_time() returns that same value again later.
// The first instance ever constructed also queries the counter frequency and
// measures the cost of an empty start()/stop() pair, which is subtracted from
// every later measurement.
//
// The timer only observes host time: asynchronous GPU work must be
// synchronised by the caller before stop() for the reading to include it.
class cuda_time
{
private:
    static bool initialized;      // true once m_freq / m_overhead have been set up
    static LARGE_INTEGER m_freq;  // performance-counter ticks per second
    static LONGLONG m_overhead;   // ticks consumed by an empty start()/stop() pair

    LARGE_INTEGER m_start;        // counter value captured by start()
    LARGE_INTEGER m_stop;         // counter value captured by stop()

    // Returns the overhead of the timer in ticks
    static LONGLONG GetOverhead()
    {
        cuda_time t;
        t.start();
        t.stop();
        return t.m_stop.QuadPart - t.m_start.QuadPart;
    }

public:
    // Creates a stopped timer whose reading is 0 ms.
    cuda_time() : last(0.0)
    {
        // Give both timestamps a defined value so that stop() without a prior
        // start(), or get_time() without a prior stop(), never reads garbage.
        m_start.QuadPart = 0;
        m_stop.QuadPart = 0;

        if (!initialized)
        {
            // Set the flag first: GetOverhead() constructs another cuda_time
            // and must not re-enter this one-time setup.
            initialized = true;
            QueryPerformanceFrequency(&m_freq);
            m_overhead = GetOverhead();
        }
    }

    // Records the starting timestamp.
    void start()
    {
        QueryPerformanceCounter(&m_start);
    }

    // Most recent measurement in milliseconds (0 until stop() has been called).
    double last;

    // Records the stopping timestamp and returns the elapsed time since
    // start() in milliseconds, with the measured timer overhead subtracted.
    double stop()
    {
        QueryPerformanceCounter(&m_stop);
        last = (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
        return last;
    }

    // Returns the value computed by the most recent stop().
    double get_time()
    {
        return last;
    }
};
bool cuda_time::initialized = false;
LARGE_INTEGER cuda_time::m_freq;
LONGLONG cuda_time::m_overhead;
// Writes the first size * size entries of `a` to stdout, tab-separated, with
// a line break emitted before every entry whose flat index is a multiple of
// `size`. Afterwards blocks on one character of stdin so the output stays
// visible.
void print(int* a, int size)
{
    const int total = size * size;
    int idx = 0;
    while (idx < total)
    {
        const bool startsRow = (idx % size) == 0;
        if (startsRow)
            cout << endl;
        cout << a[idx] << "\t";
        ++idx;
    }
    // wait for the user
    cin.get();
}
// Tiled matrix multiply C = A * B for square size x size matrices stored
// row-major (element (r, c) at index r * size + c). Variant with
// compiler-directed unrolling of both the tile loop and the inner product.
//
// Launch requirements:
//   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); each thread produces one
//     element of C.
//   - the grid must cover the matrix, i.e. at least ceil(size / BLOCK_SIZE)
//     blocks in x and y. Threads that fall outside the matrix do no global
//     memory access.
//   - static shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE floats per block.
//
// `size` does not have to be a multiple of BLOCK_SIZE: partial tiles are
// padded with zeros, which leaves the dot product unchanged.
__global__ void MM2(float* A, float* B, float* C, int size)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of C owned by this thread.
    const int row = blockIdx.y * BLOCK_SIZE + ty;
    const int col = blockIdx.x * BLOCK_SIZE + tx;

    // Ceil-div so that a trailing partial tile is not dropped.
    const int iterations = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float Csub = 0.0f;
#pragma unroll 20
    for (int i = 0; i < iterations; i++)
    {
        // Load one tile of A and one tile of B; out-of-range entries become 0.
        // Flat indices are computed in size_t so large matrices cannot
        // overflow 32-bit int arithmetic.
        const int aCol = i * BLOCK_SIZE + tx;
        const int bRow = i * BLOCK_SIZE + ty;
        As[ty][tx] = (row < size && aCol < size) ? A[(size_t)row * size + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < size && col < size) ? B[(size_t)bRow * size + col] : 0.0f;

        // Every thread must reach both barriers, so the bounds checks above
        // are expressed as selects rather than early returns.
        __syncthreads();
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; k++)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }
        // Do not overwrite the tiles until all threads have finished reading.
        __syncthreads();
    }

    if (row < size && col < size)
    {
        C[(size_t)row * size + col] = Csub;
    }
}
// The inner product below is unrolled by hand for exactly 32 steps; any other
// tile width would either index the shared tiles out of bounds or skip terms.
#if defined(BLOCK_SIZE) && (BLOCK_SIZE != 32)
#error "MM1 is hand-unrolled for BLOCK_SIZE == 32"
#endif

// Tiled matrix multiply C = A * B for square size x size matrices stored
// row-major (element (r, c) at index r * size + c). Variant with a manually
// unrolled 32-step inner product.
//
// Launch requirements:
//   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE) with BLOCK_SIZE == 32; each
//     thread produces one element of C.
//   - the grid must cover the matrix, i.e. at least ceil(size / BLOCK_SIZE)
//     blocks in x and y. Threads that fall outside the matrix do no global
//     memory access.
//   - static shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE floats per block.
//
// `size` does not have to be a multiple of BLOCK_SIZE: partial tiles are
// padded with zeros, which leaves the dot product unchanged.
__global__ void MM1(float* A, float* B, float* C, int size)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of C owned by this thread.
    const int row = blockIdx.y * BLOCK_SIZE + ty;
    const int col = blockIdx.x * BLOCK_SIZE + tx;

    // Ceil-div so that a trailing partial tile is not dropped.
    const int iterations = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float Csub = 0.0f;
    for (int i = 0; i < iterations; i++)
    {
        // Load one tile of A and one tile of B; out-of-range entries become 0.
        // Flat indices are computed in size_t so large matrices cannot
        // overflow 32-bit int arithmetic.
        const int aCol = i * BLOCK_SIZE + tx;
        const int bRow = i * BLOCK_SIZE + ty;
        As[ty][tx] = (row < size && aCol < size) ? A[(size_t)row * size + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < size && col < size) ? B[(size_t)bRow * size + col] : 0.0f;

        // Every thread must reach both barriers, so the bounds checks above
        // are expressed as selects rather than early returns.
        __syncthreads();

        // Hand-unrolled inner product over the 32 entries of the tile.
        Csub += As[ty][0] * Bs[0][tx];
        Csub += As[ty][1] * Bs[1][tx];
        Csub += As[ty][2] * Bs[2][tx];
        Csub += As[ty][3] * Bs[3][tx];
        Csub += As[ty][4] * Bs[4][tx];
        Csub += As[ty][5] * Bs[5][tx];
        Csub += As[ty][6] * Bs[6][tx];
        Csub += As[ty][7] * Bs[7][tx];
        Csub += As[ty][8] * Bs[8][tx];
        Csub += As[ty][9] * Bs[9][tx];
        Csub += As[ty][10] * Bs[10][tx];
        Csub += As[ty][11] * Bs[11][tx];
        Csub += As[ty][12] * Bs[12][tx];
        Csub += As[ty][13] * Bs[13][tx];
        Csub += As[ty][14] * Bs[14][tx];
        Csub += As[ty][15] * Bs[15][tx];
        Csub += As[ty][16] * Bs[16][tx];
        Csub += As[ty][17] * Bs[17][tx];
        Csub += As[ty][18] * Bs[18][tx];
        Csub += As[ty][19] * Bs[19][tx];
        Csub += As[ty][20] * Bs[20][tx];
        Csub += As[ty][21] * Bs[21][tx];
        Csub += As[ty][22] * Bs[22][tx];
        Csub += As[ty][23] * Bs[23][tx];
        Csub += As[ty][24] * Bs[24][tx];
        Csub += As[ty][25] * Bs[25][tx];
        Csub += As[ty][26] * Bs[26][tx];
        Csub += As[ty][27] * Bs[27][tx];
        Csub += As[ty][28] * Bs[28][tx];
        Csub += As[ty][29] * Bs[29][tx];
        Csub += As[ty][30] * Bs[30][tx];
        Csub += As[ty][31] * Bs[31][tx];

        // Do not overwrite the tiles until all threads have finished reading.
        __syncthreads();
    }

    if (row < size && col < size)
    {
        C[(size_t)row * size + col] = Csub;
    }
}
// Tiled matrix multiply C = A * B for square size x size matrices stored
// row-major (element (r, c) at index r * size + c). Baseline variant: plain
// loops with no unroll directives.
//
// Launch requirements:
//   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); each thread produces one
//     element of C.
//   - the grid must cover the matrix, i.e. at least ceil(size / BLOCK_SIZE)
//     blocks in x and y. Threads that fall outside the matrix do no global
//     memory access.
//   - static shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE floats per block.
//
// `size` does not have to be a multiple of BLOCK_SIZE: partial tiles are
// padded with zeros, which leaves the dot product unchanged.
__global__ void MM0(float* A, float* B, float* C, int size)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of C owned by this thread.
    const int row = blockIdx.y * BLOCK_SIZE + ty;
    const int col = blockIdx.x * BLOCK_SIZE + tx;

    // Ceil-div so that a trailing partial tile is not dropped.
    const int iterations = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float Csub = 0.0f;
    for (int i = 0; i < iterations; i++)
    {
        // Load one tile of A and one tile of B; out-of-range entries become 0.
        // Flat indices are computed in size_t so large matrices cannot
        // overflow 32-bit int arithmetic.
        const int aCol = i * BLOCK_SIZE + tx;
        const int bRow = i * BLOCK_SIZE + ty;
        As[ty][tx] = (row < size && aCol < size) ? A[(size_t)row * size + aCol] : 0.0f;
        Bs[ty][tx] = (bRow < size && col < size) ? B[(size_t)bRow * size + col] : 0.0f;

        // Every thread must reach both barriers, so the bounds checks above
        // are expressed as selects rather than early returns.
        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; k++)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }
        // Do not overwrite the tiles until all threads have finished reading.
        __syncthreads();
    }

    if (row < size && col < size)
    {
        C[(size_t)row * size + col] = Csub;
    }
}
// Signature shared by the three matrix-multiply kernels benchmarked below.
typedef void (*MatMulKernel)(float*, float*, float*, int);

// Prints "<what>: <error string>" when `status` is not cudaSuccess.
// Returns true on success so calls can be chained with &&.
static bool reportCudaError(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cout << what << ": " << cudaGetErrorString(status) << "\n";
    return false;
}

// Runs `kernel` three times on the n x n device matrices a and b, printing the
// host-measured time of each launch in milliseconds. Before every launch the
// device output buffer c is overwritten with `seed`; after every launch it is
// copied back into `result`.
static void benchmarkKernel(MatMulKernel kernel, float* a, float* b, float* c,
                            const float* seed, float* result, int n)
{
    const size_t bytes = static_cast<size_t>(n) * n * sizeof(float);
    const dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(n / BLOCK_SIZE, n / BLOCK_SIZE);

    for (int rep = 0; rep < 3; ++rep)
    {
        reportCudaError(cudaMemcpy(c, seed, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy (seed output buffer)");

        cuda_time ct;
        ct.start();
        kernel<<<grid, threads>>>(a, b, c, n);
        // The launch is asynchronous: wait for it so the timer covers the
        // whole kernel and so execution errors surface here.
        const cudaError_t syncStatus = cudaDeviceSynchronize();
        // Picks up launch-configuration errors and clears the error state.
        const cudaError_t lastStatus = cudaGetLastError();
        const cudaError_t e = (syncStatus != cudaSuccess) ? syncStatus : lastStatus;
        if (e != cudaSuccess)
            std::cout << cudaGetErrorString(e) << "\n";
        ct.stop();
        std::cout << "TIME IS:\t" << ct.get_time() << std::endl;

        reportCudaError(cudaMemcpy(result, c, bytes, cudaMemcpyDeviceToHost),
                        "cudaMemcpy (read back result)");
    }
}

// Prints "<label> <index>" for the first position at which the two result
// matrices differ, and nothing if they are identical. The comparison is exact
// on purpose: the kernels perform the same additions in the same order.
static void reportFirstMismatch(const char* label,
                                const std::vector<float>& reference,
                                const std::vector<float>& candidate)
{
    for (size_t i = 0; i < reference.size(); ++i)
    {
        if (reference[i] != candidate[i])
        {
            std::cout << label << " " << i << "\n";
            break;
        }
    }
}

// Benchmarks the MM0, MM1 and MM2 kernels on two N x N matrices filled with
// ones, then checks that MM1 and MM2 reproduce the MM0 result exactly.
//
// NOTE(review): each launch multiplies the full matrices in one kernel. On a
// GPU that is also driving a display, the driver's kernel run-time limit can
// abort a launch that takes too long, which is reported as "the launch timed
// out and was terminated" - confirm against the target machine; a smaller N
// or splitting the work across several launches shortens each kernel.
void run()
{
    const int N = 4096;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(float);

    // Host matrices; std::vector releases them on every exit path.
    std::vector<float> A(count, 1.0f);
    std::vector<float> B(count, 1.0f);
    std::vector<float> C0(count);
    std::vector<float> C1(count);
    std::vector<float> C2(count);

    // Device matrices; kept null until allocated so cleanup is unconditional.
    float* a = 0;
    float* b = 0;
    float* c0 = 0;
    float* c1 = 0;
    float* c2 = 0;

    const bool ready =
        reportCudaError(cudaMalloc((void**)&a, bytes), "cudaMalloc(a)") &&
        reportCudaError(cudaMalloc((void**)&b, bytes), "cudaMalloc(b)") &&
        reportCudaError(cudaMalloc((void**)&c0, bytes), "cudaMalloc(c0)") &&
        reportCudaError(cudaMalloc((void**)&c1, bytes), "cudaMalloc(c1)") &&
        reportCudaError(cudaMalloc((void**)&c2, bytes), "cudaMalloc(c2)") &&
        reportCudaError(cudaMemcpy(a, &A[0], bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a)") &&
        reportCudaError(cudaMemcpy(b, &B[0], bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b)");

    if (ready)
    {
        benchmarkKernel(MM0, a, b, c0, &B[0], &C0[0], N);
        benchmarkKernel(MM1, a, b, c1, &B[0], &C1[0], N);
        benchmarkKernel(MM2, a, b, c2, &B[0], &C2[0], N);

        reportFirstMismatch("C1", C0, C1);
        reportFirstMismatch("C2", C0, C2);
    }

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    reportCudaError(cudaFree(a), "cudaFree(a)");
    reportCudaError(cudaFree(b), "cudaFree(b)");
    reportCudaError(cudaFree(c0), "cudaFree(c0)");
    reportCudaError(cudaFree(c1), "cudaFree(c1)");
    reportCudaError(cudaFree(c2), "cudaFree(c2)");
}
// Program entry point: runs the benchmark, announces completion, then blocks
// on one character of stdin so the console window stays open.
int main()
{
    run();

    std::cout << "EXITING THE APPLICATION" << std::endl;
    std::cin.get();

    return 0;
}