Error in matrix multiplication: launch timed out

Hey guys. I have the following code, which was improved by one of the members of this forum. It works fine when N (the size of the matrix) is 2048, but when N = 4096 I get the error message:

“The launch timed out and was terminated”

The code below compiles fine as a single file, so could you run it and tell me whether you experience the same problem, and what kind of improvement I can make to solve it?

Thanks!
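In case it's relevant: my guess is that at N = 4096 a single kernel launch simply runs longer than the display driver's watchdog allows, which is what produces that message. Below is a small standalone check (my own snippet, not part of the program further down; it just uses the standard cudaDeviceGetAttribute call) that reports whether the watchdog is enabled on the card:

#include <cuda_runtime.h>
#include <cstdio>

int main()
{
	// cudaDevAttrKernelExecTimeoutEnabled is 1 when the GPU also drives a display
	// and the OS watchdog will terminate kernels that run too long.
	int timeoutEnabled = 0;
	cudaDeviceGetAttribute(&timeoutEnabled, cudaDevAttrKernelExecTimeoutEnabled, 0);
	printf("Kernel execution timeout enabled: %d\n", timeoutEnabled);
	return 0;
}

And here is the full program: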

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#endif

#include <windows.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <vector>
#include <fstream>

using namespace std;

#define BLOCK_SIZE 32

class cuda_time
{
private:
	static bool initialized;

	// Returns the overhead of the timer in ticks
	static LONGLONG GetOverhead()
	{
		cuda_time t;
		t.start();
		t.stop();
		return t.m_stop.QuadPart - t.m_start.QuadPart;
	}

	LARGE_INTEGER m_start;
	LARGE_INTEGER m_stop;
	static LARGE_INTEGER m_freq;
	static LONGLONG m_overhead;

public:
	cuda_time()
	{
		if (initialized == false)
		{
			initialized = true;
			QueryPerformanceFrequency(&cuda_time::m_freq);
			m_overhead = GetOverhead();
		}
	}

	void start()
	{
		QueryPerformanceCounter(&m_start);
	}

	double last;

	// Returns elapsed time in milliseconds
	double stop()
	{
		QueryPerformanceCounter(&m_stop);
		last = (m_stop.QuadPart - m_start.QuadPart - m_overhead) * 1000.0 / m_freq.QuadPart;
		return last;
	}

	double get_time()
	{
		return last;
	}
};

bool cuda_time::initialized = false;
LARGE_INTEGER cuda_time::m_freq;
LONGLONG cuda_time::m_overhead;

void print(int* a, int size)
{
	for(int i = 0; i < size * size; i++)
	{
		if(i % size == 0)
		{
			cout << endl;
		}
		cout << a[i] << "\t";
	}
	//wait
	cin.get();
}

__global__ void MM2(float* A, float* B, float* C, int size)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	int bx = blockIdx.x;
	int by = blockIdx.y;

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
	int iterations = size / BLOCK_SIZE;

	// starting offsets of this block's first tile of A (row strip) and B (column strip)
	int indexA = size * BLOCK_SIZE * by + ty * size + tx;
	int indexB = bx * BLOCK_SIZE + ty * size + tx;

	float Csub = 0;

#pragma unroll 20
	for(int i = 0; i < iterations; i++)
	{
		//load the data
		As[ty][tx] = A[indexA];
		Bs[ty][tx] = B[indexB];
		indexA += BLOCK_SIZE;
		indexB += size * BLOCK_SIZE;
		//wait
		__syncthreads();

#pragma unroll
		for(int k = 0; k < BLOCK_SIZE; k++)
		{
			Csub += As[ty][k] * Bs[k][tx];
		}
		__syncthreads();
	}
	int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[indexC + size * ty + tx] = Csub;
}

__global__ void MM1(float* A, float* B, float* C, int size)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	int bx = blockIdx.x;
	int by = blockIdx.y;

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
	int iterations = size / BLOCK_SIZE;
	int indexA = size * BLOCK_SIZE * by + ty * size + tx;
	int indexB = bx * BLOCK_SIZE + ty * size + tx;

	float Csub = 0;
	//#pragma unroll
	for(int i = 0; i < iterations; i++)
	{
		//load the data
		As[ty][tx] = A[indexA];
		Bs[ty][tx] = B[indexB];
		indexA += BLOCK_SIZE;
		indexB += size * BLOCK_SIZE;
		//wait
		__syncthreads();

		// inner product over the tile, unrolled by hand (BLOCK_SIZE = 32)
		Csub += As[ty][0] * Bs[0][tx];
		Csub += As[ty][1] * Bs[1][tx];
		Csub += As[ty][2] * Bs[2][tx];
		Csub += As[ty][3] * Bs[3][tx];
		Csub += As[ty][4] * Bs[4][tx];
		Csub += As[ty][5] * Bs[5][tx];
		Csub += As[ty][6] * Bs[6][tx];
		Csub += As[ty][7] * Bs[7][tx];
		Csub += As[ty][8] * Bs[8][tx];
		Csub += As[ty][9] * Bs[9][tx];
		Csub += As[ty][10] * Bs[10][tx];
		Csub += As[ty][11] * Bs[11][tx];
		Csub += As[ty][12] * Bs[12][tx];
		Csub += As[ty][13] * Bs[13][tx];
		Csub += As[ty][14] * Bs[14][tx];
		Csub += As[ty][15] * Bs[15][tx];
		Csub += As[ty][16] * Bs[16][tx];
		Csub += As[ty][17] * Bs[17][tx];
		Csub += As[ty][18] * Bs[18][tx];
		Csub += As[ty][19] * Bs[19][tx];
		Csub += As[ty][20] * Bs[20][tx];
		Csub += As[ty][21] * Bs[21][tx];
		Csub += As[ty][22] * Bs[22][tx];
		Csub += As[ty][23] * Bs[23][tx];
		Csub += As[ty][24] * Bs[24][tx];
		Csub += As[ty][25] * Bs[25][tx];
		Csub += As[ty][26] * Bs[26][tx];
		Csub += As[ty][27] * Bs[27][tx];
		Csub += As[ty][28] * Bs[28][tx];
		Csub += As[ty][29] * Bs[29][tx];
		Csub += As[ty][30] * Bs[30][tx];
		Csub += As[ty][31] * Bs[31][tx];

		/*for(int k=0; k < BLOCK_SIZE; k++)
		{
			Csub += As[ty][k] * Bs[k][tx];
		}*/

		__syncthreads();
	}
	int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[indexC + size * ty + tx] = Csub;
}

__global__ void MM0(float* A, float* B, float* C, int size)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	int bx = blockIdx.x;
	int by = blockIdx.y;

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	int iterations = size / BLOCK_SIZE;
	int indexA = size * BLOCK_SIZE * by + ty * size + tx;
	int indexB = bx * BLOCK_SIZE + ty * size + tx;

	float Csub = 0;
	//#pragma unroll
	for(int i = 0; i < iterations; i++)
	{
		//load the data
		As[ty][tx] = A[indexA];
		Bs[ty][tx] = B[indexB];
		indexA += BLOCK_SIZE;
		indexB += size * BLOCK_SIZE;
		//wait
		__syncthreads();
		//#pragma unroll
		for(int k = 0; k < BLOCK_SIZE; k++)
		{
			Csub += As[ty][k] * Bs[k][tx];
		}

		__syncthreads();
	}

	int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[indexC + size * ty + tx] = Csub;
}

void run()
{
	const int N = 4096;
	float* A = new float[N * N];
	float* B = new float[N * N];
	float* C0 = new float[N * N];
	float* C1 = new float[N * N];
	float* C2 = new float[N * N];

	for (int i = 0; i < N*N; ++i) A[i] = 1;
	for (int i = 0; i < N*N; ++i) B[i] = 1;

	float* a;
	float* b;
	float* c0;
	float* c1;
	float* c2;
	cudaMalloc((void**)&a, N * N * sizeof(float));
	cudaMalloc((void**)&b, N * N * sizeof(float));
	cudaMalloc((void**)&c0, N * N * sizeof(float));
	cudaMalloc((void**)&c1, N * N * sizeof(float));
	cudaMalloc((void**)&c2, N * N * sizeof(float));

	cudaMemcpy(a, A, N * N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(b, B, N * N * sizeof(float), cudaMemcpyHostToDevice);

	for (int i = 0; i < 3; ++i)
	{
		cudaMemcpy(c0, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
		cuda_time ct;
		ct.start();

		dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
		dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
		MM0<<<grid,threads>>>(a, b, c0, N);
		cudaDeviceSynchronize();
		cudaError_t e = cudaGetLastError();

		if (e != cudaSuccess)
			std::cout << cudaGetErrorString(e) << "\n";
		ct.stop();
		cout << "TIME IS:\t" << ct.get_time() << endl;
		cudaMemcpy(C0, c0, N * N * sizeof(float), cudaMemcpyDeviceToHost);
	}
	//stop
	//cin.get();

	for (int i = 0; i < 3; ++i)
	{
		cudaMemcpy(c1, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
		cuda_time ct;
		ct.start();

		dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
		dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
		MM1<<<grid,threads>>>(a, b, c1, N);

		cudaDeviceSynchronize();
		cudaError_t e = cudaGetLastError();

		if (e != cudaSuccess)
			std::cout << cudaGetErrorString(e) << "\n";

		ct.stop();
		cout << "TIME IS:\t" << ct.get_time() << endl;
		cudaMemcpy(C1, c1, N * N * sizeof(float), cudaMemcpyDeviceToHost);
	}

	for (int i = 0; i < 3; ++i)
	{
		cudaMemcpy(c2, B, N * N * sizeof(float), cudaMemcpyHostToDevice);
		cuda_time ct;
		ct.start();

		dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
		dim3 grid(N/BLOCK_SIZE, N/BLOCK_SIZE);
		MM2<<<grid,threads>>>(a, b, c2, N);

		cudaDeviceSynchronize();
		cudaError_t e = cudaGetLastError();
		if (e != cudaSuccess)
			std::cout << cudaGetErrorString(e) << "\n";
		ct.stop();
		cout << "TIME IS:\t" << ct.get_time() << endl;
		cudaMemcpy(C2, c2, N * N * sizeof(float), cudaMemcpyDeviceToHost);
	}

	// compare MM1 and MM2 results against the MM0 reference
	for (int i = 0; i < N*N; ++i)
	{
		if (C0[i] != C1[i])
		{
			cout << "C1 " << i << "\n";
			break;
		}
	}

	for (int i = 0; i < N*N; ++i)
	{
		if (C0[i] != C2[i])
		{
			cout << "C2 " << i << "\n";
			break;
		}
	}

	cudaFree(a); cudaFree(b); cudaFree(c0); cudaFree(c1); cudaFree(c2);
	delete[] A; delete[] B; delete[] C0; delete[] C1; delete[] C2;
}

int main()
{
	run();
	cout << "EXITING THE APPLICATION" << endl;
	cin.get();
	return 0;
}

See signature.
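P.S. One idea I had, just a guess on my part and not something from the original code: since the kernels give correct results at N = 2048, maybe the only problem is that a single launch over the full 4096x4096 grid runs longer than the watchdog allows. Would splitting the grid into a few row strips and launching each strip separately, roughly like the sketch below, be a reasonable workaround? The strip count of 4, the "rowOffset" parameter, and the MM0_strip / launch_in_strips names are my own invention.

// Hypothetical variant of MM0 that computes only a horizontal strip of C.
// rowOffset is the strip's starting block row; everything else matches MM0.
__global__ void MM0_strip(float* A, float* B, float* C, int size, int rowOffset)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	int bx = blockIdx.x;
	int by = blockIdx.y + rowOffset;   // shift this launch's blocks down by rowOffset

	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
	int iterations = size / BLOCK_SIZE;
	int indexA = size * BLOCK_SIZE * by + ty * size + tx;
	int indexB = bx * BLOCK_SIZE + ty * size + tx;

	float Csub = 0;
	for(int i = 0; i < iterations; i++)
	{
		As[ty][tx] = A[indexA];
		Bs[ty][tx] = B[indexB];
		indexA += BLOCK_SIZE;
		indexB += size * BLOCK_SIZE;
		__syncthreads();
		for(int k = 0; k < BLOCK_SIZE; k++)
			Csub += As[ty][k] * Bs[k][tx];
		__syncthreads();
	}
	int indexC = size * BLOCK_SIZE * by + BLOCK_SIZE * bx;
	C[indexC + size * ty + tx] = Csub;
}

// Host side: several smaller launches instead of one big one, with a sync after
// each so no single launch holds the GPU long enough to trip the watchdog.
void launch_in_strips(float* a, float* b, float* c, int N)
{
	const int strips = 4;   // my guess; tune as needed
	int blockRowsPerStrip = N / BLOCK_SIZE / strips;
	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid(N / BLOCK_SIZE, blockRowsPerStrip);
	for (int s = 0; s < strips; ++s)
	{
		MM0_strip<<<grid, threads>>>(a, b, c, N, s * blockRowsPerStrip);
		cudaDeviceSynchronize();
	}
}

Is this the right direction, or is there a better way to keep the single big launch under the timeout?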