Matrix inversion performance

I built a function that inverts matrices over GF(2) using the Gauss-Jordan algorithm: a host loop fixes one column per iteration by launching a set of CUDA kernels.
The problem is that after a certain number of iterations (depending on the size of the matrix being inverted) the kernels' execution time goes way up, with no obvious relation to the matrix contents or the values of the variables in that iteration. The jump eventually shows up in every one of the kernels, one kernel per affected iteration, but it does not happen in every iteration.
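
For reference, the per-column elimination that I am parallelizing looks roughly like this on the host. This is only a minimal, single-threaded sketch over row-major 0/1 int arrays (gf2InverseHost is not part of my code, just an illustration of the algorithm):

bool gf2InverseHost(int *M, int *I, int n)
{
	for (int i = 0; i < n; i++) {
		// find a row at or below i with a 1 in column i
		int p = -1;
		for (int r = i; r < n; r++)
			if (M[r * n + i]) { p = r; break; }
		if (p == -1)
			return false;	// singular over GF(2)

		// force a 1 onto the diagonal
		if (p != i)
			for (int c = 0; c < n; c++) {
				M[i * n + c] ^= M[p * n + c];
				I[i * n + c] ^= I[p * n + c];
			}

		// clear column i in every other row
		for (int r = 0; r < n; r++)
			if (r != i && M[r * n + i])
				for (int c = 0; c < n; c++) {
					M[r * n + c] ^= M[i * n + c];
					I[r * n + c] ^= I[i * n + c];
				}
	}
	return true;
}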

My code is as follows:

// If the diagonal entry M[i][i] is not already 1, scan rows i..height-1 for a row
// with a 1 in column i and record its index in *prow via atomicExch.
__global__ void findOne(Matrix M, int i, int *prow, int *isOne) {
	if (!*isOne && *prow != -2)
	{
		int row = blockIdx.x + i;

		if (M.elements[row * M.width + i])
			atomicExch(prow, row);
	}
}

// If M[i][i] is 0 and a pivot row was found, XOR that row into row i
// (in both M and I) so the diagonal entry of column i becomes 1.
__global__ void forceOne(Matrix M, Matrix I, int i, int *prow, int *isOne) {
	if (!*isOne && *prow != -1 && *prow != -2)
	{
		int col = blockIdx.x * blockDim.x + threadIdx.x;

		M.elements[i * M.width + col] = (int)M.elements[*prow * M.width + col] ^ (int)M.elements[i * M.width + col];
		I.elements[i * I.width + col] = (int)I.elements[*prow * I.width + col] ^ (int)I.elements[i * I.width + col];
	}
}

// Eliminate column i from every other row: each row (except i) that has a 1 in
// column i gets row i XORed into it (columns > i of M, and all columns of I).
__global__ void colFix(Matrix M, Matrix I, int i, int *prow, int *isOne)
{
	if (*prow != -2)
	{
		int row = blockIdx.y * blockDim.y + threadIdx.y;
		int col = blockIdx.x * blockDim.x + threadIdx.x;

		if (row != i)
			if (M.elements[row * M.width + i])
			{
				if (col > i)
					M.elements[row * M.width + col] = (int)M.elements[row * M.width + col] ^ (int)M.elements[i * M.width + col];

				I.elements[row * I.width + col] = (int)I.elements[row * I.width + col] ^ (int)I.elements[i * I.width + col];
			}
	}
}

// Explicitly zero column i in every row except row i, and load the next diagonal
// entry M[i+1][i+1] into *isOne for the next iteration.
__global__ void colZeroFix(Matrix M, int i, int *prow, int *isOne)
{
	if (*prow != -2)
	{
		int row = blockIdx.x * blockDim.x + threadIdx.x;

		if (row != i)
			M.elements[row * M.width + i] = 0;

		else
			*isOne = (int)M.elements[(i + 1) * M.width + i + 1];
	}
}

// No row with a 1 in column i was found and the diagonal entry is 0:
// the matrix is singular, so flag the error with -2.
__global__ void SetError(int *prow, int *isOne)
{
	if (!*isOne && *prow == -1)
		*prow = -2;
}

// Reset *prow to -1 for the next column unless the error flag has been set.
__global__ void Reset(int *prow)
{
	if (*prow != -2)
		*prow = -1;
}

// Inverts *this over GF(2) on the GPU; the inverse is written into M.
// Returns false if the matrix turns out to be singular.
bool Matrix::matrixInverse(Matrix M)
{
	Matrix d_M, d_I, I;
	d_M.width = d_M.stride = width; d_M.height = height;
	d_I.width = d_I.stride = width; d_I.height = height;
	size_t size = width * height * sizeof(int);

	cudaMalloc(&d_M.elements, size);
	cudaMemcpy(d_M.elements, elements, size, cudaMemcpyHostToDevice);

	I.init(width, height, stride); I.IdentityMatrix();
	cudaMalloc(&d_I.elements, size);
	cudaMemcpy(d_I.elements, I.elements, size, cudaMemcpyHostToDevice);

	// Setup the execution configuration
	// Spawning n*n threads
	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

	int *prow, row = -1; //errCount = 0;
	cudaMalloc((void**)&prow, sizeof(int));
	cudaMemcpy(prow, &row, sizeof(int), cudaMemcpyHostToDevice);

	int *isOne, is = (int)elements[0];
	cudaMalloc((void**)&isOne, sizeof(int));
	cudaMemcpy(isOne, &is, sizeof(int), cudaMemcpyHostToDevice);

	high_resolution_clock::time_point t1, t2;

	// Launch the device computation threads! 
	for (int i = 0; i < width; i++) {

		//cout << endl << "iteration: " << i;

		/*cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);
		cudaMemcpy(&is, isOne, sizeof(int), cudaMemcpyDeviceToHost);

		cout << endl << "isOne: " << is;
		cout << endl << "prow: " << row;*/

		//t1 = high_resolution_clock::now();
		findOne<<<(height - i), 1>>>(d_M, i, prow, isOne);
		/*t2 = high_resolution_clock::now();
		auto a = duration_cast<microseconds>(t2 - t1).count();
		cout << endl << a;*/

		SetError<<<1, 1>>>(prow, isOne);

		//t1 = high_resolution_clock::now();
		forceOne<<<width / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, d_I, i, prow, isOne);
		/*t2 = high_resolution_clock::now();
		auto b = duration_cast<microseconds>(t2 - t1).count();
		cout << endl << b;*/

		/*cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);
		cout << endl << "prow: " << row;*/

		//t1 = high_resolution_clock::now();
		colFix<<<dimGrid, dimBlock>>>(d_M, d_I, i, prow, isOne);
		/*t2 = high_resolution_clock::now();
		auto c = duration_cast<microseconds>(t2 - t1).count();
		cout << endl << c;*/

		//t1 = high_resolution_clock::now();
		colZeroFix<<<height / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, i, prow, isOne);
		/*t2 = high_resolution_clock::now();
		auto d = duration_cast<microseconds>(t2 - t1).count();
		cout << endl << d;*/

		Reset<<<1, 1>>>(prow);
	}

	// Copy back the results from device to host
	cudaMemcpy(M.elements, d_I.elements, size, cudaMemcpyDeviceToHost);
	//cudaMemcpy(I.elements, d_M.elements, size, cudaMemcpyDeviceToHost);

	cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);

	cudaFree(prow);
	cudaFree(isOne);
	cudaFree(d_M.elements);
	cudaFree(d_I.elements);

	if (row == -2)
		return false;
	return true;
}

All matrices are triangular, and the problem became visible once I started using sizes of 64^2 and 128^2. The matrix elements are stored in float elements[width * height].

You are timing kernel launch time, not kernel runtime: kernel launches are asynchronous, so the host-side clock only measures how long it takes to queue each launch, not how long the kernel runs. Use events for timing, or just take a look at the timeline in the profiler for more accurate kernel run times.
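
If you want to time an individual kernel from the host, CUDA events are recorded on the device and measure the kernel's actual execution time. A minimal sketch, reusing the colFix launch from the question:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
colFix<<<dimGrid, dimBlock>>>(d_M, d_I, i, prow, isOne);
cudaEventRecord(stop);

cudaEventSynchronize(stop);              // block until the kernel has actually finished
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed device time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);

Alternatively, calling cudaDeviceSynchronize() before each high_resolution_clock::now() would make the existing chrono measurements meaningful, at the cost of stalling the host after every launch.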