# Matrix inversion performance

I built a function that inverts matrices over GF(2) (where addition is XOR) based on the Gauss-Jordan algorithm: a host-side loop runs once per column, and each iteration launches a sequence of kernels that fixes that column.
The problem is that after a certain number of iterations, depending on the size of the matrix being inverted, the execution time of the kernels rises sharply, with no direct relation to the matrix contents or to the values of the variables during that iteration. The rise shows up in all of the kernels, hitting one of them per affected iteration, but it does not occur in every iteration.

My code is as follows:

```cpp
#include <chrono>
#include <iostream>
using namespace std;
using namespace std::chrono;

// Matrix and BLOCK_SIZE are defined elsewhere in the project.

// Scan rows i..height-1 for a row with a 1 in pivot column i and publish it
// through *prow. Skipped once the pivot is already known to be 1 (*isOne)
// or an error has been flagged (*prow == -2).
__global__ void findOne(Matrix M, int i, int *prow, int *isOne) {
    if (!*isOne && *prow != -2)
    {
        int row = blockIdx.x + i;

        if (M.elements[row * M.width + i])
            atomicExch(prow, row);
    }
}

// Row i has a 0 on the diagonal: XOR the found row (*prow) into row i so
// that M[i][i] becomes 1; mirror the same row operation on I.
__global__ void forceOne(Matrix M, Matrix I, int i, int *prow, int *isOne) {
    if (!*isOne && *prow != -1 && *prow != -2)
    {
        int col = blockIdx.x * blockDim.x + threadIdx.x;

        M.elements[i * M.width + col] = (int)M.elements[*prow * M.width + col] ^ (int)M.elements[i * M.width + col];
        I.elements[i * I.width + col] = (int)I.elements[*prow * I.width + col] ^ (int)I.elements[i * I.width + col];
    }
}

// Eliminate column i everywhere else: any row (other than i) with a 1 in
// column i gets row i XORed into it. In M only columns > i need updating;
// in I the whole row is updated.
__global__ void colFix(Matrix M, Matrix I, int i, int *prow, int *isOne)
{
    if (*prow != -2)
    {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;

        if (row != i)
            if (M.elements[row * M.width + i])
            {
                if (col > i)
                    M.elements[row * M.width + col] = (int)M.elements[row * M.width + col] ^ (int)M.elements[i * M.width + col];

                I.elements[row * I.width + col] = (int)I.elements[row * I.width + col] ^ (int)I.elements[i * I.width + col];
            }
    }
}

// Zero out column i in every row except the pivot row, and load the next
// diagonal element into *isOne for the next iteration.
__global__ void colZeroFix(Matrix M, int i, int *prow, int *isOne)
{
    if (*prow != -2)
    {
        int row = blockIdx.x * blockDim.x + threadIdx.x;

        if (row != i)
            M.elements[row * M.width + i] = 0;
        else
            *isOne = (int)M.elements[(i + 1) * M.width + i + 1];
    }
}

// No row with a 1 was found in the pivot column: the matrix is singular,
// so flag the error state (-2).
__global__ void SetError(int *prow, int *isOne)
{
    if (!*isOne && *prow == -1)
        *prow = -2;
}

// Re-arm the pivot-row search for the next iteration unless an error has
// already been flagged.
__global__ void Reset(int *prow)
{
    if (*prow != -2)
        *prow = -1;
}

bool Matrix::matrixInverse(Matrix M)
{
    Matrix d_M, d_I, I;
    d_M.width = d_M.stride = width; d_M.height = height;
    d_I.width = d_I.stride = width; d_I.height = height;
    size_t size = width * height * sizeof(int);

    cudaMalloc(&d_M.elements, size);
    cudaMemcpy(d_M.elements, elements, size, cudaMemcpyHostToDevice);

    I.init(width, height, stride); I.IdentityMatrix();
    cudaMalloc(&d_I.elements, size);
    cudaMemcpy(d_I.elements, I.elements, size, cudaMemcpyHostToDevice);

    // Setup the execution configuration
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

    int *prow, row = -1; //errCount = 0;
    cudaMalloc((void**)&prow, sizeof(int));
    cudaMemcpy(prow, &row, sizeof(int), cudaMemcpyHostToDevice);

    int *isOne, is = (int)elements[0];
    cudaMalloc((void**)&isOne, sizeof(int));
    cudaMemcpy(isOne, &is, sizeof(int), cudaMemcpyHostToDevice);

    high_resolution_clock::time_point t1, t2;

    // Launch the device computation threads!
    for (int i = 0; i < width; i++) {

        //cout << endl << "iteration: " << i;

        /*cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&is, isOne, sizeof(int), cudaMemcpyDeviceToHost);

        cout << endl << "isOne: " << is;
        cout << endl << "prow: " << row;*/

        //t1 = high_resolution_clock::now();
        findOne<<<(height - i), 1>>>(d_M, i, prow, isOne);
        /*t2 = high_resolution_clock::now();
        auto a = duration_cast<microseconds>(t2 - t1).count();
        cout << endl << a;*/

        SetError<<<1, 1>>>(prow, isOne);

        //t1 = high_resolution_clock::now();
        forceOne<<<width / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, d_I, i, prow, isOne);
        /*t2 = high_resolution_clock::now();
        auto b = duration_cast<microseconds>(t2 - t1).count();
        cout << endl << b;*/

        /*cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);
        cout << endl << "prow: " << row;*/

        //t1 = high_resolution_clock::now();
        colFix<<<dimGrid, dimBlock>>>(d_M, d_I, i, prow, isOne);
        /*t2 = high_resolution_clock::now();
        auto c = duration_cast<microseconds>(t2 - t1).count();
        cout << endl << c;*/

        //t1 = high_resolution_clock::now();
        colZeroFix<<<height / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, i, prow, isOne);
        /*t2 = high_resolution_clock::now();
        auto d = duration_cast<microseconds>(t2 - t1).count();
        cout << endl << d;*/

        Reset<<<1, 1>>>(prow);
    }

    // Copy back the results from device to host
    cudaMemcpy(M.elements, d_I.elements, size, cudaMemcpyDeviceToHost);
    //cudaMemcpy(I.elements, d_M.elements, size, cudaMemcpyDeviceToHost);

    // Read the error flag before freeing the device buffer it lives in.
    cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(prow);
    cudaFree(isOne);
    cudaFree(d_M.elements);
    cudaFree(d_I.elements);

    if (row == -2)
        return false;
    return true;
}
```

All matrices are triangular, and the slowdown starts becoming visible once I use sizes 64^2 and 128^2. Elements of the matrix are stored as `float elements[width * height]`.
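
The exact definition of `Matrix` is not included above, but a minimal layout consistent with how the kernels index it would be something like the sketch below (only the member names appear in the code; the bodies of `init` and `IdentityMatrix` are assumptions):

```cpp
// Hypothetical sketch of the Matrix type implied by the code above;
// only the field and method names are taken from the question.
struct Matrix {
    int width, height, stride;
    float *elements;    // row-major storage, values are only 0.0f or 1.0f

    void init(int w, int h, int s) {
        width = w; height = h; stride = s;
        elements = new float[w * h]();      // zero-initialized
    }

    void IdentityMatrix() {                 // set elements to the identity
        for (int r = 0; r < height; r++)
            for (int c = 0; c < width; c++)
                elements[r * width + c] = (r == c) ? 1.0f : 0.0f;
    }

    bool matrixInverse(Matrix M);           // defined above
};
```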

You are timing kernel launch time, not kernel runtime. Kernel launches are asynchronous: the host-side call returns as soon as the launch has been queued, so wrapping a launch in `high_resolution_clock` measures only the enqueue overhead. Once the driver's launch queue fills up, a launch call blocks until a slot frees, which is most likely the sudden rise you see after a certain number of iterations. Use CUDA events for timing, or just take a look at the timeline in the profiler for more accurate kernel run times.
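
As a minimal sketch of event-based timing, reusing the `colFix` launch from your loop (the surrounding setup is assumed to be the same):

```cpp
// Time one kernel with CUDA events; the elapsed time is measured on the
// GPU, so it reflects actual kernel runtime rather than launch overhead.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);                     // marker before the kernel
colFix<<<dimGrid, dimBlock>>>(d_M, d_I, i, prow, isOne);
cudaEventRecord(stop);                      // marker after the kernel
cudaEventSynchronize(stop);                 // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);     // milliseconds between markers
cout << endl << "colFix: " << ms * 1000.0f << " us";

cudaEventDestroy(start);
cudaEventDestroy(stop);
```

Alternatively, calling `cudaDeviceSynchronize()` before reading the host clock would at least make the wall-clock numbers meaningful, at the cost of serializing the host with the device.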