Hi all,

I’m working on a CUDA program that calculates QR decomposition of m by n matrix. Here’s the kernel/helper functions

```
// Kernel Function for Q matrix calculation
__global__ void subQ(float* A, float* Q, unsigned int* track, int m, int n, float* debug){
register unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
register unsigned int j = blockDim.y * blockIdx.y + threadIdx.y;
register unsigned int k, l;
if (i < m && j < n){
// Base case for the first column
if (j == 0) {
Q[i * n] = A[i * n] / magn(A, 0, (m-1)*n, n);
}
// Algorithm for all subsequent columns
else {
// First assign the original value from input matrix into Q
Q[i * n + j] = A[i * n + j];
// Iterates through each previous columns and subtract the corresponding values
// Since Wx = Vx - <Vx,Q1>Q1 - <Vx,Q2>Q2... - <Vx,Q(x-1)>Q(x-1)
for (k=0; k < j; ++k){
// Loop that utilizes the track matrix to wait until all previous calculations are complete
for (l=0; l < m; ++l){
while (track[l * n + k] != 2){
continue;}
}
if (j==3 && k==1)
debug[i] = Q[i * n + k];
// Perform appropriate <Vx,Qy>Qy calculation and subtract it from Wx.
Q[i * n + j] -= (dotProduct(A, Q, j, (m-1)*n+j, k, (m-1)*n+k, n) * Q[i * n + k]);
}
// Change the track number to -1 to indicate that the number has finished Wx calculations
track[i * n + j] = 1;
// Loop that waits until all numbers in the same column has completed Wx calculations
for (l=0; l<m; ++l){
while (track[l * n + j] != 1){
continue;}
}
// Final calculation for Qx, since Qx = Wx / ||Wx||
Q[i * n + j] /= magn(Q, j, (m-1)*n+j, n);
}
// Change track number to 2 to indicate that the calculations are complete
track[i * n + j] = 2;
}
}
// Helper function for dot product
__device__ float dotProduct(float* A, float* B, unsigned int a_beg, unsigned int a_end, unsigned int b_beg, unsigned int b_end, unsigned int indent){
register float temp2 = 0;
register unsigned int a = a_beg;
register unsigned int b = b_beg;
for (;a<=a_end && b <= b_end;){
temp2 += A[a] * B[b];
a += indent;
b += indent;
}
return temp2;
}
// Helper function for magnitude calculations
__device__ float magn(float* A, unsigned int beg, unsigned int end, unsigned int indent){
register float temp3 = 0;
for (unsigned int b=beg; b<=end; b += indent)
temp3 += A[b] * A[b];
return sqrt(temp3);
}
```

Since each column requires all previous columns to finish their Q calculations, I use a tracking matrix that will change the number to 1 once it has finished W calculation, then to 2 once it has finished Q calculation (Q = W / ||W||).

I tested the code using an example given on wikipedia, where the input matrix is

12, -15, 4

6, 167, -68

-4, 24, -41

the resulting Q matrix should be

6/7 -69/175 -58/175

3/7 158/175 6/175

-2/7 6/35 -33/35

The first two columns of my result match the correct answer, but the third column does not. After some debugging, I found out that the third column is using values -69, 158, and 30 from second column instead of the correct values, so it’s using the values before they were divided by the magnitude of the column, which doesn’t make sense because the track matrix already marked through index as ‘2’, meaning that it has finished its calculations.

Any help :(?