Hello,
I have a CUDA function that uses CUBLAS. The function is as follows:
[codebox]
void householder_reduction_gpu(int m, int n, float *cuda_mat, float *d, float *B)
{
int i1 = 0;
float s, f;
s = f = 0.0f;
for (int i = 0; i < n; ++i) {
i1 = i + 1;
// Use CUBLAS to calculate the square of the norm for the column
s = cublasSdot(m-i1, &cuda_mat[IDX2C(i1, i, m)], 1, &cuda_mat[IDX2C(i1, i, m)], 1);
// Skip transformation
if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&d[i], &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost)); }
else {
CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost));
s += f*f;
d[i] = (f < 0) ? sqrt(s): -sqrt(s);
s = f*d[i]-s;
f = f-d[i];
CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i, m)], &f, sizeof(float), cudaMemcpyHostToDevice));
float scale_fac = 0.0f;
for (int j = i1; j < n; ++j) {
// Use CUBLAS to calculate the scale factor
scale_fac = cublasSdot(m-i, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);
scale_fac /= s;
cublasSaxpy(m-i, scale_fac, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);
}
}
if (i < (n-1)) {
s = cublasSdot(n-i1-1, &cuda_mat[IDX2C(i, i1+1, m)], m, &cuda_mat[IDX2C(i, i1+1, m)], m);
if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&b[i], &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost)); }
else {
CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost));
s += f*f;
b[i] = (f < 0) ? sqrt(s): -sqrt(s);
s = f*b[i]-s;
f = f-b[i];
CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i1, m)], &f, sizeof(float), cudaMemcpyHostToDevice));
float *cpu_data = new float[m-i1];
float scale_fac = 0.0f;
for (int j = i1; j < m; ++j) {
scale_fac = cublasSdot(n-i1, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);
scale_fac /= s;
cublasSaxpy(n-i1, scale_fac, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);
}
// int *dummy = 0;
// CUDA_SAFE_CALL(cudaMalloc((void**)&dummy, sizeof(int)));
// CUDA_SAFE_CALL(cudaMemcpy(dummy, &i, sizeof(int), cudaMemcpyHostToDevice));
// cudaFree(dummy);
}
}
}
}
[/codebox]
The crazy bit is when I uncomment the dead code (with the dummy variable), the program gives completely different answers! However, this bit of the code is dead and there is no reason why it should effect the answers. I am at a complete loss on how to explain this behavior.
Any ideas or suggestions as to what might be happening is really appreciated. I have tried a lot of things.
It seems that with the dead bit of the code in there the scale_fac values that are generated are different than when this bit of the code is not there. It starts with small differences in least significant decimal places and then it completely overpowers.
Many thanks,
Luca