Real crazy CUBLAS problem! Ok, this beats all explanations!

Hello,

I have a CUDA function that uses CUBLAS. The function is as follows:

[codebox]

void householder_reduction_gpu(int m, int n, float *cuda_mat, float *d, float *B)

{

int i1 = 0;

float s, f;

s = f = 0.0f;

for (int i = 0; i < n; ++i) {

    i1 = i + 1;

    // Use CUBLAS to calculate the square of the norm for the column

    s = cublasSdot(m-i1, &cuda_mat[IDX2C(i1, i, m)], 1, &cuda_mat[IDX2C(i1, i, m)], 1);

    // Skip transformation

    if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&d[i], &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

    else {

        CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost));

        s += f*f;

        d[i] = (f < 0) ? sqrt(s): -sqrt(s);

        s = f*d[i]-s;

        f = f-d[i];

        CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

        float scale_fac = 0.0f;

        for (int j = i1; j < n; ++j) {

            // Use CUBLAS to calculate the scale factor

            scale_fac = cublasSdot(m-i, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

            scale_fac /= s;

            cublasSaxpy(m-i, scale_fac, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

        }

    }

if (i < (n-1)) {

        s = cublasSdot(n-i1-1, &cuda_mat[IDX2C(i, i1+1, m)], m, &cuda_mat[IDX2C(i, i1+1, m)], m);

        if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&b[i], &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

        else {

            CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost));

            s += f*f;

            b[i] = (f < 0) ? sqrt(s): -sqrt(s);

            s = f*b[i]-s;

            f = f-b[i];

            CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i1, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

            float *cpu_data = new float[m-i1];

            float scale_fac = 0.0f;

            for (int j = i1; j < m; ++j) {

                scale_fac = cublasSdot(n-i1, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

                scale_fac /= s;

                cublasSaxpy(n-i1, scale_fac, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

            }

// int *dummy = 0;

// CUDA_SAFE_CALL(cudaMalloc((void**)&dummy, sizeof(int)));

// CUDA_SAFE_CALL(cudaMemcpy(dummy, &i, sizeof(int), cudaMemcpyHostToDevice));

// cudaFree(dummy);

        }

    }

}

}

[/codebox]

The crazy bit is when I uncomment the dead code (with the dummy variable), the program gives completely different answers! However, this bit of the code is dead and there is no reason why it should effect the answers. I am at a complete loss on how to explain this behavior.

Any ideas or suggestions as to what might be happening is really appreciated. I have tried a lot of things.

It seems that with the dead bit of the code in there the scale_fac values that are generated are different than when this bit of the code is not there. It starts with small differences in least significant decimal places and then it completely overpowers.

Many thanks,

Luca

I did a few more experiments and it seems that this behavior only happens when I compile in release mode. So, I am guessing that the compiler is performing some optimizations that are responsible for this strange behavior.

The answers are at least consistent in the debug mode.

How is that dead code?
You’re copying whatever happens to be in dummy (which is allocated but not set on device, so garbage) to &i, which you later use when computing scale_fac;

No! I am copying from ‘i’ to ‘dummy’. I am copying from host to device and the ‘dummy’ var is not used anywhere else.

Yep, you’re right.

Remember that CUDA_SAFE_CALLs are not doing anything in release mode, so you’re not checking return values. You might want to start doing that manually.