Real crazy CUBLAS problem! Ok, this beats all explanations!

xargon · June 5, 2010, 3:49pm

Hello,

I have a CUDA function that uses CUBLAS. The function is as follows:

[codebox]

void householder_reduction_gpu(int m, int n, float *cuda_mat, float *d, float *B)

{

int i1 = 0;

float s, f;

s = f = 0.0f;

for (int i = 0; i < n; ++i) {

    i1 = i + 1;

    // Use CUBLAS to calculate the square of the norm for the column

    s = cublasSdot(m-i1, &cuda_mat[IDX2C(i1, i, m)], 1, &cuda_mat[IDX2C(i1, i, m)], 1);

    // Skip transformation

    if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&d[i], &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

    else {

        CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost));

        s += f*f;

        d[i] = (f < 0) ? sqrt(s): -sqrt(s);

        s = f*d[i]-s;

        f = f-d[i];

        CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

        float scale_fac = 0.0f;

        for (int j = i1; j < n; ++j) {

            // Use CUBLAS to calculate the scale factor

            scale_fac = cublasSdot(m-i, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

            scale_fac /= s;

            cublasSaxpy(m-i, scale_fac, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

        }

    }

if (i < (n-1)) {

        s = cublasSdot(n-i1-1, &cuda_mat[IDX2C(i, i1+1, m)], m, &cuda_mat[IDX2C(i, i1+1, m)], m);

        if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&b[i], &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

        else {

            CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost));

            s += f*f;

            b[i] = (f < 0) ? sqrt(s): -sqrt(s);

            s = f*b[i]-s;

            f = f-b[i];

            CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i1, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

            float *cpu_data = new float[m-i1];

            float scale_fac = 0.0f;

            for (int j = i1; j < m; ++j) {

                scale_fac = cublasSdot(n-i1, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

                scale_fac /= s;

                cublasSaxpy(n-i1, scale_fac, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

            }

// int *dummy = 0;

// CUDA_SAFE_CALL(cudaMalloc((void**)&dummy, sizeof(int)));

// CUDA_SAFE_CALL(cudaMemcpy(dummy, &i, sizeof(int), cudaMemcpyHostToDevice));

// cudaFree(dummy);

}

[/codebox]

The crazy bit is when I uncomment the dead code (with the dummy variable), the program gives completely different answers! However, this bit of the code is dead and there is no reason why it should effect the answers. I am at a complete loss on how to explain this behavior.

Any ideas or suggestions as to what might be happening is really appreciated. I have tried a lot of things.

It seems that with the dead bit of the code in there the scale_fac values that are generated are different than when this bit of the code is not there. It starts with small differences in least significant decimal places and then it completely overpowers.

Many thanks,

Luca

xargon · June 5, 2010, 5:04pm

I did a few more experiments and it seems that this behavior only happens when I compile in release mode. So, I am guessing that the compiler is performing some optimizations that are responsible for this strange behavior.

The answers are at least consistent in the debug mode.

Hello,

I have a CUDA function that uses CUBLAS. The function is as follows:

[codebox]

void householder_reduction_gpu(int m, int n, float *cuda_mat, float *d, float *B)

{
int i1 = 0;

float s, f;

s = f = 0.0f;

for (int i = 0; i < n; ++i) {

    i1 = i + 1;

    // Use CUBLAS to calculate the square of the norm for the column

    s = cublasSdot(m-i1, &cuda_mat[IDX2C(i1, i, m)], 1, &cuda_mat[IDX2C(i1, i, m)], 1);

    // Skip transformation

    if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&d[i], &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

    else {

        CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i, m)], sizeof(float), cudaMemcpyDeviceToHost));

        s += f*f;

        d[i] = (f < 0) ? sqrt(s): -sqrt(s);

        s = f*d[i]-s;

        f = f-d[i];

        CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

        float scale_fac = 0.0f;

        for (int j = i1; j < n; ++j) {

            // Use CUBLAS to calculate the scale factor

            scale_fac = cublasSdot(m-i, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

            scale_fac /= s;

            cublasSaxpy(m-i, scale_fac, &cuda_mat[IDX2C(i, i, m)], 1, &cuda_mat[IDX2C(i, j, m)], 1);

        }

    }
if (i < (n-1)) {
        s = cublasSdot(n-i1-1, &cuda_mat[IDX2C(i, i1+1, m)], m, &cuda_mat[IDX2C(i, i1+1, m)], m);

        if (fabs(s) < MATHS::SVD_TOL) { CUDA_SAFE_CALL(cudaMemcpy(&b[i], &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost)); }

        else {

            CUDA_SAFE_CALL(cudaMemcpy(&f, &cuda_mat[IDX2C(i, i1, m)], sizeof(float), cudaMemcpyDeviceToHost));

            s += f*f;

            b[i] = (f < 0) ? sqrt(s): -sqrt(s);

            s = f*b[i]-s;

            f = f-b[i];

            CUDA_SAFE_CALL(cudaMemcpy(&cuda_mat[IDX2C(i, i1, m)], &f, sizeof(float), cudaMemcpyHostToDevice));

            float *cpu_data = new float[m-i1];

            float scale_fac = 0.0f;

            for (int j = i1; j < m; ++j) {

                scale_fac = cublasSdot(n-i1, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

                scale_fac /= s;

                cublasSaxpy(n-i1, scale_fac, &cuda_mat[IDX2C(i, i1, m)], m, &cuda_mat[IDX2C(j, i1, m)], m);

            }
// int *dummy = 0;

// CUDA_SAFE_CALL(cudaMalloc((void**)&dummy, sizeof(int)));

// CUDA_SAFE_CALL(cudaMemcpy(dummy, &i, sizeof(int), cudaMemcpyHostToDevice));

// cudaFree(dummy);
        }

    }

}
}

[/codebox]

The crazy bit is when I uncomment the dead code (with the dummy variable), the program gives completely different answers! However, this bit of the code is dead and there is no reason why it should effect the answers. I am at a complete loss on how to explain this behavior.

Any ideas or suggestions as to what might be happening is really appreciated. I have tried a lot of things.

It seems that with the dead bit of the code in there the scale_fac values that are generated are different than when this bit of the code is not there. It starts with small differences in least significant decimal places and then it completely overpowers.

Many thanks,

Luca

_Big_Mac · June 5, 2010, 7:56pm

How is that dead code?
You’re copying whatever happens to be in dummy (which is allocated but not set on device, so garbage) to &i, which you later use when computing scale_fac;

xargon · June 6, 2010, 12:01am

No! I am copying from ‘i’ to ‘dummy’. I am copying from host to device and the ‘dummy’ var is not used anywhere else.

_Big_Mac · June 6, 2010, 9:24am

Yep, you’re right.

Remember that CUDA_SAFE_CALLs are not doing anything in release mode, so you’re not checking return values. You might want to start doing that manually.

Topic		Replies	Views
CUBLAS vs GOTOBLAS2 CUDA Programming and Performance	20	10698	September 15, 2010
cuBLAS call from kernel in CUDA 10.0 GPU-Accelerated Libraries	9	4864	April 7, 2021
Program hit cudaErrorInvalidValue (error 1) due to "invalid argument" on CUDA API call to cudaMemsetAsync CUDA Programming and Performance	7	7802	January 11, 2020
Is it correct that my Pascal card is calling Maxwell_gemm kernels through cublas? And if so, why is cublas unusably slow for me? CUDA Programming and Performance	6	948	August 23, 2018
cublasZgemm() gives false result for large data and potential bug GPU-Accelerated Libraries	6	1157	October 12, 2021
CUBLAS + Kernels + double precision + "for loop" = strange behavior CUDA Programming and Performance	17	4331	September 25, 2009
Logarithmic fold operation behaving strangely CUDA Programming and Performance	1	892	October 15, 2010
Issue when calling cublasDdot from within kernel GPU-Accelerated Libraries	7	935	March 21, 2018
Strange "unspecified launch error" from a call to cublas gemm CUDA Programming and Performance	23	2752	January 19, 2019
Error parameter number 10 cublasSgemmBatched GPU-Accelerated Libraries	4	1787	June 23, 2016

Real crazy CUBLAS problem! Ok, this beats all explanations!

Related topics