gemm returns NaN in cuBLAS and cuSPARSE when ssse3 intel intrinsics are used

Happens on Fedora 20, Ubuntu 15.04, 14.04, 12.04 using GTX 970, GTX 1080, GTX TITAN X and TESLA K20x.
Tested on CUDA Toolkit 7.5 and 8.0 RC.

Please compile using:

nvcc cublas_test.cc -o cublas_code -L/usr/local/cuda/lib64 -lcublas --compiler-options -mssse3
nvcc cusparse_test.cc -o cusparse_code -L/usr/local/cuda/lib64 -lcusparse --compiler-options -mssse3

cublas_test.cc:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>

using std::vector;

void allocate(void ** ptr, int len) {
    cudaError_t ret = cudaMalloc(ptr, len);
    if (ret != cudaSuccess) {
        printf("error in allocate\n");
        exit(1);
    }
}

void break_cublas() {
    __m64 a = (__m64)0LL;
    __m64 b = (__m64)0LL;
    __m64 c = _mm_hadd_pi32(a, b);
//     __m64 c = _mm_shuffle_pi8(a, b);
}

void test(cublasHandle_t & cublasHandle) {

    for (int i = 0; i < 3; i++) {

        printf("------- %d ------- \n", i);
        if (i > 0)
            break_cublas();

        cudaError_t res;
        cublasStatus_t cublasStatus;

        int m = 8;
        int n = 8;
        int k = 1;

        double * A_val;
        allocate((void**)&A_val, m * n * sizeof(double));
        cudaMemset(A_val, 0, sizeof(double) * m * n);

        double * B_val;
        allocate((void**)&B_val, sizeof(double) * n * k);
        cudaMemset(B_val, 0, sizeof(double) * n * k);

        double * C_val;
        allocate((void**)&C_val, sizeof(double) * m * k);
        cudaMemset(C_val, 0, sizeof(double) * n * k);

        vector<double> r(m * k);
        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(before multiply) C = ");
        for (int i = 0; i < r.size(); i++)
            printf("%f ", r[i]);
        printf("\n");

        double alpha = 1.0;
        double beta = 0.0;

        cublasStatus = cublasDgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, k, n,
                                   &alpha, A_val, m, B_val, n, &beta, C_val, m);

        if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
            printf("error in multiply\n");
            exit(1);
        }

        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(after multiply) C = ");
        for (int i = 0; i < r.size(); i++)
            printf("%f ", r[i]);
        printf("\n\n");

        cudaFree(A_val);
        cudaFree(B_val);
        cudaFree(C_val);
    }
}

int main() {
    cublasHandle_t cublasHandle;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
        printf("error cublas create\n");
        exit(1);
    }

    test(cublasHandle);

    cublasDestroy(cublasHandle);

    return 0;
}

cusparse_test.cc:

#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <stdio.h>
#include <cstdlib>
#include <vector>
#include <tmmintrin.h>

using std::vector;

void allocate(void ** ptr, int len) {
    cudaError_t ret = cudaMalloc(ptr, len);
    if (ret != cudaSuccess) {
        printf("error in allocate\n");
        exit(1);
    }
}

void break_cusparse() {
    __m64 a = (__m64)0LL;
    __m64 b = (__m64)0LL;
    __m64 c = _mm_hadd_pi32(a, b);
}

void test(cusparseHandle_t & cusparseHandle, cusparseMatDescr_t & descr) {

    for (int i = 0; i < 3; i++) {

        printf("-------- %d ---------- \n", i);
        if (i > 0)
            break_cusparse();

        cudaError_t res;

        cusparseStatus_t cusparseStatus;

        int m = 8;
        int n = 8;
        int k = 1;

        double * A_val;
        allocate((void**)&A_val, 0 * sizeof(double));

        int * A_col;
        allocate((void**)&A_col, 0);

        int * A_row;
        allocate((void**)&A_row, sizeof(int) * (m + 1));
        cudaMemset(A_row, 0, sizeof(int) * (m + 1));

        int A_nnz = 0;
        double alpha = 1.0;
        double beta = 0.0;

        double * B_val;
        allocate((void**)&B_val, sizeof(double) * n * k);
        cudaMemset(B_val, 7, sizeof(double) * n * k);

        double * C_val;
        allocate((void**)&C_val, sizeof(double) * m * k);
        cudaMemset(C_val, 0, sizeof(double) * n * k);

        vector<double> r(m * k);
        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(before multiply) C = ");
        for (int i = 0; i < r.size(); i++)
            printf("%f ", r[i]);
        printf("\n");
        cusparseStatus = cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                        m, k, n, A_nnz, &alpha, descr, A_val, A_row, A_col,
                                        B_val, n, &beta, C_val, m);

        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            printf("error multiply\n");
            exit(1);
        }

        cudaMemcpy(r.data(), C_val, sizeof(double) * m * k, cudaMemcpyDeviceToHost);
        printf("(after multiply) C = ");
        for (int i = 0; i < r.size(); i++)
            printf("%f ", r[i]);
        printf("\n\n");

        cudaFree(A_val);
        cudaFree(A_row);
        cudaFree(A_col);
        cudaFree(B_val);
        cudaFree(C_val);
    }
}

int main() {
    cusparseHandle_t cusparseHandle;
    cusparseMatDescr_t descr;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparse create\n");
        exit(1);
    }

    cusparseStatus = cusparseCreateMatDescr(&descr);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        printf("error cusparseCreateMatDescr\n");
        exit(1);
    }
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    test(cusparseHandle, descr);

    cusparseDestroy(cusparseHandle);
    cusparseDestroyMatDescr(descr);

    return 0;
}

On 64-bit Windows, __m64 is not supported, so I changed the break_cublas() function as follows:

void break_cublas() {
    __m128i a;
    __m128i b;
    __m128i c;
    memset (&a, 0, sizeof(a));
    memset (&b, 0, sizeof(b));
    c = _mm_hadd_epi32(a, b);
    c = _mm_shuffle_epi8(a, b);
}

With that change and compiled with CUDA 7.5, the app runs normally, printing a bunch of zeros at the end:

------- 0 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

------- 1 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

------- 2 -------
(before multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
(after multiply) C = 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000

Based on
https://msdn.microsoft.com/en-us/library/08x3t697.aspx

__m64 should be accessible in 64-bits Windows too.

Just need to include xmmintrin.h instead.

I tried building Microsoft’s _mm_hadd_pi32() example code (https://msdn.microsoft.com/en-us/library/bb514013(v=vs.120).aspx) using standalone MSVC, without any CUDA toolchain involvement, and could not get it to build successfully. Googling for an explanation I came across information that says __m64 is not supported on x64 platforms. That’s all I know at this point. BTW, the Microsoft page you pointed me to says as much: “The __m64 data type is not supported on x64 processors.”

It would be interesting to see what happens on your Linux platform if you exchange your version of break_cublas() with mine. I have no idea how use of SSE intrinsics could possibly break CUBLAS; the only thing I could imagine is that it breaks something in the host code, causing the data returned by the GPU to be incorrectly displayed as NaNs.

Adding your changes to my code fixes the issue (no NaN).

As far as I recall (vague memories) __m64 data is handled by the old style x87/MMX units, not by the SSE units. And after using the MMX units, an FPU context switch is required by calling _mm_empty(), otherwise subsequent host-side floating-point processing can/will go off the rails.

What happens if you add a call to _mm_empty() at the end of your original break_cublas() function?

Adding _mm_empty() to the end of break_cublas and break_cusparse functions fixed the issue.

Thanks.

nice work njuffa