sdot CUDA vs cuBLAS result differences

I wrote a simple application that performs sdot on two large vectors.

If both vectors are all ones, the results match. If one vector is all ones and the other is all negative ones, the results also match. But if one vector is all ones and the other is half negative ones and half ones, the results don't match. Does anyone know why? (A host-side reference check is sketched after the listing below.)

#include <cassert>
#include <cstdio>
#include <vector>

#include <cooperative_groups.h>
#include <cub/cub.cuh>

#include <cublas_v2.h>

using data_type = float;

constexpr size_t num {1<<24};
constexpr size_t size_bytes { num * sizeof( data_type ) };
constexpr size_t loops { 1 };

__managed__ data_type result {};
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void sdot( const data_type *__restrict__ d_a, const data_type *__restrict__ d_b ) {

    auto block = cooperative_groups::this_thread_block( );

    typedef cub::BlockLoad<data_type, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    typedef cub::BlockReduce<data_type, BLOCK_THREADS, cub::BLOCK_REDUCE_WARP_REDUCTIONS>              BlockReduce;

    __shared__ union TempStorage {
        typename BlockLoad::TempStorage   load;
        typename BlockReduce::TempStorage reduce;

    } temp_storage;

    float a[ITEMS_PER_THREAD] {};
    float b[ITEMS_PER_THREAD] {};

    BlockLoad( temp_storage.load ).Load( d_a, a );
    block.sync( );

    BlockLoad( temp_storage.load ).Load( d_b, b );
    block.sync( );

    data_type sum {};

    for ( int i = 0; i < ITEMS_PER_THREAD; i++ )
        sum += a[i] * b[i];

    data_type aggregate = BlockReduce( temp_storage.reduce ).Sum( sum );
    block.sync( );

    if ( block.thread_rank( ) == 0 ) {
        atomicAdd( &result, aggregate );
    }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
void Test( const data_type *a, const data_type *b ) {

    cudaMemset( &result, 0, sizeof( data_type ) );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    void *args[] { &a, &b };

    const int blocksPerGrid { num  / BLOCK_THREADS / ITEMS_PER_THREAD };
    std::printf("num: %d, blocksPerGrid: %d\n", num, blocksPerGrid);

    cudaEventRecord( start );

    for ( int i = 0; i < loops; i++ ) {
        cudaLaunchKernel( reinterpret_cast<void *>( &sdot<BLOCK_THREADS, ITEMS_PER_THREAD> ),
                          blocksPerGrid,
                          BLOCK_THREADS,
                          args,
                          0,
                          0 );
        }
    cudaEventRecord( stop );
    cudaEventSynchronize( stop );

    float milliseconds {};
    cudaEventElapsedTime( &milliseconds, start, stop );

    std::printf( "R: %f |B: %d | T: %d | I: %d @ %f\n", result, blocksPerGrid, BLOCK_THREADS, ITEMS_PER_THREAD, ( milliseconds / loops ) );
}

int main( ) {

    data_type *a {};
    data_type *b {};

    cudaEvent_t start1, stop1;
    cublasHandle_t cublasHandle = 0;
    float milliseconds {};

    cudaMallocManaged( &a, size_bytes );
    cudaMallocManaged( &b, size_bytes );

    for ( int i = 0; i < num/2; i++ ) {
        a[i] = 1;
        b[i] = -1;
    }

    for ( int i = num/2; i < num; i++ ) {
        a[i] = 1;
        b[i] = 1;
    }

	// Copy data to GPU
    cudaMemPrefetchAsync( a, num, 0 );
    cudaMemPrefetchAsync( b, num, 0 );

	// Warm up kernel
    Test<32, 2>( a, b );

    std::printf( "---- Disregard warm-up above ----- \n" );

    std::printf( "---------------------------------- \n" );

    Test<512, 2>( a, b );

    std::printf( "--------------CUBLAS-------------- \n" );
    cublasCreate(&cublasHandle);

    cudaEventCreate( &start1 );
    cudaEventCreate( &stop1 );

    cudaEventRecord( start1 );
    for ( int i = 0; i < loops; i++ ) {
        cublasSdot(cublasHandle, num , a, 1, b, 1, &result);
    }   
    cudaEventRecord( stop1 );
    cudaEventSynchronize( stop1 );

    cudaEventElapsedTime( &milliseconds, start1, stop1 );
    std::printf( "R: %f @ %f | num*loops: %lu\n", result, (milliseconds/loops), (num*loops) );

    cublasDestroy(cublasHandle);
}
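
For reference, the expected value can be computed on the host with a plain loop. This is only a minimal sketch (not part of the program above) that uses the same initialization, so the expected dot product is 0:

#include <cstdio>
#include <vector>

int main( ) {
    constexpr size_t num {1 << 24};
    std::vector<float> a( num ), b( num );

    // Same fill pattern as the CUDA program: first half a=1, b=-1; second half a=1, b=1.
    for ( size_t i = 0; i < num / 2; i++ ) { a[i] = 1.0f; b[i] = -1.0f; }
    for ( size_t i = num / 2; i < num; i++ ) { a[i] = 1.0f; b[i] = 1.0f; }

    // Accumulate in double so round-off does not obscure the comparison.
    double ref {};
    for ( size_t i = 0; i < num; i++ )
        ref += static_cast<double>( a[i] ) * b[i];

    std::printf( "host reference: %f\n", ref ); // expected: 0.000000
    return 0;
}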

Hi,

Thanks for reporting this to us.

We tried to reproduce this issue in our environment but ran into some errors related to cub.cuh.
Would you mind telling us how to reproduce this issue on Xavier? Are you using JetPack 4.3?

Thanks.

Hi,

I ran the code as provided above and saw the error in the printed result. I am using JetPack 4.2.2. I downloaded cub separately and added it to the include directories in my Makefile.

Thanks.

Hi,

Anything more on this?

Hi,

Would you mind sharing more detailed steps for reproducing this issue?

For example, the complete source with the Makefile and the link to the cub version you are using.

Thanks.

Hi,

The source code block I provided above resides in a file called sdot.cu.

This is the Makefile:

NVCC	:=nvcc --cudart=static -ccbin g++
CFLAGS	:=-O3 -std=c++11

INC_DIR	:=-I/usr/local/cuda/samples/common/inc
LIB_DIR	:=
LIBS	:=-lcublas

ARCHES :=-gencode arch=compute_75,code=\"compute_75,sm_75\" \
	-gencode arch=compute_70,code=\"compute_70,sm_70\"

SOURCES :=sdot

all: $(SOURCES)
.PHONY: all

sdot: sdot.cu
	$(NVCC) $(CFLAGS) $(INC_DIR) $(LIB_DIR) ${ARCHES} $^ -o $@ $(LIBS)

clean:
	rm -f $(SOURCES)

This is the cub link: https://nvlabs.github.io/cub/index.html

Hi,

The problem is that the per-block offset was not passed to the block loads, so every block loaded the first BLOCK_THREADS * ITEMS_PER_THREAD elements of both vectors instead of its own tile. With num / (BLOCK_THREADS * ITEMS_PER_THREAD) blocks each contributing the same partial sum, the uniform test cases still happen to produce the correct totals (num and -num), but the half-and-half case returns -num instead of 0. The following code includes the fix.

#include <cassert>
#include <cstdio>
#include <vector>

#include <cooperative_groups.h>
#include <cub/cub.cuh>

#include <cublas_v2.h>

using data_type = float;

constexpr size_t num {1<<24};
constexpr size_t size_bytes { num * sizeof( data_type ) };
constexpr size_t loops { 1 };

__managed__ data_type result {};
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void sdot( const data_type *__restrict__ d_a, const data_type *__restrict__ d_b ) {

    auto block = cooperative_groups::this_thread_block( );
    // Start of this block's tile in the input vectors (this offset was missing in the original kernel).
    const int block_offset = blockIdx.x * blockDim.x * ITEMS_PER_THREAD;

    typedef cub::BlockLoad<data_type, BLOCK_THREADS, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    typedef cub::BlockReduce<data_type, BLOCK_THREADS, cub::BLOCK_REDUCE_WARP_REDUCTIONS>              BlockReduce;

    __shared__ union TempStorage {
        typename BlockLoad::TempStorage   load;
        typename BlockReduce::TempStorage reduce;

    } temp_storage;

    float a[ITEMS_PER_THREAD] {};
    float b[ITEMS_PER_THREAD] {};

    BlockLoad( temp_storage.load ).Load( d_a + block_offset, a );
    block.sync( );

    BlockLoad( temp_storage.load ).Load( d_b + block_offset, b );
    block.sync( );

    data_type sum {};

    for ( int i = 0; i < ITEMS_PER_THREAD; i++ )
        sum += a[i] * b[i];

    data_type aggregate = BlockReduce( temp_storage.reduce ).Sum( sum );
    block.sync( );

    if ( block.thread_rank( ) == 0 ) {
        atomicAdd( &result, aggregate );
    }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
void Test( const data_type *a, const data_type *b ) {

    cudaMemset( &result, 0, sizeof( data_type ) );

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    void *args[] { &a, &b };

    const int blocksPerGrid { num  / BLOCK_THREADS / ITEMS_PER_THREAD };
    std::printf("num: %d, blocksPerGrid: %d\n", num, blocksPerGrid);

    cudaEventRecord( start );

    for ( int i = 0; i < loops; i++ ) {
        cudaLaunchKernel( reinterpret_cast<void *>( &sdot<BLOCK_THREADS, ITEMS_PER_THREAD> ),
                          blocksPerGrid,
                          BLOCK_THREADS,
                          args,
                          0,
                          0 );
        }
    cudaEventRecord( stop );
    cudaEventSynchronize( stop );

    float milliseconds {};
    cudaEventElapsedTime( &milliseconds, start, stop );

    std::printf( "R: %f |B: %d | T: %d | I: %d @ %f\n", result, blocksPerGrid, BLOCK_THREADS, ITEMS_PER_THREAD, ( milliseconds / loops ) );
}

int main( ) {

    data_type *a {};
    data_type *b {};

    cudaEvent_t start1, stop1;
    cublasHandle_t cublasHandle = 0;
    float milliseconds {};

    cudaMallocManaged( &a, size_bytes );
    cudaMallocManaged( &b, size_bytes );

    for ( int i = 0; i < num/2; i++ ) {
        a[i] = 1;
        b[i] = -1;
    }

    for ( int i = num/2; i < num; i++ ) {
        a[i] = 1;
        b[i] = 1;
    }

	// Copy data to GPU
    cudaMemPrefetchAsync( a, num, 0 );
    cudaMemPrefetchAsync( b, num, 0 );

	// Warm up kernel
    Test<32, 2>( a, b );

    std::printf( "---- Disregard warm-up above ----- \n" );

    std::printf( "---------------------------------- \n" );

    Test<512, 2>( a, b );

    std::printf( "--------------CUBLAS-------------- \n" );
    cublasCreate(&cublasHandle);

    cudaEventCreate( &start1 );
    cudaEventCreate( &stop1 );

    cudaEventRecord( start1 );
    for ( int i = 0; i < loops; i++ ) {
        cublasSdot(cublasHandle, num , a, 1, b, 1, &result);
    }   
    cudaEventRecord( stop1 );
    cudaEventSynchronize( stop1 );

    cudaEventElapsedTime( &milliseconds, start1, stop1 );
    std::printf( "R: %f @ %f | num*loops: %lu\n", result, (milliseconds/loops), (num*loops) );

    cublasDestroy(cublasHandle);
}
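
As an extra sanity check, the same dot product can also be computed with Thrust and compared against the kernel and cublasSdot. This is only a sketch; it assumes a, b, and num as defined above and would go in main( ) after the vectors are initialized:

#include <thrust/execution_policy.h>
#include <thrust/inner_product.h>

    // Device-side reduction over the managed pointers; the result is returned to the host.
    float check = thrust::inner_product( thrust::device, a, a + num, b, 0.0f );
    std::printf( "thrust check: %f\n", check );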

sdot_sources.zip (1.82 KB)