Here is an example that uses tensor operations with complex half-precision data. There is a lot going on in the code, so please study it before jumping in. It provides a couple of knobs you can turn to prove to yourself that it is working correctly.
There are two scenarios that use tensor operations with complex half precision, listed as (Atype, Btype, Ctype, scaleType, computeType):
- CUDA_C_16F, CUDA_C_16F, CUDA_C_16F, CUDA_C_32F, CUDA_C_32F
- CUDA_C_16F, CUDA_C_16F, CUDA_C_32F, CUDA_C_32F, CUDA_C_32F
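In the code, these map onto the SCENARIO define: cudaTypeI is used for both A and B, cudaTypeO for C, and cudaTypeCom for the compute type; alpha and beta are thrust::complex<float> on the host in both scenarios. Scenario 0, for example, boils down to:

auto constexpr cudaTypeI   = CUDA_C_16F;  // A and B
auto constexpr cudaTypeO   = CUDA_C_16F;  // C
auto constexpr cudaTypeCom = CUDA_C_32F;  // compute type
typedef thrust::complex<float> dataTypeS; // alpha/beta (scale) type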
First, set IDENTITY 1 and PRINT 1. This creates two identity matrices as the inputs A and B, and the result should print a 16x16 identity matrix. You'll notice the values are printed in pairs, showing the real and imaginary parts of each element.
Next, keep IDENTITY 1 and set PRINT 0 to run the identity test across multiple square matrix sizes.
Lastly, set IDENTITY 0 and PRINT 0 to test multiple square and non-square matrices with randomly generated data. The three modes are summarized below.
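For quick reference, these are the knob combinations for the three modes (the defines appear near the top of the source):

#define IDENTITY 1
#define PRINT 1          // Mode 1: print the 16x16 identity result for visual inspection

#define IDENTITY 1
#define PRINT 0          // Mode 2: identity test across multiple square sizes

#define IDENTITY 0
#define PRINT 0          // Mode 3: random data, square and non-square sizes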
You should also notice another define, TIME_TRANSFORM. When matrices A and B are generated with random numbers, they are generated in interleaved layout, meaning the data is stored [real, imaginary, real, imaginary, ...]. To utilize Tensor Cores, the data must be in planar layout, meaning all the real parts are stored first, followed by all the imaginary parts: [real, real, real, ... (halfway), imaginary, imaginary, imaginary, ...].
With TIME_TRANSFORM 1, the measurement includes the time to transform A and B to planar layout, perform the matrix multiplication, and transform C back from planar to interleaved layout. With TIME_TRANSFORM 0, only the matrix multiplication is profiled.
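The transforms themselves use cublasLtMatrixTransform, and the key detail is CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET: the imaginary plane starts halfway through the buffer, so the offset in bytes is half the matrix's total size. A condensed sketch of the A-matrix path, pulled from the full source below:

// Offset (in bytes) of the imaginary plane: right after the real data.
size_t planarOffsetA = ( sizeA * sizeof(dataTypeI) ) / 2;

// Mark the transformed layout as planar by setting a nonzero plane offset.
checkCudaErrors( cublasLtMatrixLayoutSetAttribute(
    AtransformDesc, CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET,
    &planarOffsetA, sizeof( planarOffsetA ) ) );

// Convert A from interleaved (Adesc) to planar (AtransformDesc).
checkCudaErrors( cublasLtMatrixTransform(
    ltHandle, transformDescI, alpha, A, Adesc,
    beta, nullptr, nullptr, Atransform, AtransformDesc, stream ) );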
That should cover everything, and I hope it helps. The full source follows.
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws. Users and possessors of this source code
* are hereby granted a nonexclusive, royalty-free license to use this code
* in individual and commercial software.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*
* Any use of this source code in individual and commercial software must
* include, in the user documentation and internal comments to the code,
* the above Disclaimer and U.S. Government End Users Notice.
*/
/* This example demonstrates how to use the cuBLASLt library
 * to perform complex half-precision matrix multiplication on
 * Tensor Cores, using cublasLtMatrixTransform to convert the
 * matrices between interleaved and planar layouts.
 */
/* Includes, system */
#include <cstdio>
/* Includes, cuda */
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <thrust/complex.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include "helper_cuda.h"
#define PRINT 0          // Print matrices A, B, and C (use with small, square sizes)
#define IDENTITY 0       // Use identity matrices as inputs and verify the result
#define TIME_TRANSFORM 1 // Include the layout transforms in the timed loop
#define SCENARIO 0       // Select the type combination (see below)
auto constexpr kernelRepeats = 10;
auto constexpr threadsPerBlock = 1024;
#if SCENARIO == 0 // CUDA_C_16F, CUDA_C_16F, CUDA_C_16F, CUDA_C_32F, CUDA_C_32F
auto constexpr cudaTypeI = CUDA_C_16F;
typedef half2 dataTypeI;
auto constexpr cudaTypeO = CUDA_C_16F;
typedef half2 dataTypeO;
//auto constexpr cudaTypeS = CUDA_C_32F;
typedef thrust::complex<float> dataTypeS;
auto constexpr cudaTypeCom = CUDA_C_32F;
#elif SCENARIO == 1 // CUDA_C_16F, CUDA_C_16F, CUDA_C_32F, CUDA_C_32F, CUDA_C_32F
auto constexpr cudaTypeI = CUDA_C_16F;
typedef half2 dataTypeI;
auto constexpr cudaTypeO = CUDA_C_32F;
typedef thrust::complex<float> dataTypeO;
//auto constexpr cudaTypeS = CUDA_C_32F;
typedef thrust::complex<float> dataTypeS;
auto constexpr cudaTypeCom = CUDA_C_32F;
#endif
struct GenRand {
__device__
dataTypeI operator ()( int const & idx ) {
dataTypeI result;
thrust::default_random_engine randEng;
thrust::uniform_real_distribution<float> uniDist;
randEng.discard( idx );
result.x = __float2half( uniDist( randEng ) );
result.y = __float2half( uniDist( randEng ) );
return ( result );
}
};
struct setIdentity {
int const m;
setIdentity( int const & _m ) :
m( _m ) {
}
__device__
dataTypeI operator ()( int const & idx ) {
dataTypeI result;
result.x = __float2half( 0.0f );
result.y = __float2half( 0.0f );
int const diagIdx = ( m + 1 ); // Diagonal entries are every (m + 1) elements; each element is one complex (half2) value.
if ( idx % ( diagIdx ) == 0 ) result.x = __float2half( 1.0f );
return ( result );
}
};
template<typename Pointer>
__global__ void __launch_bounds__(threadsPerBlock) checkIdentity( int const n, int const m, dataTypeO const * d_C, Pointer d_p ) {
for ( int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; tid += blockDim.x * gridDim.x ) {
int const diagIdx = m + 1;
#if SCENARIO == 0
if ( tid % ( diagIdx ) == 0 ) { // If thread index is on the diagonal
if ( __hge( fabsf( d_C[tid].x - __float2half(1.0f) ), __float2half(1e-7f) ) ) *d_p = false; // abs( d_C - 1.0f ) > 1e-7
} else if ( __hge( d_C[tid].x, __float2half(1e-7f) ) ) *d_p = false;
#elif SCENARIO == 1
if ( tid % ( diagIdx ) == 0 ) { // If thread index is on the diagonal
if ( fabsf( d_C[tid].real( ) - 1.0f ) > 1e-7f ) *d_p = false;
} else if ( d_C[tid].real( ) > 1e-7f ) *d_p = false;
#endif
}
}
void LtSgemm(
cublasLtHandle_t ltHandle,
cublasOperation_t transa,
cublasOperation_t transb,
int const & m,
int const & n,
int const & k,
dataTypeS const *alpha,
int const & sizeA,
dataTypeI const *A,
int const & lda,
int const & sizeB,
dataTypeI const *B,
int const & ldb,
dataTypeS const *beta,
int const & sizeC,
dataTypeO *C,
int const & ldc,
void *workSpace,
size_t workSpaceSize ) {
// The planar offset (in bytes) points to the start of the imaginary plane, immediately after the real data
size_t planarOffsetA = ( sizeA * sizeof(dataTypeI) ) / 2;
size_t planarOffsetB = ( sizeB * sizeof(dataTypeI) ) / 2;
size_t planarOffsetC = ( sizeC * sizeof(dataTypeO) ) / 2;
cublasLtMatmulDesc_t operationDesc = nullptr;
cublasLtMatrixLayout_t Adesc = nullptr, Bdesc = nullptr, Cdesc = nullptr;
cublasLtMatmulPreference_t preference = nullptr;
dataTypeI * Atransform, *Btransform;
dataTypeO * Ctransform;
cublasLtMatrixTransformDesc_t transformDescI = nullptr, transformDescO = nullptr;
cublasLtMatrixLayout_t AtransformDesc = nullptr, BtransformDesc = nullptr, CtransformDesc = nullptr;
// Allocate memory for transformed matrix
checkCudaErrors( cudaMalloc( reinterpret_cast<void**>(&Atransform), sizeA * sizeof(dataTypeI) ) );
checkCudaErrors( cudaMalloc( reinterpret_cast<void**>(&Btransform), sizeB * sizeof(dataTypeI) ) );
checkCudaErrors( cudaMalloc( reinterpret_cast<void**>(&Ctransform), sizeC * sizeof(dataTypeO) ) );
// Create preference handle; In general, extra attributes can be
// used here to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C. However, for simplicity
// here we assume A,B,C are always well aligned (e.g., directly
// come from cudaMalloc)
checkCudaErrors( cublasLtMatmulPreferenceCreate( &preference ) );
checkCudaErrors(
cublasLtMatmulPreferenceSetAttribute( preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workSpaceSize, sizeof( workSpaceSize ) ) );
// Create operation descriptor; see cublasLtMatmulDescAttributes_t
// for details about defaults; here we just set the transforms for
// A and B.
checkCudaErrors( cublasLtMatmulDescCreate( &operationDesc, cudaTypeCom ) );
checkCudaErrors( cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof( transa ) ) );
checkCudaErrors( cublasLtMatmulDescSetAttribute( operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof( transb ) ) );
// Create matrix descriptors for interleaved data. Not setting any extra attributes.
checkCudaErrors( cublasLtMatrixLayoutCreate( &Adesc, cudaTypeI, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda ) );
checkCudaErrors( cublasLtMatrixLayoutCreate( &Bdesc, cudaTypeI, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb ) );
checkCudaErrors( cublasLtMatrixLayoutCreate( &Cdesc, cudaTypeO, m, n, ldc ) );
// Create transform descriptor to convert interleaved to planar
checkCudaErrors( cublasLtMatrixTransformDescCreate( &transformDescI, cudaTypeCom ) );
checkCudaErrors( cublasLtMatrixTransformDescCreate( &transformDescO, cudaTypeCom ) );
// Create matrix descriptors for planar data. Not setting any extra attributes;
// the dimensions match the interleaved descriptors above.
checkCudaErrors( cublasLtMatrixLayoutCreate( &AtransformDesc, cudaTypeI, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda ) );
checkCudaErrors( cublasLtMatrixLayoutCreate( &BtransformDesc, cudaTypeI, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb ) );
checkCudaErrors( cublasLtMatrixLayoutCreate( &CtransformDesc, cudaTypeO, m, n, ldc ) );
// Configure the inputs and output to use planar layout
checkCudaErrors(
cublasLtMatrixLayoutSetAttribute( AtransformDesc, CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET, &planarOffsetA, sizeof( planarOffsetA ) ) );
checkCudaErrors(
cublasLtMatrixLayoutSetAttribute( BtransformDesc, CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET, &planarOffsetB, sizeof( planarOffsetB ) ) );
checkCudaErrors(
cublasLtMatrixLayoutSetAttribute( CtransformDesc, CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET, &planarOffsetC, sizeof( planarOffsetC ) ) );
// Create CUDA events to time the execution
cudaEvent_t startEvent = nullptr, stopEvent = nullptr;
cudaStream_t stream = nullptr;
#if TIME_TRANSFORM == 0
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescI,
alpha,
A,
Adesc,
beta,
nullptr,
nullptr,
Atransform,
AtransformDesc,
stream ) );
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescI,
alpha,
B,
Bdesc,
beta,
nullptr,
nullptr,
Btransform,
BtransformDesc,
stream ) );
#endif
checkCudaErrors( cudaEventCreate( &startEvent, cudaEventBlockingSync ) );
checkCudaErrors( cudaEventCreate( &stopEvent, cudaEventBlockingSync ) );
checkCudaErrors( cudaEventRecord( startEvent, stream ) );
for ( int loop = 0; loop < kernelRepeats; loop++ ) {
#if TIME_TRANSFORM == 1
// Transform interleaved data to planar
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescI,
alpha,
A,
Adesc,
beta,
nullptr,
nullptr,
Atransform,
AtransformDesc,
stream ) );
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescI,
alpha,
B,
Bdesc,
beta,
nullptr,
nullptr,
Btransform,
BtransformDesc,
stream ) );
#endif
checkCudaErrors(
cublasLtMatmul(
ltHandle,
operationDesc,
alpha,
Atransform,
AtransformDesc,
Btransform,
BtransformDesc,
beta,
Ctransform,
CtransformDesc,
Ctransform,
CtransformDesc,
nullptr,
workSpace,
workSpaceSize,
stream ) );
#if TIME_TRANSFORM == 1
// Transform planar to interleaved data in output matrix
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescO,
alpha,
Ctransform,
CtransformDesc,
beta,
nullptr,
nullptr,
C,
Cdesc,
stream ) );
#endif
}
checkCudaErrors( cudaEventRecord( stopEvent, stream ) );
checkCudaErrors( cudaEventSynchronize( stopEvent ) );
float time;
checkCudaErrors( cudaEventElapsedTime( &time, startEvent, stopEvent ) );
#if TIME_TRANSFORM == 0
// Transform planar to interleaved data in output matrix
checkCudaErrors(
cublasLtMatrixTransform(
ltHandle,
transformDescO,
alpha,
Ctransform,
CtransformDesc,
beta,
nullptr,
nullptr,
C,
Cdesc,
stream ) );
#endif
double timeAvg = ( time * 1e-3 ) / kernelRepeats; // Convert to seconds, then divide by loops
double gflop = ( 8 * static_cast<unsigned long long int>( m ) * n * k ) * 1e-9; // A complex multiply-add is 8 real flops
printf(
#if IDENTITY
"%d %d %d %d %d %d %d %0.2f %0.0f ",
#else
"%d %d %d %d %d %d %d %0.2f %0.0f\n",
#endif
m,
n,
k,
cudaTypeI,
cudaTypeI,
cudaTypeO,
cudaTypeCom,
time,
gflop/timeAvg );
// Descriptors are no longer needed as all GPU work was already enqueued.
checkCudaErrors( cublasLtMatmulPreferenceDestroy( preference ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( Cdesc ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( Bdesc ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( Adesc ) );
checkCudaErrors( cublasLtMatmulDescDestroy( operationDesc ) );
checkCudaErrors( cudaFree ( Atransform ) );
checkCudaErrors( cudaFree ( Btransform ) );
checkCudaErrors( cudaFree ( Ctransform ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( AtransformDesc ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( BtransformDesc ) );
checkCudaErrors( cublasLtMatrixLayoutDestroy( CtransformDesc ) );
checkCudaErrors( cublasLtMatrixTransformDescDestroy( transformDescI ) );
checkCudaErrors( cublasLtMatrixTransformDescDestroy( transformDescO ) );
checkCudaErrors( cudaEventDestroy( startEvent ) );
checkCudaErrors( cudaEventDestroy( stopEvent ) );
}
void calculate( int const & m, int const & n, int const & k, int & count, int const & square ) {
dataTypeS alpha = 1.0f;
dataTypeS beta = 0.0f;
int lda = m, ldb = k, ldc = m;
void *d_workspace = nullptr;
size_t sizeA = m * k;
size_t sizeB = k * n;
size_t sizeC = m * n;
size_t workspace = 4096;
cublasLtHandle_t handle;
/* Initialize CUBLAS */
checkCudaErrors( cublasLtCreate( &handle ) );
/* Allocate device memory for workspace */
checkCudaErrors( cudaMalloc( (void **)&d_workspace, workspace) );
/* Allocate device memory for the matrices */
thrust::device_vector<dataTypeI> d_A( sizeA, __float2half2_rn(0.0f) );
thrust::device_vector<dataTypeI> d_B( sizeB, __float2half2_rn(0.0f) );
#if SCENARIO == 0
thrust::device_vector<dataTypeO> d_C( sizeC, __float2half2_rn(0.0f) );
#elif SCENARIO == 1
thrust::device_vector<dataTypeO> d_C( sizeC, 0.0f );
#endif
/* Retrieve raw pointer for device data */
dataTypeI * d_A_ptr = thrust::raw_pointer_cast( &d_A[0] );
dataTypeI * d_B_ptr = thrust::raw_pointer_cast( &d_B[0] );
dataTypeO * d_C_ptr = thrust::raw_pointer_cast( &d_C[0] );
#if IDENTITY
/* Generate identity matrix on device */
thrust::transform(
thrust::make_counting_iterator( 0 ),
thrust::make_counting_iterator( static_cast<int>( sizeA ) ),
d_A.begin( ),
setIdentity( m ) );
thrust::transform(
thrust::make_counting_iterator( 0 ),
thrust::make_counting_iterator( static_cast<int>( sizeB ) ),
d_B.begin( ),
setIdentity( m ) );
#else
/* Generate random data on device */
thrust::transform( thrust::make_counting_iterator( 0 ), thrust::make_counting_iterator( static_cast<int>( sizeA ) ), d_A.begin( ), GenRand( ) );
thrust::transform( thrust::make_counting_iterator( 0 ), thrust::make_counting_iterator( static_cast<int>( sizeB ) ), d_B.begin( ), GenRand( ) );
#endif
printf( "%d %d ", count, square );
count++;
LtSgemm(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,
n,
k,
&alpha,
sizeA,
d_A_ptr,
lda,
sizeB,
d_B_ptr,
ldb,
&beta,
sizeC,
d_C_ptr,
ldc,
d_workspace,
workspace );
#if IDENTITY
/* Generate device vector to hold flag */
thrust::device_vector<bool> d_p(1, true);
checkIdentity<<<sizeC/threadsPerBlock + 1, threadsPerBlock>>>( sizeC, m, d_C_ptr, d_p.data());
/* Copy device flag to host */
thrust::host_vector<bool> h_p = d_p;
#if PRINT
thrust::host_vector<dataTypeI> h_A = d_A;
thrust::host_vector<dataTypeI> h_B = d_B;
thrust::host_vector<dataTypeO> h_C = d_C;
printf("\n"); // Formatting stdout
for ( int a = 0; a < m; a++ ) { // A is m x k, column-major (lda = m)
for ( int b = 0; b < k; b++ )
printf( "{%0.1f %0.1f} ", __half2float(h_A[b * m + a].x), __half2float(h_A[b * m + a].y) );
printf("\n");
}
printf("\n");
for ( int a = 0; a < k; a++ ) { // B is k x n, column-major (ldb = k)
for ( int b = 0; b < n; b++ )
printf( "{%0.1f %0.1f} ", __half2float(h_B[b * k + a].x), __half2float(h_B[b * k + a].y) );
printf("\n");
}
printf("\n");
for ( int a = 0; a < m; a++ ) { // C is m x n, column-major (ldc = m)
for ( int b = 0; b < n; b++ )
#if SCENARIO == 0
printf( "{%0.1f %0.1f} ", __half2float(h_C[b * m + a].x), __half2float(h_C[b * m + a].y) );
#elif SCENARIO == 1
printf( "{%0.1f %0.1f} ", h_C[b * m + a].real(), h_C[b * m + a].imag() );
#endif
printf( "\n" );
}
printf( "\n" );
#endif
if ( h_p[0] ) printf( "Passed Identity Test\n" );
else printf( "Failed Identity Test\n" );
#endif
/* Destroy workspace */
checkCudaErrors( cudaFree (d_workspace) );
/* Shutdown */
checkCudaErrors( cublasLtDestroy( handle ) );
}
/* Main */
int main( int argc, char **argv ) {
int dev = findCudaDevice( argc, ( const char ** ) argv );
if ( dev == -1 ) throw std::runtime_error( "!!!! CUDA device not found" );
printf( "Run Square M N K A_Type B_Type C_Type Compute_Type Time GFLOPS\n" );
int count = 0;
int square = 1;
#if IDENTITY
// Identity for square matrices
#if PRINT
for ( int m = 16; m <= 16; m *= 2 )
#else
for ( int m = 16; m <= 8192; m *= 2 )
#endif
calculate( m, m, m, count, square );
printf( "\n" ); // For better readability stdout
#else
// Compute matrices
for ( int m = 512; m <= 8192; m *= 2 )
for ( int k = 1024; k <= 4096; k *= 2 )
calculate( m, m, k, count, square );
printf("\n");// For better readability stdout
count = 0;
square = 0;
// Compute non-square matrices
for ( int m = 4096; m <= 32768; m *= 2 )
for ( int n = 512; n <= 8192; n *= 2 )
for ( int k = 8; k <= 128; k *= 2 )
calculate( m, n, k, count, square );
printf("\n");// For better readability stdout
#endif
return ( EXIT_SUCCESS );
}