Concurrent contractions/qr in cutensornet

Hello,

I am currently implementing an algorithm were I am using cuTensorNet to contract and qr decompose some tensors. I have implemented the contraction in a way, that the contraction plans, allocation and everything else is done separately beforehand and then I basically only call cutensornetContractSlices on two different tensors with two different streams. However I have noticed while profiling, that the kernels for the contraction are still called sequentially. Here is a example, this is basically the cutensornet_example just with everything twice:

/*
 * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

// Sphinx: #1

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
   }

   ~GPUTimer()
   {
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
   }

   void start()
   {
      cudaEventRecord(start_, stream_);
   }

   float seconds()
   {
      cudaEventRecord(stop_, stream_);
      cudaEventSynchronize(stop_);
      float time;
      cudaEventElapsedTime(&time, start_, stop_);
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %ld\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("t:%d\n", prop.textureAlignment);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");

   // Sphinx: #2
   /**********************
   * Computing: R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}
   **********************/

   constexpr int32_t numInputs = 4;

   // Create vectors of tensor modes
   std::vector<int32_t> modesA{'a','b','c','d','e','f'};

   std::vector<int32_t> modesB{'b','g','h','e','i','j'};

   std::vector<int32_t> modesC{'m','a','g','f','i','k'};

   std::vector<int32_t> modesD{'l','c','h','d','j','m'};

   std::vector<int32_t> modesR{'k','l'};

   // Set mode extents
   std::unordered_map<int32_t, int64_t> extent;
   extent['a'] = 16;
   extent['b'] = 16;
   extent['c'] = 16;
   extent['d'] = 16;
   extent['e'] = 16;
   extent['f'] = 16;
   extent['g'] = 16;
   extent['h'] = 16;
   extent['i'] = 16;
   extent['j'] = 16;
   extent['k'] = 16;
   extent['l'] = 16;
   extent['m'] = 16;

   // Create a vector of extents for each tensor
   std::vector<int64_t> extentA;
   for (auto mode : modesA)
      extentA.push_back(extent[mode]);
   std::vector<int64_t> extentB;
   for (auto mode : modesB)
      extentB.push_back(extent[mode]);
   std::vector<int64_t> extentC;
   for (auto mode : modesC)
      extentC.push_back(extent[mode]);
   std::vector<int64_t> extentD;
   for (auto mode : modesD)
      extentD.push_back(extent[mode]);
   std::vector<int64_t> extentR;
   for (auto mode : modesR)
      extentR.push_back(extent[mode]);

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");

   // Sphinx: #3
   /**********************
   * Allocating data
   **********************/

   size_t elementsA = 1;
   for (auto mode : modesA)
      elementsA *= extent[mode];
   size_t elementsB = 1;
   for (auto mode : modesB)
      elementsB *= extent[mode];
   size_t elementsC = 1;
   for (auto mode : modesC)
      elementsC *= extent[mode];
   size_t elementsD = 1;
   for (auto mode : modesD)
      elementsD *= extent[mode];
   size_t elementsR = 1;
   for (auto mode : modesR)
      elementsR *= extent[mode];

   size_t sizeA = sizeof(floatType) * elementsA;
   size_t sizeB = sizeof(floatType) * elementsB;
   size_t sizeC = sizeof(floatType) * elementsC;
   size_t sizeD = sizeof(floatType) * elementsD;
   size_t sizeR = sizeof(floatType) * elementsR;
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (sizeA + sizeB + sizeC + sizeD + sizeR) / 1024. /1024. / 1024);

   void* rawDataIn_d[numInputs];
   void* R_d;
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[0], sizeA) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[1], sizeB) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[2], sizeC) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[3], sizeD) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &R_d, sizeR));

   void* rawDataIn_d2[numInputs];
   void* R_d2;
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d2[0], sizeA) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d2[1], sizeB) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d2[2], sizeC) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d2[3], sizeD) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &R_d2, sizeR));

   floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
   floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
   floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
   floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);
   floatType *R = (floatType*) malloc(sizeof(floatType) * elementsR);

   if (A == NULL || B == NULL || C == NULL || D == NULL || R == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   /*******************
   * Initialize data
   *******************/

   memset(R, 0, sizeof(floatType) * elementsR);
   for (uint64_t i = 0; i < elementsA; i++)
      A[i] = ((floatType) rand()) / RAND_MAX;
   for (uint64_t i = 0; i < elementsB; i++)
      B[i] = ((floatType) rand()) / RAND_MAX;
   for (uint64_t i = 0; i < elementsC; i++)
      C[i] = ((floatType) rand()) / RAND_MAX;
   for (uint64_t i = 0; i < elementsD; i++)
      D[i] = ((floatType) rand()) / RAND_MAX;

   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[3], D, sizeD, cudaMemcpyHostToDevice) );

   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d2[0], A, sizeA, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d2[1], B, sizeB, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d2[2], C, sizeC, cudaMemcpyHostToDevice) );
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d2[3], D, sizeD, cudaMemcpyHostToDevice) );

   if(verbose)
      printf("Allocated GPU memory for data, and initialize data\n");

   // Sphinx: #4
   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

   cudaStream_t stream2;
   cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
   // cudaStreamCreate(&stream2);

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );

      cutensornetHandle_t handle2;
   HANDLE_ERROR( cutensornetCreate(&handle2) );

   const int32_t nmodeA = modesA.size();
   const int32_t nmodeB = modesB.size();
   const int32_t nmodeC = modesC.size();
   const int32_t nmodeD = modesD.size();
   const int32_t nmodeR = modesR.size();

   /*******************************
   * Create Network Descriptor
   *******************************/

   const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data(), modesD.data()};
   int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC, nmodeD};
   const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data(), extentD.data()};
   const int64_t* stridesIn[] = {NULL, NULL, NULL, NULL}; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                     nmodeR, extentR.data(), /*stridesOut = */NULL, modesR.data(),
                     typeData, typeCompute,
                     &descNet) );

   cutensornetNetworkDescriptor_t descNet2;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle2,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                     nmodeR, extentR.data(), /*stridesOut = */NULL, modesR.data(),
                     typeData, typeCompute,
                     &descNet2) );

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

   // Sphinx: #5
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Find "optimal" contraction order and slicing
   *******************************/

   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

   // Set the desired number of hyper-samples (defaults to 0)
   int32_t num_hypersamples = 8;
   HANDLE_ERROR( cutensornetContractionOptimizerConfigSetAttribute(handle,
                     optimizerConfig,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES,
                     &num_hypersamples,
                     sizeof(num_hypersamples)) );

   // Create contraction optimizer info and find an optimized contraction path
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   cutensornetContractionOptimizerInfo_t optimizerInfo2;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle2, descNet2, &optimizerInfo2) );

   HANDLE_ERROR( cutensornetContractionOptimize(handle,
                                             descNet,
                                             optimizerConfig,
                                             workspaceLimit,
                                             optimizerInfo) );

   HANDLE_ERROR( cutensornetContractionOptimize(handle2,
                                             descNet2,
                                             optimizerConfig,
                                             workspaceLimit,
                                             optimizerInfo2) );

   if(verbose)
      printf("Found an optimized contraction path using cuTensorNet optimizer\n");

   // Sphinx: #6
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   int64_t requiredWorkspaceSize = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                         descNet,
                                                         optimizerInfo,
                                                         workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSize) );

   void* work = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&work, requiredWorkspaceSize) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               work,
                                               requiredWorkspaceSize) );

   cutensornetWorkspaceDescriptor_t workDesc2;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle2, &workDesc2) );

   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle2,
                                                         descNet2,
                                                         optimizerInfo2,
                                                         workDesc2) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle2,
                                                   workDesc2,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSize) );

   void* work2 = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&work2, requiredWorkspaceSize) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle2,
                                               workDesc2,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               work2,
                                               requiredWorkspaceSize) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");

   // Sphinx: #7
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                descNet,
                                                optimizerInfo,
                                                workDesc,
                                                &plan) );

   
   cutensornetContractionPlan_t plan2;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle2,
                                                descNet2,
                                                optimizerInfo,
                                                workDesc2,
                                                &plan2) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

   // Sphinx: #8
   /**********************
   * Execute the tensor network contraction
   **********************/

   // GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   const int numRuns = 1; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      // HANDLE_CUDA_ERROR( cudaMemcpy(R_d, R, sizeR, cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      // HANDLE_CUDA_ERROR( cudaMemcpy(R_d2, R, sizeR, cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      // HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
      * Contract all slices of the tensor network
      */
      // timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                     plan,
                     rawDataIn_d,
                     R_d,
                     accumulateOutput,
                     workDesc,
                     NULL, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                     stream) );

      HANDLE_ERROR( cutensornetContractSlices(handle2,
                     plan2,
                     rawDataIn_d2,
                     R_d2,
                     accumulateOutput,
                     workDesc2,
                     NULL, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                     stream2) );

      // Synchronize and measure best timing
      // auto time = timer.seconds();
      // minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network, each slice used the same contraction plan\n");

   // Print the 1-norm of the output tensor (verification)
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream2) );
   HANDLE_CUDA_ERROR( cudaMemcpy(R, R_d, sizeR, cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
   double norm1 = 0.0;
   for (int64_t i = 0; i < elementsR; ++i) {
      norm1 += std::abs(R[i]);
   }
   if(verbose)
      printf("Computed the 1-norm of the output tensor: %e\n", norm1);

   // printf("Tensor network contraction time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);

   /*************************/

   // Query the total Flop count for the tensor network contraction
   double flops {0.0};
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                     &flops,
                     sizeof(flops)) );


   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (R) free(R);
   if (D) free(D);
   if (C) free(C);
   if (B) free(B);
   if (A) free(A);

   // Free GPU memory resources
   if (work) cudaFree(work);
   if (R_d) cudaFree(R_d);
   if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
   if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
   if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
   if (rawDataIn_d[3]) cudaFree(rawDataIn_d[3]);

   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}

When profiling I see this:

I assume the sm80_gett… Kernels are the internal Kernels used to contract the tensors.
Why are the Kernels on stream 13 and stream 14 not executed in parallel? I noticed the same thing when doing 2 qr decompositions on two different streams. Is this just the way it is in cuTensorNet or am I missing something?

Thank you for contacting us and posting your question.
Some basic knowledge about streaming workload.
For any independent operations happening in different streams, we can identify 2 or 3 situations:

  1. if the operation on stream_1 fill the GPU SMs with load, then the GPU runtime scheduler is going to fill all the SMs with work. Thus stream_2 will be queued after stream_1, thus making it look like sequential ( maybe tiny overlap will happen at the end of stream_1 execution).
  2. if the operation of stream_1 is small such as it doesn’t fill the GPU SMs with work, then the runtime would be able to put operation from stream_2 into the SMs and users will eventually see it in the profiler
  3. if the operation of stream_1 is very very tiny (let say micro seconds) then the scheduler will not have time to dispatch stream_2 before stream_1 finish and user most likely will not see any overlap.

For the example shown here, you are running in regime (1)., you might see some overlap at the end of stream_1.
I rerun your example and set extents to 18, I was able to see some overlap at the end of stream_13. similarly I changed extent of l=4096 and a=2 and k=128, now you can see more overlap at the end of stream_13.

I hope this clarify.
Please let us know if you need more details.
Thanks

1 Like

Hello,

thank you very much for your explanations. I was indeed able to perform smaller contractions of my algorithm in parallel with some tweaks.

However I still have a problem with parallel SVD-decompositions, which are the main limitation for the runtime of my implementation currently. (And an implementation with mpi would currently be out of scope)

Consider the following code.

/*
 * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

// Sphinx: #1

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>
#include <complex>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

// Function to compute combined Extent
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t> &extentMap,
                              const std::vector<int32_t> &modes)
{
    int64_t combinedExtent{1};
    for (auto mode : modes)
    {
        auto it = extentMap.find(mode);
        if (it != extentMap.end())
            combinedExtent *= it->second;
    }
    return combinedExtent;
}


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
   }

   ~GPUTimer()
   {
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
   }

   void start()
   {
      cudaEventRecord(start_, stream_);
   }

   float seconds()
   {
      cudaEventRecord(stop_, stream_);
      cudaEventSynchronize(stop_);
      float time;
      cudaEventElapsedTime(&time, start_, stop_);
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %ld\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("t:%d\n", prop.textureAlignment);
      printf("========================\n");
   }

   typedef std::complex<double> floatType;
   cudaDataType_t typeData = CUDA_C_64F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F;

   if(verbose)
      printf("Included headers and defined data types\n");

   // Sphinx: #2
   /**********************
   * Computing: R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}
   **********************/

   // Create vector of modes
   int32_t sharedMode = 'x';

   std::vector<int32_t> modesT{'i', 'j', 'm', 'n'};
   std::vector<int32_t> modesU{'i', sharedMode, 'm'};
   std::vector<int32_t> modesV{'n', sharedMode, 'j'};

   // Extents
   std::unordered_map<int32_t, int64_t> extentMap;
   extentMap['i'] = 32;
   extentMap['j'] = 2;
   extentMap['m'] = 2;
   extentMap['n'] = 32;

   int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
   int64_t colExtent = computeCombinedExtent(extentMap, modesV);

   int64_t fullSharedExtent = rowExtent <= colExtent? rowExtent: colExtent;
   const int64_t maxExtent = fullSharedExtent / 2;  //fix extent truncation with half of the singular values trimmed out
   extentMap[sharedMode] = maxExtent;

   // Create a vector of extents for each tensor
   std::vector<int64_t> extentT;
   for (auto mode : modesT)
      extentT.push_back(extentMap[mode]);
   std::vector<int64_t> extentU;
   for (auto mode : modesU)
      extentU.push_back(extentMap[mode]);
   std::vector<int64_t> extentV;
   for (auto mode : modesV)
      extentV.push_back(extentMap[mode]);

   size_t elementsT = 1;
   for (auto mode : modesT)
      elementsT *= extentMap[mode];
   size_t elementsU = 1;
   for (auto mode : modesU)
      elementsU *= extentMap[mode];
   size_t elementsV = 1;
   for (auto mode : modesV)
      elementsV *= extentMap[mode];
   size_t elementsS = 2;

   size_t sizeT = sizeof(floatType) * elementsT;
   size_t sizeU = sizeof(floatType) * elementsU;
   size_t sizeV = sizeof(floatType) * elementsV;
   size_t sizeS = sizeof(double) * elementsS;

   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (sizeT + sizeU + sizeV) / 1024. /1024. / 1024);
   
   std::vector<std::vector<void *>> pointers;
   int num_streams = 5;
   for (int i=0; i<num_streams;i++) {
   void* D_T;
   void* D_U;
   void* D_V;
   void* D_S;
   std::vector<void *> local_pointers;
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_U, sizeU) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_V, sizeV) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_S, sizeS) );
   local_pointers.push_back(D_T);
   local_pointers.push_back(D_U);
   local_pointers.push_back(D_S);
   local_pointers.push_back(D_V);
   pointers.push_back(local_pointers);
   }

   floatType *T = (floatType*) malloc(sizeof(floatType) * elementsT);
   floatType *U = (floatType*) malloc(sizeof(floatType) * elementsU);
   floatType *V = (floatType*) malloc(sizeof(floatType) * elementsV);
   floatType *S = (floatType*) malloc(sizeof(floatType) * elementsS);

   /*******************
   * Initialize data
   *******************/

   memset(S, 0, sizeof(floatType) * 2);
   memset(U, 0, sizeof(floatType) * elementsU);
   memset(V, 0, sizeof(floatType) * elementsV);
   for (uint64_t i = 0; i < elementsT; i++)
      T[i] = floatType (((double) rand()) / RAND_MAX,((double) rand()) / RAND_MAX);

   for (int i=0; i<num_streams;i++) {
      HANDLE_CUDA_ERROR( cudaMemcpy(pointers[i][0], T, sizeT, cudaMemcpyHostToDevice) );
   }

   if(verbose)
      printf("Allocated GPU memory for data, and initialize data\n");

   // Sphinx: #4
   /*************************
   * cuTensorNet
   *************************/

  std::vector<cudaStream_t> streams;

  for (int i=0; i<num_streams;i++) {
   cudaStream_t stream;
   cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
   streams.push_back(stream);
  }

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );

   const int32_t numModesIn = modesT.size();
   const int32_t numModesU = modesU.size();
   const int32_t numModesV = modesV.size();

   /*******************************
   * Create Network Descriptor
   *******************************/

   const int64_t *strides = NULL; // assuming fortran layout for all tensors
   std::vector<std::vector<cutensornetTensorDescriptor_t>> descs;

   // Set up tensor network
   for (int i=0; i<num_streams;i++) {
      cutensornetTensorDescriptor_t descTensorIn;
      cutensornetTensorDescriptor_t descTensorU;
      cutensornetTensorDescriptor_t descTensorV;
      HANDLE_ERROR(cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn));
      HANDLE_ERROR(cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU));
      HANDLE_ERROR(cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV));
      std::vector<cutensornetTensorDescriptor_t> local_descs {descTensorIn,descTensorU,descTensorV};
      descs.push_back(local_descs);
   }

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

   // Sphinx: #5
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

  cutensornetTensorSVDConfig_t svdConfig;
   HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig));

   cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
                                           svdConfig, 
                                           CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, 
                                           &svdAlgo, 
                                           sizeof(svdAlgo)));

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Find "optimal" contraction order and slicing
   *******************************/

   if(verbose)
      printf("Found an optimized contraction path using cuTensorNet optimizer\n");

   // Sphinx: #6
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/
   std::vector<cutensornetWorkspaceDescriptor_t> workDescs;
   for (int i=0; i<num_streams;i++) {

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
   HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes(handle, descs[0][0], descs[0][1], descs[0][2], svdConfig, workDesc));

   int64_t hostWorkspaceSize, deviceWorkspaceSize;
   // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick
   HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                  workDesc,
                                                  CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                  CUTENSORNET_MEMSPACE_DEVICE,
                                                  CUTENSORNET_WORKSPACE_SCRATCH,
                                                  &deviceWorkspaceSize));

   void *devWork = nullptr, *hostWork = nullptr;
   if (deviceWorkspaceSize > 0)
   {
      HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize));
   }
   HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                              workDesc,
                                              CUTENSORNET_MEMSPACE_DEVICE,
                                              CUTENSORNET_WORKSPACE_SCRATCH,
                                              devWork,
                                              deviceWorkspaceSize));
   workDescs.push_back(workDesc);

   }

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");

     // Sphinx: #8
   /**********************
   * Execute the tensor network contraction
   **********************/

   // GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   const int numRuns = 1; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      // HANDLE_CUDA_ERROR( cudaMemcpy(R_d, R, sizeR, cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      // HANDLE_CUDA_ERROR( cudaMemcpy(R_d2, R, sizeR, cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      // HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
      // if(!graphCreated){
         // cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
      /*
      * Contract all slices of the tensor network
      */
      // timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      for (int i=0; i<num_streams;i++) {
      HANDLE_ERROR(cutensornetTensorSVD(handle,
                                     descs[i][0], pointers[i][0],
                                     descs[i][1], pointers[i][1],
                                     pointers[i][2],
                                     descs[i][2], pointers[i][3],
                                     svdConfig,
                                     NULL,
                                     workDescs[i],
                                     streams[i]));
      }

      // Synchronize and measure best timing
      // auto time = timer.seconds();
      // minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network, each slice used the same contraction plan\n");

   // Print the 1-norm of the output tensor (verification)
   for (int i=0; i<num_streams;i++) {
      HANDLE_CUDA_ERROR( cudaStreamSynchronize(streams[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMemcpy(U, pointers[1][1], sizeU, cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
   double norm1 = 0.0;
   for (int64_t i = 0; i < elementsU; ++i) {
      norm1 += std::abs(U[i]);
   }
   if(verbose)
      printf("Computed the 1-norm of the output tensor: %e\n", norm1);

   // printf("Tensor network contraction time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);

   /*************************/

   // Query the total Flop count for the tensor network contraction
   double flops {0.0};

   // Free GPU memory resources
   // if (work) cudaFree(work);
   if (pointers[0][0]) cudaFree(pointers[0][0]);

   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}

I am not able to perform parallel SVD decompositions on a A100 80gb, I also tried with different shapes relevant to my problem (64,2,2,64) instead of (32,2,2,32) or smaller shapes. When I look at the profiling, the kernels are executed sequentially. I also had a look at the GPU metrics. The active SMs are low, the GR active are high. I am new to the profiling, so I thought I could measure the utilization of the GPU with SMs active and plot that. But for SVDs, which take the most time in my algorithm and which cannot be executed in parallel, the SMs Active are low. So my question would be, why cant I execute multiple SVD-decompositions in parallel and is the SMs active a good measurement for utilization of the GPU when doing SVDs.

Thank you very much