cuTENSOR with Unified Memory

As cuTENSOR’s documentation states that cutensorContraction needs ‘GPU-accessible memory’, I tried to use it with memory allocated by cudaMallocManaged (see code below). (I wanted to contract tensors in an out-of-core fashion.)

If I set d=64 in my code, each of the three tensors/arrays requires 64^4 * 8 / 1024^2 MiB = 128 MiB (all of them fit onto my GPU at once) and everything works as intended. If I set d=196 (each tensor now requires 11 GiB – a single tensor would fit onto my GPU, but not all of them at the same time) everything still works. However, if I set d=256 (32 GiB per tensor – a single tensor does not fit onto my GPU anymore), calling cutensorInitTensorDescriptor the first time fails with CUTENSOR_STATUS_NOT_SUPPORTED.

Does anybody have a clue what’s going on here? I am using CUDA 10.1 on a Titan V (12 GiB) with 192 GiB RAM.

#include <stdlib.h>
#include <stdio.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x) {                                                              \
  const auto err = x;                                                                  \
  if( x != CUTENSOR_STATUS_SUCCESS )                                                   \
  { printf("Error: %s in line %d\n", cutensorGetErrorString(x), __LINE__); exit(-1); } \
}

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cudaDataType_t typeA = CUDA_R_32F;
  cudaDataType_t typeB = CUDA_R_32F;
  cudaDataType_t typeC = CUDA_R_32F;
  cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

  floatTypeCompute alpha = (floatTypeCompute)1.0f;
  floatTypeCompute beta  = (floatTypeCompute)0.0f;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  int64_t d = 64;
  extent['m'] = d;
  extent['n'] = d;
  extent['u'] = d;
  extent['v'] = d;
  extent['h'] = d;
  extent['k'] = d;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate
  floatTypeA * A;
  floatTypeB * B;
  floatTypeC * C;
  cudaMallocManaged(&A, sizeA);
  cudaMallocManaged(&B, sizeB);
  cudaMallocManaged(&C, sizeC);
  
  printf("num elements: %ld %ld %ld\n", elementsA, elementsB, elementsC);
  
  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  printf("Allocate, initialize and transfer tensors\n");

  /* ***************************** */

  // Initialize cuTENSOR library
  cutensorHandle_t handle;
  HANDLE_ERROR( cutensorInit(&handle) );

  // Create Tensor Descriptors
  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
              &descA,
              nmodeA,
              extentA.data(),
              NULL,/*stride*/
              typeA, CUTENSOR_OP_IDENTITY ) );

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
              &descB,
              nmodeB,
              extentB.data(),
              NULL,/*stride*/
              typeB, CUTENSOR_OP_IDENTITY ) );

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
              &descC,
              nmodeC,
              extentC.data(),
              NULL,/*stride*/
              typeC, CUTENSOR_OP_IDENTITY ) );

  printf("Initialize cuTENSOR and tensor descriptors\n");

  /* ***************************** */

   //Retrieve the memory alignment for each tensor
   uint32_t alignmentRequirementA;
   HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
              A,
              &descA,
              &alignmentRequirementA) );

   uint32_t alignmentRequirementB;
   HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
              B,
              &descB,
              &alignmentRequirementB) );

   uint32_t alignmentRequirementC;
   HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
              C,
              &descC,
              &alignmentRequirementC) );

  printf("Query best alignment requirement for our pointers\n");

  /* ***************************** */

  // Create the Contraction Descriptor
  cutensorContractionDescriptor_t desc;
  HANDLE_ERROR( cutensorInitContractionDescriptor( &handle,
              &desc,
              &descA, modeA.data(), alignmentRequirementA,
              &descB, modeB.data(), alignmentRequirementB,
              &descC, modeC.data(), alignmentRequirementC,
              &descC, modeC.data(), alignmentRequirementC,
              typeCompute) );

  printf("Initialize contraction descriptor\n");

  /* ***************************** */

  // Set the algorithm to use
  cutensorContractionFind_t find;
  HANDLE_ERROR( cutensorInitContractionFind(
              &handle, &find,
              CUTENSOR_ALGO_DEFAULT) );

  printf("Initialize settings to find algorithm\n");

  /* ***************************** */

  // Query workspace
  size_t worksize = 0;
  HANDLE_ERROR( cutensorContractionGetWorkspace(&handle,
              &desc,
              &find,
              CUTENSOR_WORKSPACE_RECOMMENDED, &worksize ) );

  // Allocate workspace
  void *work = nullptr;
  if(worksize > 0)
  {
      if( cudaSuccess != cudaMalloc(&work, worksize) ) // This is optional!
      {
          work = nullptr;
          worksize = 0;
      }
  }

  printf("Query recommended workspace size and allocate it\n");

  /* ***************************** */

  // Create Contraction Plan
  cutensorContractionPlan_t plan;
  HANDLE_ERROR( cutensorInitContractionPlan(&handle,
                                            &plan,
                                            &desc,
                                            &find,
                                            worksize) );

  printf("Create plan for contraction\n");

  /* ***************************** */

  cutensorStatus_t err;

  // Execute the tensor contraction
  err = cutensorContraction(&handle,
                            &plan,
                     (void*)&alpha, A,
                                    B,
                     (void*)&beta,  C,
                                    C,
                            work, worksize, 0 /* stream */);
  cudaDeviceSynchronize();

  // Check for errors
  if(err != CUTENSOR_STATUS_SUCCESS)
  {
      printf("ERROR: %s\n", cutensorGetErrorString(err));
  }

  printf("Execute contraction from plan\n");

  /* ***************************** */

  if ( A ) cudaFree( A );
  if ( B ) cudaFree( B );
  if ( C ) cudaFree( C );
  if ( work ) cudaFree( work );

  printf("Successful completion\n");

  return 0;
}