Matrix Multiply on DLA and checking DLA usage

Hi, I’m trying to set up TensorRT to perform a matrix multiplication to get a better understanding of how to use the DLA on the Jetson Xavier board. For some reason, using tegrastats I’m unable to see any usage on my DLA, and the returned matrix is all zeros. Here is a snippet of my code. I was wondering if tegrastats is the correct tool to be using, and also whether I’m using TensorRT correctly to target the DLA. Hope someone can give me some insight.

  IBuilder* builder = createInferBuilder(config::gLogger);
  builder->setFp16Mode(true);
  builder->setDefaultDeviceType(DeviceType::kDLA);
  std::cout << (int)builder->getDefaultDeviceType() << std::endl;
  INetworkDefinition* network = builder->createNetwork();
  int size = config::sq_dimension * config::sq_dimension * sizeof(float);
  const char* name1 = "in1";
  const char* name2 = "in2";
  DimsHW d = DimsHW(config::sq_dimension, config::sq_dimension);
  ITensor* h1 = network->addInput(name1, DataType::kFLOAT, (Dims) d);
  ITensor* h2 = network->addInput(name2, DataType::kFLOAT, (Dims) d);
  // ElementWiseOperation::kPROD is an element-wise product of the two tensors.
  IElementWiseLayer* mat_mul_layer = network->addElementWise(*h1, *h2, ElementWiseOperation::kPROD);
  std::cout << "set run on DLA" << std::endl;
  builder->setDeviceType(mat_mul_layer, DeviceType::kDLA);
  builder->setDLACore(0);
  builder->setMaxBatchSize(1);
  builder->setMaxWorkspaceSize((size_t) 0x40000000);
  std::cout << "can run on DLA?" << std::endl;
  std::cout << (bool)builder->canRunOnDLA(mat_mul_layer) << std::endl;
  std::cout << "there's your answer" << std::endl;
  std::cout << (int)builder->getNbDLACores() << std::endl;
  std::cout << "making your engine" << std::endl;
  mat_mul_layer->getOutput(0)->setName("out");
  network->markOutput(*mat_mul_layer->getOutput(0));
  ICudaEngine* engine = builder->buildCudaEngine(*network);
  //engine->setDLACore(0);
  std::cout << (int)engine->getNbLayers() << std::endl;

  float *h_matrix_1, *h_matrix_2, *check;

  h_matrix_1 = (float *) malloc(size);
  h_matrix_2 = (float *) malloc(size);
  check = (float *) malloc(size);

  for (int i = 0; i < config::sq_dimension; i++) {
     for (int j = 0; j < config::sq_dimension; j++) {
        h_matrix_1[i*config::sq_dimension + j] = i + j + 0.5;
        h_matrix_2[i*config::sq_dimension + j] = i + j + 0.8;
        check[i*config::sq_dimension + j] = 0;
     }
  }

  for (int i = 0; i < config::sq_dimension; i++) {
     for (int j = 0; j < config::sq_dimension; j++) {
        for (int p = 0; p < config::sq_dimension; p++) {
           check[i*config::sq_dimension + j] += h_matrix_1[i*config::sq_dimension + p] * h_matrix_2[p*config::sq_dimension + j];
        }
     }
  }
  std::cout << "making input" << std::endl;
  Batch b{engine, h_matrix_1, h_matrix_2};

  std::cout << "device memory copied" <<std::endl;

  int userInputIndex = b.mEngine->getBindingIndex(config::kin1);
  int itemInputIndex = b.mEngine->getBindingIndex(config::kin2);
  // Get output binding indices.
  int outputPredictionIndex = b.mEngine->getBindingIndex(config::kout);

  // Copy input from host to device.
  config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[0], cudaMemcpyHostToDevice, b.mStream));
  config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[0], cudaMemcpyHostToDevice, b.mStream));


  const char *job_name = "matrix multiplication";

  // tag_job_begin(pid, tid, job_name, 14L, false, true, 0);
  // Do inference.
  b.mContext->enqueue(1, b.mDeviceMemory, b.mStream, nullptr);

  //tag_job_end(pid, tid, job_name);
  // Copy output from device to host.
  config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[0], cudaMemcpyDeviceToHost, b.mStream));
  cudaStreamSynchronize(b.mStream);
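
To sanity-check what comes back, I compare the output buffer against the CPU reference computed above. A rough sketch of that check (assuming the result lands in b.mHostMemory[outputPredictionIndex] after the sync and that <cmath> is included):

  // Hypothetical result check: count elements that differ from the CPU reference.
  float* out = static_cast<float*>(b.mHostMemory[outputPredictionIndex]);
  int mismatches = 0;
  for (int i = 0; i < config::sq_dimension * config::sq_dimension; i++) {
     if (std::fabs(out[i] - check[i]) > 1e-3f)
        mismatches++;
  }
  std::cout << "mismatching elements: " << mismatches << std::endl;

With the current run the returned matrix is all zeros, so essentially every element mismatches.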

Batch constructor code:

    mContext = mEngine->createExecutionContext();
    config::checkCUDA(cudaStreamCreate(&mStream));

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    int userInputIndex = mEngine->getBindingIndex(config::kin1);
    int itemInputIndex = mEngine->getBindingIndex(config::kin2);
    int outputPredictionIndex = mEngine->getBindingIndex(config::kout);

    std::cout << userInputIndex << " " << itemInputIndex << " " << outputPredictionIndex << std::endl;
    mMemSizes.push_back(config::sq_dimension * config::sq_dimension * sizeof(float));

    config::checkCUDA(cudaMallocHost(&mHostMemory[userInputIndex], mMemSizes[0]));
    config::checkCUDA(cudaMallocHost(&mHostMemory[itemInputIndex], mMemSizes[0]));
    config::checkCUDA(cudaMallocHost(&mHostMemory[outputPredictionIndex], mMemSizes[0]));

    std::cout << "Copy the input data to host memory" << std::endl;
    for (unsigned int i = 0; i < mMemSizes[0] / sizeof(float); ++i)
        *(static_cast<float *>(mHostMemory[userInputIndex]) + i) = userInput[i];

    for (unsigned int i = 0; i < mMemSizes[0] / sizeof(float); ++i)
        *(static_cast<float *>(mHostMemory[itemInputIndex]) + i) = itemInput[i];

    std::cout << "Allocate GPU memory" << std::endl;
    config::checkCUDA(cudaMalloc(&mDeviceMemory[userInputIndex], mMemSizes[0]));
    config::checkCUDA(cudaMalloc(&mDeviceMemory[itemInputIndex], mMemSizes[0]));
    config::checkCUDA(cudaMalloc(&mDeviceMemory[outputPredictionIndex], mMemSizes[0]));
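
For context, the Batch members used above are declared roughly like this (a sketch; the exact types in my header may differ):

  // Rough sketch of the Batch helper used above; member types are assumptions.
  struct Batch {
      Batch(ICudaEngine* engine, float* userInput, float* itemInput);

      ICudaEngine* mEngine = nullptr;
      IExecutionContext* mContext = nullptr;
      cudaStream_t mStream = nullptr;
      std::vector<size_t> mMemSizes;      // byte sizes, one entry per distinct buffer size
      void* mHostMemory[3]   = {};        // pinned host buffers, indexed by binding index
      void* mDeviceMemory[3] = {};        // device buffers, indexed by binding index
  };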

Hi,

Sorry, tegrastats doesn’t support DLA monitoring.
Here are two possible ways for your reference:

1.
Please use Nsight Systems to measure how much of a workload is running on the DLA.
https://developer.nvidia.com/nsight-systems

2.
You can also check the device node to see whether the DLA is active or not.
For DLA-0

cat /sys/devices/platform/host1x/15880000.nvdla0/power/runtime_status

For DLA-1

cat /sys/devices/platform/host1x/158c0000.nvdla1/power/runtime_status

Ex.

nvidia@jetson-0330618100118:~$ cat /sys/devices/platform/host1x/158c0000.nvdla1/power/runtime_status
active
nvidia@jetson-0330618100118:~$ cat /sys/devices/platform/host1x/158c0000.nvdla1/power/runtime_status
suspended

Thanks.

Hi,
If the DLA says suspended, how do I remedy that? Where can I activate the DLA? I’m doing all of this over SSH, so I don’t have direct access to the device.

Hi,

The node only reports the status of the DLA: active or idle.

You should get some error from TensorRT if the task is not working.
Could you share the error or log with us?
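
If nothing shows up on the console, you can pass a verbose logger to createInferBuilder so that every TensorRT message (including INFO) is printed. A minimal sketch (not the exact sample code):

#include <iostream>
#include "NvInfer.h"

// Verbose logger sketch: forwards every TensorRT message to stdout.
class VerboseLogger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override
    {
        std::cout << "[TRT] " << msg << std::endl;
    }
};

// Usage: VerboseLogger gLogger; IBuilder* builder = createInferBuilder(gLogger);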

Thanks.

Hi,
Sorry for the late response. I am not getting any errors; however, the file only ever says suspended. I tried running sampleMNIST and it is able to make the DLA active. I checked the GPU and it doesn’t seem to be used either, so I’m confused about how the computation is being done. The computation is also incorrect, giving a largely sparse matrix where it shouldn’t be. Any insight would be very helpful.
Best Regards

Any updates? I’m facing the same issue; right now, when I cat both DLA devices’ runtime_status, they say suspended.

However, I’m able to successfully save DLA engines using trtexec with INT8.

This is the command I used:

sudo /usr/src/tensorrt/bin/trtexec --onnx=./model.onnx  --saveEngine=./DLA_engine.trt --int8 --workspace=2048  --verbose --useDLACore=0 --allowGPUFallback

When I use the command below to run it, I don’t see any DLA usage, and when I monitor with jtop, I can see the GPU is still running:

/usr/src/tensorrt/bin/trtexec --loadEngine=./DLA_engine.trt 

I also tried adding --useDLACore when loading the engine, but it doesn’t change anything.