Hi, I’m trying to set up TensorRT to perform a matrix multiplication, to get a better understanding of how to use a DLA on the Jetson Xavier board. For some reason, using tegrastats I’m unable to see any usage on my DLA, and the returned matrix is all zeros. Here is a snippet of my code. I was wondering whether tegrastats is the correct tool to be using, and also whether I’m using TensorRT correctly to target the DLA. I hope someone can give me some insight.
IBuilder* builder = createInferBuilder(config::gLogger);
builder->setFp16Mode(true);
builder->setDefaultDeviceType((DeviceType) 1);
std::cout << (int)builder->getDefaultDeviceType() << std::endl;
INetworkDefinition* network = builder->createNetwork();
int size = config::sq_dimension * config::sq_dimension * sizeof(float);
const char* name1 = "in1";
const char* name2 = "in2";
DimsHW d = DimsHW(config::sq_dimension,config::sq_dimension);
ITensor* h1 = network->addInput((const char *) name1, (DataType) 0, (Dims) d);
ITensor* h2 = network->addInput((const char *) name2, (DataType) 0, (Dims) d);
IElementWiseLayer* mat_mul_layer = network->addElementWise(*h1,*h2,(ElementWiseOperation)1);
std::cout << "set run on DLA" << std::endl;
builder->setDeviceType(mat_mul_layer, (DeviceType) 1);
builder->setDLACore(0);
builder->setMaxBatchSize(1);
builder->setMaxWorkspaceSize( (size_t) 0x40000000);
std::cout << "can run on DLA?" << std::endl;
std::cout << (bool)builder->canRunOnDLA(mat_mul_layer) << std::endl;
std::cout << "theres your answer" << std::endl;
std::cout << (int)builder->getNbDLACores() << std::endl;
std::cout << "making your engine" << std::endl;
mat_mul_layer->getOutput(0)->setName("out");
network->markOutput(*mat_mul_layer->getOutput(0));
ICudaEngine* engine = builder->buildCudaEngine(*network);
//engine->setDLACore(0);
std::cout << (int)engine->getNbLayers() << std::endl;
float *h_matrix_1, *h_matrix_2, *check;
h_matrix_1 = (float *) malloc(size);
h_matrix_2 = (float *) malloc(size);
check = (float *) malloc(size);
for (int i = 0; i < config::sq_dimension; i++) {
for (int j = 0; j < config::sq_dimension; j++) {
h_matrix_1[i] = i + j + 0.5;
h_matrix_2[i] = i + j + 0.8;
check[i] = 0;
}
}
for (int i = 0; i < config::sq_dimension; i++) {
for (int j = 0; j < config::sq_dimension; j++) {
for (int p = 0; p < config::sq_dimension; p++) {
check[i*config::sq_dimension + j] += h_matrix_1[i*config::sq_dimension + p] * h_matrix_2[p*config::sq_dimension + j];
}
}
}
std::cout << "making input" << std::endl;
// Batch stages the two input matrices into pinned host memory and allocates
// the matching device buffers (see the Batch setup code below).
Batch b{engine, h_matrix_1, h_matrix_2};
std::cout << "device memory copied" <<std::endl;
// Binding indices are looked up by tensor name; -1 means the name is unknown.
int userInputIndex = b.mEngine->getBindingIndex(config::kin1);
int itemInputIndex = b.mEngine->getBindingIndex(config::kin2);
// Get output binding indices.
int outputPredictionIndex = b.mEngine->getBindingIndex(config::kout);
// BUG FIX: indexing the buffer arrays with -1 is undefined behavior, so fail
// fast if any name was not found in the engine.
if (userInputIndex < 0 || itemInputIndex < 0 || outputPredictionIndex < 0) {
    std::cerr << "engine binding name not found" << std::endl;
    std::abort();
}
// Copy input from host to device (all three bindings share mMemSizes[0] bytes).
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[userInputIndex], b.mHostMemory[userInputIndex], b.mMemSizes[0], cudaMemcpyHostToDevice, b.mStream));
config::checkCUDA(cudaMemcpyAsync(b.mDeviceMemory[itemInputIndex], b.mHostMemory[itemInputIndex], b.mMemSizes[0], cudaMemcpyHostToDevice, b.mStream));
const char *job_name = "matrix multiplication";
// tag_job_begin(pid, tid, job_name, 14L, false, true, 0);
// Do inference.  BUG FIX: enqueue() returns false when the work cannot be
// queued; the original ignored this, so a failed launch left the output
// buffer untouched (i.e. all zeros).
if (!b.mContext->enqueue(1, b.mDeviceMemory, b.mStream, nullptr)) {
    std::cerr << "enqueue failed" << std::endl;
    std::abort();
}
//tag_job_end(pid, tid, job_name);
// Copy output from device to host.
config::checkCUDA(cudaMemcpyAsync(b.mHostMemory[outputPredictionIndex], b.mDeviceMemory[outputPredictionIndex], b.mMemSizes[0], cudaMemcpyDeviceToHost, b.mStream));
// Check the sync too -- async execution errors surface here, and every other
// CUDA call in this file already goes through config::checkCUDA.
config::checkCUDA(cudaStreamSynchronize(b.mStream));
Here is the Batch setup code:
// Create the execution context, a CUDA stream, and the pinned-host / device
// buffers for each engine binding, then stage the input matrices.
mContext = mEngine->createExecutionContext();
// BUG FIX: createExecutionContext returns nullptr on failure; enqueue() on a
// null context later would crash.
if (mContext == nullptr) {
    std::cerr << "createExecutionContext failed" << std::endl;
    std::abort();
}
config::checkCUDA(cudaStreamCreate(&mStream));
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
int userInputIndex = mEngine->getBindingIndex(config::kin1);
int itemInputIndex = mEngine->getBindingIndex(config::kin2);
int outputPredictionIndex = mEngine->getBindingIndex(config::kout);
std::cout << userInputIndex << " " << itemInputIndex << " " << outputPredictionIndex << std::endl;
// BUG FIX: getBindingIndex returns -1 for an unknown name; indexing
// mHostMemory/mDeviceMemory with -1 is undefined behavior, so fail fast.
if (userInputIndex < 0 || itemInputIndex < 0 || outputPredictionIndex < 0) {
    std::cerr << "engine binding not found" << std::endl;
    std::abort();
}
// All three bindings are sq_dimension x sq_dimension float matrices, so a
// single shared byte size is enough.
mMemSizes.push_back(config::sq_dimension * config::sq_dimension * sizeof(float));
// Pinned host memory is required for cudaMemcpyAsync to be truly asynchronous.
config::checkCUDA(cudaMallocHost(&mHostMemory[userInputIndex], mMemSizes[0]));
config::checkCUDA(cudaMallocHost(&mHostMemory[itemInputIndex], mMemSizes[0]));
config::checkCUDA(cudaMallocHost(&mHostMemory[outputPredictionIndex], mMemSizes[0]));
std::cout << "Copy the input data to host memory" <<std::endl;
// NOTE(review): assumes userInput/itemInput each hold at least
// mMemSizes[0]/sizeof(float) elements -- confirm against the caller.
for (unsigned int i = 0; i < (mMemSizes[0]) / sizeof(float); ++i)
    *(static_cast<float *>(mHostMemory[userInputIndex]) + i) = userInput[i];
for (unsigned int i = 0; i < (mMemSizes[0]) / sizeof(float); ++i)
    *(static_cast<float *>(mHostMemory[itemInputIndex]) + i) = itemInput[i];
std::cout<<"Allocate GPU memory"<<std::endl;
config::checkCUDA(cudaMalloc(&mDeviceMemory[userInputIndex], mMemSizes[0]));
config::checkCUDA(cudaMalloc(&mDeviceMemory[itemInputIndex], mMemSizes[0]));
config::checkCUDA(cudaMalloc(&mDeviceMemory[outputPredictionIndex], mMemSizes[0]));