#include "concurrentTest.cuh" #include #include #include #include #include #include using namespace std; // reset; sudo `which nvprof` --profile-from-start off ./a.out 1 64 512 512 64 7 7 1 1 0 1 0 int g_m_timing_loop_count = 0; const char *CUSTOM_BENCHMARK_NETWORK_INPUT_NAME = "input"; const char *CUSTOM_BENCHMARK_NETWORK_OUTPUT_NAME = "output"; const int BLAS_N = 1500; const int N = 1 << 26; __global__ void kernel(float *x, int n) { int tid = threadIdx.x + blockIdx.x * blockDim.x; printf("%d %d\n", n, blockDim.x * gridDim.x); for (int i = tid; i < n; i += blockDim.x * gridDim.x) { x[i] = sqrt(pow(3.14159,i)); } } void ConcurrentTest::Initialize() { srand(0); m_empty_weights.type = nvinfer1::DataType::kFLOAT; m_empty_weights.values = nullptr; m_empty_weights.count = 0; m_timing_loop_count = 20000; m_blas_timing_loop_count = 20; g_m_timing_loop_count = m_timing_loop_count; m_dt = nvinfer1::DataType::kFLOAT; //kHALF } ConcurrentTest::ConcurrentTest() : m_device_input_image(nullptr), m_device_output_image(nullptr) { Initialize(); } ConcurrentTest::~ConcurrentTest() { cout << "[ConcurrentTest]: Cleanup" << endl; float *kernel_weights_values = (float *)m_kernel_weights.values; if (kernel_weights_values) delete [] kernel_weights_values; cudaFree(m_device_input_image); cudaFree(m_device_output_image); m_builder->destroy(); m_builder = nullptr; m_network->destroy(); m_network = nullptr; m_engine->destroy(); m_engine = nullptr; m_runtime->destroy(); m_runtime = nullptr; } void ConcurrentTest::DisplayConfiguration() { cout << "Test parameters :" << endl; cout << "\tbatch size : " << m_batch_size << endl; cout << "\tinput c : " << m_c << endl; cout << "\tinput h : " << m_h << endl; cout << "\tinput w : " << m_w << endl; cout << "\toutput maps : " << m_nb_output_maps << endl; cout << "\tkernel height : " << m_kernel_sizes.h() << endl; cout << "\tkernel width : " << m_kernel_sizes.w() << endl; cout << "\tstride height : " << m_stride.h() << endl; cout << "\tstride width : " << m_stride.w() << endl; cout << "\tdevice type : "; if (0 == (int)m_device_type) cout << "GPU"; else cout << "DLA"; cout << endl; cout << "\tFP16 : " << (bool)m_Fp16 << endl; cout << "\tDisplay output: " << (bool)m_should_display_matrices; cout << endl; } // Parse the command line parameters. void ConcurrentTest::ParseArguments(int argc, char **argv) { int i = 1; m_batch_size = atoi(argv[i++]); m_c = atoi(argv[i++]); m_h = atoi(argv[i++]); m_w = atoi(argv[i++]); m_nb_output_maps = atoi(argv[i++]); m_kernel_sizes.h() = atoi(argv[i++]); m_kernel_sizes.w() = atoi(argv[i++]); m_stride.h() = atoi(argv[i++]); m_stride.w() = atoi(argv[i++]); int device_type = atoi(argv[i++]); if (0 == device_type) m_device_type = nvinfer1::DeviceType::kGPU; else m_device_type = nvinfer1::DeviceType::kDLA; m_Fp16 = (bool)(atoi(argv[i++])); m_should_display_matrices = (bool)atoi(argv[i++]); } void ConcurrentTest::PrepareKernels() { // Prepare kernels. 
// Allocate the buffer that will hold the convolution kernel weights.
void ConcurrentTest::PrepareKernels()
{
    m_kernel_weights.type = nvinfer1::DataType::kFLOAT;
    int kernel_size = m_kernel_sizes.h() * m_kernel_sizes.w();
    float *values = reinterpret_cast<float *>(malloc(sizeof(float) * kernel_size * m_c * m_nb_output_maps));
    m_kernel_weights.values = values;
    m_kernel_weights.count = kernel_size * m_c * m_nb_output_maps;
}

void ConcurrentTest::PrintTensorInformation(const char *title, nvinfer1::ITensor *tensor)
{
    cout << title << ":" << endl;
    cout << "\tName: " << tensor->getName() << endl;
    cout << "\tDims: [";
    for (int nb = 0; nb < tensor->getDimensions().nbDims; nb++)
        cout << tensor->getDimensions().d[nb] << ", ";
    cout << "]" << endl;
    cout << "\tNetwork output: " << tensor->isNetworkOutput() << endl;
    cout << "\tLocation: ";
    if (nvinfer1::TensorLocation::kDEVICE == tensor->getLocation())
        cout << "Device" << endl;
    else
        cout << "Host" << endl;
    cout << endl;
}

void ConcurrentTest::LoadInput()
{
    size_t input_data_size = m_batch_size * m_c * m_h * m_w;
    size_t output_data_size = m_batch_size * m_nb_output_maps * m_h * m_w;
    TENSORRT_TYPE *data = new TENSORRT_TYPE[input_data_size];
    for (size_t i = 0; i < input_data_size; i++) {
        data[i] = (TENSORRT_TYPE)((rand() % 255) / (float)(rand() % 1000 + 0.1f));
    }
    cudaMalloc((void **)&m_device_input_image, input_data_size * sizeof(TENSORRT_TYPE));
    cudaMemcpy(m_device_input_image, data, input_data_size * sizeof(TENSORRT_TYPE), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&m_device_output_image, output_data_size * sizeof(TENSORRT_TYPE));
    cudaMemset(m_device_output_image, 0, output_data_size * sizeof(TENSORRT_TYPE));
    delete [] data;
}

void ConcurrentTest::BuildNetwork()
{
    m_builder = nvinfer1::createInferBuilder(m_logger);
    m_builder->setMaxBatchSize(m_batch_size);
    m_builder->setMaxWorkspaceSize(1 << 20);
    m_builder->allowGPUFallback(true); // In case the DLA fails, we fall back to the GPU.

    // Build the engine.
    // DLA can only be used when FP16 or Int8 mode is enabled.
    m_builder->setFp16Mode(m_Fp16);
    m_builder->setDefaultDeviceType(m_device_type);

    m_network = m_builder->createNetwork();
    nvinfer1::DimsNCHW input_dims{m_batch_size, m_c, m_h, m_w};
    cout << "Input dims: " << m_batch_size << ", " << m_c << ", " << m_h << ", " << m_w
         << ", m_dt: " << (int)m_dt << endl;
    m_input_tensor = m_network->addInput(CUSTOM_BENCHMARK_NETWORK_INPUT_NAME, m_dt, input_dims);

    nvinfer1::IConvolutionLayer *conv1 = m_network->addConvolution(*m_input_tensor, m_nb_output_maps,
                                                                   m_kernel_sizes, m_kernel_weights,
                                                                   m_empty_weights);
    conv1->setPrecision(nvinfer1::DataType::kHALF);
    conv1->setStride(m_stride);
    nvinfer1::DimsHW padding(m_kernel_sizes.h() / 2, m_kernel_sizes.w() / 2);
    conv1->setPadding(padding);

    nvinfer1::ITensor *convolution_output = conv1->getOutput(0);
    conv1->getOutput(0)->setName(CUSTOM_BENCHMARK_NETWORK_OUTPUT_NAME);
    m_network->markOutput(*conv1->getOutput(0));

    PrintTensorInformation("Input tensor", m_input_tensor);
    PrintTensorInformation("Convolution output tensor", convolution_output);

    m_engine = m_builder->buildCudaEngine(*m_network);
}
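// The network is a single convolution layer. With half-kernel padding the
// spatial size is preserved for odd kernels at stride 1: for the example
// invocation (c=64, h=w=512, 64 output maps, 7x7 kernel, stride 1, padding 3)
// the output is 64 x 512 x 512 per batch element, which is the size
// LoadInput() allocates for m_device_output_image.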
void ConcurrentTest::BuildKernel()
{
    int kernel_size = m_kernel_sizes.h() * m_kernel_sizes.w();
    size_t all_kernels_size = m_nb_output_maps * m_c * kernel_size;
    // Reset everything to zero.
    memset((float *)m_kernel_weights.values, 0, all_kernels_size * sizeof(float));
    // Fill all kernels with random values.
    for (size_t k = 0; k < all_kernels_size; k++) {
        ((float *)m_kernel_weights.values)[k] = (rand() % 10) / (rand() % 50 + 0.1f);
    }
}

void ConcurrentTest::RunBLASSgemm(int i)
{
    GpuTimer gpu_blas_timer(m_blas_stream);
    gpu_blas_timer.start();

    cublasStatus_t status;
    int n2 = BLAS_N * BLAS_N;
    // Memset at varying positions so that the repeated cublasSgemm calls
    // cannot be collapsed into a single one.
    float *dummy = m_blas_d_A;
    cublasSetStream(m_blas_handle, m_blas_stream);
    for (int j = 0; j < m_blas_timing_loop_count; j++) {
        status = cublasSgemm(m_blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, BLAS_N, BLAS_N, BLAS_N,
                             &m_blas_alpha, m_blas_d_A, BLAS_N, m_blas_d_B, BLAS_N,
                             &m_blas_beta, m_blas_d_C, BLAS_N);
        m_blas_alpha += (j * 1.f) / 5.f;
        m_blas_beta += (j * 1.f) * 1.24f;
        dummy += j * 10;
        cudaMemsetAsync(dummy, j, j * j * sizeof(float), m_blas_stream);
    }
    status = cublasGetVector(n2, sizeof(m_blas_h_C[0]), m_blas_d_C, 1, m_blas_h_C, 1);
    std::cout << "Host result: " << m_blas_h_C[0] << ", " << m_blas_h_C[500] << ", "
              << m_blas_h_C[1000] << std::endl;
    cudaStreamSynchronize(m_blas_stream);
    gpu_blas_timer.stop();
    m_blas_time_ms = gpu_blas_timer.milliseconds();
    m_blas_average_time_ms = gpu_blas_timer.milliseconds() / m_blas_timing_loop_count;
    std::cout << "RunBLASSgemm done..." << std::endl;
}

void ConcurrentTest::RunBLAS()
{
    if (!m_run_blas)
        return;

    if (m_run_blas_in_thread) {
        int num_threads = 1;
        for (int i = 0; i < num_threads; i++) {
            // Capture i by value: a reference capture could read a stale loop
            // variable by the time the thread actually starts.
            m_workers.push_back(std::thread([this, i]() {
                std::cout << "thread function.. " << i << "\n";
                RunBLASSgemm(i);
            }));
        }
        /* std::for_each(m_workers.begin(), m_workers.end(), [](std::thread &t) {
               t.join();
           }); */
    } else {
        RunBLASSgemm(0);
    }
    std::cout << "RunBLAS done..." << std::endl;
}

void ConcurrentTest::RunInference()
{
    // Run inference.
    RVLayerProfiler rvlayer_profiler;
    char buff[100];
    sprintf(buff, "CustomBenchmark - %ld", (long)pthread_self());
    rvlayer_profiler.SetName(buff);

    int kernel_size = m_kernel_sizes.h() * m_kernel_sizes.w();
    m_runtime = nvinfer1::createInferRuntime(m_logger);
    if (nvinfer1::DeviceType::kDLA == m_device_type)
        m_runtime->setDLACore(0);

    void *buffers[2];
    const int input_index = m_engine->getBindingIndex(CUSTOM_BENCHMARK_NETWORK_INPUT_NAME);
    const int output_index = m_engine->getBindingIndex(CUSTOM_BENCHMARK_NETWORK_OUTPUT_NAME);
    buffers[input_index] = (void *)m_device_input_image;
    buffers[output_index] = (void *)m_device_output_image;

    nvinfer1::IExecutionContext *execution_engine = m_engine->createExecutionContext();

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Warm up.
    for (int i = 0; i < 10; i++)
        execution_engine->enqueue(m_batch_size, buffers, stream, nullptr);
    cudaStreamSynchronize(stream);

    execution_engine->setProfiler(&rvlayer_profiler);
    cudaProfilerStart(); // Start NVIDIA's nvprof if relevant.
    cudaDeviceSynchronize();
    cout << "Start running...." << endl;

    // These are only used by the commented-out cuBLAS variant in the blas
    // thread below.
    cublasStatus_t status;
    int n2 = BLAS_N * BLAS_N;
    float *dummy = m_blas_d_A;

    cout << "N = " << N << endl;
    m_blas_time_ms = 0.f;
    m_blas_average_time_ms = 0.f;

    GpuTimer gpu_timer(stream);
    gpu_timer.start();

    /* typedef enum {
           EConvolutionOnly = 0,
           ECUDAOnly,
           EConvolutionFollowedByCUDA,
           EConvolutionAndCUDAConcurrently
       } EConcurrentTestType; */
    auto tStart = std::chrono::high_resolution_clock::now();
    EConcurrentTestType test_type = EConvolutionAndCUDAConcurrently;

    // Immediately run TensorRT (on the Tensor Cores where applicable).
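    // Concurrency layout (for EConvolutionAndCUDAConcurrently): one host
    // thread enqueues the TensorRT convolution on `stream` while a second host
    // thread launches the raw CUDA kernel on `m_blas_stream`. Two streams plus
    // two host threads let both workloads be issued, and potentially execute,
    // concurrently; the other enum values serialize or isolate the workloads
    // for comparison.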
    std::thread trt([&]() {
        if ((EConvolutionOnly == test_type) ||
            (EConvolutionFollowedByCUDA == test_type) ||
            (EConvolutionAndCUDAConcurrently == test_type)) {
            for (int i = 0; i < m_timing_loop_count; i++) {
                execution_engine->enqueue(m_batch_size, buffers, stream, nullptr);
                // cudaStreamSynchronize(stream);
            }
        }
        if (EConvolutionFollowedByCUDA == test_type) {
            for (int i = 0; i < m_blas_timing_loop_count; i++)
                kernel<<<(N + 255) / 256, 256, 0, stream>>>(m_dummy_data, N);
        }
        cudaStreamSynchronize(stream);
    });

    std::thread blas([&]() {
        if ((ECUDAOnly == test_type) || (EConvolutionAndCUDAConcurrently == test_type)) {
            for (int i = 0; i < m_blas_timing_loop_count; i++) {
                std::cout << "Running kernel: [" << i << "/" << m_blas_timing_loop_count << "]" << std::endl;
                kernel<<<(N + 255) / 256, 256, 0, m_blas_stream>>>(m_dummy_data, N);
                cudaStreamSynchronize(m_blas_stream);
            }
        }
        // cublasSetStream(m_blas_handle, m_blas_stream);
        // for (int i = 0; i < 20; i++) {
        //     status = cublasSgemm(m_blas_handle, CUBLAS_OP_N, CUBLAS_OP_N, BLAS_N, BLAS_N, BLAS_N,
        //                          &m_blas_alpha, m_blas_d_A, BLAS_N, m_blas_d_B, BLAS_N,
        //                          &m_blas_beta, m_blas_d_C, BLAS_N);
        //     m_blas_alpha += (i * 1.f) / 5.f;
        //     m_blas_beta += (i * 1.f) * 1.24f;
        //     dummy += i * 10;
        // }
    });

    // cudaStreamSynchronize(stream);
    // cudaStreamSynchronize(m_blas_stream);
    trt.join();
    blas.join();
    cudaDeviceSynchronize();

    auto tEnd = std::chrono::high_resolution_clock::now();
    auto totalHost = std::chrono::duration<float, std::milli>(tEnd - tStart).count();
    cudaDeviceSynchronize();
    gpu_timer.stop();

    std::cout << "Host result: " << m_blas_h_C[0] << ", " << m_blas_h_C[500] << ", "
              << m_blas_h_C[1000] << std::endl;
    cout << "TensorRT done..." << endl;

    // All streams are done.
    // Synchronize the CUDA work on the regular cores and TensorRT on the Tensor Core hardware.
    cudaError_t err = cudaDeviceSynchronize();
    cout << "All done... error: " << err << endl;
    cudaProfilerStop(); // Stop NVIDIA's nvprof.
    cudaDeviceSynchronize();

    // Save and display the timing statistics.
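    // Note on the numbers below: gpu_timer was started on `stream` before the
    // worker threads launched and stopped only after both joined, so m_time_ms
    // covers the combined run. m_average_time_ms divides by
    // m_timing_loop_count, i.e. it is a per-enqueue figure for the TensorRT
    // loop only; the CUDA-kernel loop has no separate timer in this path.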
    m_layer_count = rvlayer_profiler.GetLayerCount();
    m_time_ms = gpu_timer.milliseconds();
    m_average_time_ms = gpu_timer.milliseconds() / m_timing_loop_count;
    cout << "CUDA error: " << err << endl;
    cout << "\tTotal host             : [" << totalHost << " ms]" << endl;
    cout << "\tGPU Total time         : [" << m_time_ms << " ms]" << endl;
    cout << "\tGPU timer average      : [" << m_average_time_ms << " ms]" << endl;
    cout << "\tGPU BLAS Total time    : [" << m_blas_time_ms << " ms]" << endl;
    cout << "\tGPU BLAS timer average : [" << m_blas_average_time_ms << " ms]" << endl;
}

void ConcurrentTest::InitializeBLAS()
{
    m_run_blas = true;
    m_run_blas_in_thread = true;
    CHECK(cudaStreamCreate(&m_blas_stream));
    cublasStatus_t status = cublasCreate(&m_blas_handle);

    int i;
    int n2 = BLAS_N * BLAS_N;
    m_blas_h_A = m_blas_h_B = m_blas_h_C = m_blas_d_A = m_blas_d_B = m_blas_d_C = nullptr;
    m_blas_h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(m_blas_h_A[0])));
    m_blas_h_B = reinterpret_cast<float *>(malloc(n2 * sizeof(m_blas_h_B[0])));
    m_blas_h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(m_blas_h_C[0])));
    for (i = 0; i < n2; i++) {
        m_blas_h_A[i] = rand() / static_cast<float>(RAND_MAX);
        m_blas_h_B[i] = rand() / static_cast<float>(RAND_MAX);
        m_blas_h_C[i] = rand() / static_cast<float>(RAND_MAX);
    }
    cudaMalloc(reinterpret_cast<void **>(&m_blas_d_A), n2 * sizeof(m_blas_d_A[0]));
    cudaMalloc(reinterpret_cast<void **>(&m_blas_d_B), n2 * sizeof(m_blas_d_B[0]));
    cudaMalloc(reinterpret_cast<void **>(&m_blas_d_C), n2 * sizeof(m_blas_d_C[0]));
    if (m_blas_h_A == nullptr || m_blas_h_B == nullptr || m_blas_h_C == nullptr ||
        m_blas_d_A == nullptr || m_blas_d_B == nullptr || m_blas_d_C == nullptr) {
        std::cout << "Could not create BLAS objects." << std::endl;
        exit(1);
    }

    int st = (int)cublasSetVector(n2, sizeof(m_blas_h_A[0]), m_blas_h_A, 1, m_blas_d_A, 1);
    st += (int)cublasSetVector(n2, sizeof(m_blas_h_B[0]), m_blas_h_B, 1, m_blas_d_B, 1);
    st += (int)cublasSetVector(n2, sizeof(m_blas_h_C[0]), m_blas_h_C, 1, m_blas_d_C, 1);
    if (st != CUBLAS_STATUS_SUCCESS) {
        std::cout << "Error in cublasSetVector" << std::endl;
        exit(1);
    }

    cudaMalloc(reinterpret_cast<void **>(&m_dummy_data), N * sizeof(m_dummy_data[0]));
}

void ConcurrentTest::Run()
{
    PrepareKernels();
    DisplayConfiguration();
    InitializeBLAS();
    cout << "Load input" << endl;
    LoadInput();
    cout << "Build kernel" << endl;
    BuildKernel();
    cout << "Build network" << endl;
    BuildNetwork();
    cout << "Run inference" << endl;
    RunInference();
    cout << "Done." << endl;
}

void ConcurrentTest::Run(int argc, char **argv)
{
    // Parse command line parameters.
    ParseArguments(argc, argv);
    Run();
}

/**
 * MAIN.
 */
int main(int argc, char **argv)
{
    /* int num_threads = 1;
       std::vector<std::thread> workers;
       for (int i = 0; i < num_threads; i++) {
           workers.push_back(std::thread([&]() {
               std::cout << "thread function.. " << i << "\n";
               ConcurrentTest benchmark;
               benchmark.Run(argc, argv);
           }));
       }
       std::for_each(workers.begin(), workers.end(), [](std::thread &t) {
           t.join();
       }); */

    float *m_dummy_data;
    cudaMalloc(reinterpret_cast<void **>(&m_dummy_data), N * sizeof(m_dummy_data[0]));
    // kernel<<<(N + 255) / 256, 256>>>(m_dummy_data, N);
    // Launch the kernel with a single thread so the grid-stride loop walks all
    // N elements serially; this keeps the GPU busy for a long, measurable time.
    for (int i = 0; i < 50; i++) {
        cout << "[" << i << "/50]" << endl;
        kernel<<<1, 1>>>(m_dummy_data, N);
        cudaDeviceSynchronize();
    }
    cout << endl;
}
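// A plausible build line (assumptions: this translation unit is named
// concurrentTest.cu, and TensorRT, cuBLAS and the CUDA toolkit are on the
// default search paths):
//
//   nvcc -std=c++11 -o a.out concurrentTest.cu -lnvinfer -lcublas -lpthread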