#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>
#include <cudnn.h>

static int CheckCudaError(
    cudaError_t code, const char *expr, const char *file, int line) {
  if (code) {
    printf("CUDA error at %s:%d, code=%d (%s) in '%s'\n", file, line,
        (int)code, cudaGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

static int CheckCudnnError(
    cudnnStatus_t code, const char *expr, const char *file, int line) {
  if (code) {
    printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line,
        (int)code, cudnnGetErrorString(code), expr);
    return 1;
  }
  return 0;
}

#define CHECK_CUDA_ERR(...)                                                  \
  do {                                                                       \
    int err = CheckCudaError(                                                \
        __VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);                      \
    if (err) exit(1);                                                        \
  } while (0)

#define CHECK_CUDNN_ERR(...)                                                 \
  do {                                                                       \
    int err = CheckCudnnError(                                               \
        __VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);                      \
    if (err) exit(1);                                                        \
  } while (0)

/*
   tensor_conv_src    tensor_conv_weight
              \          /
               \        /
                op_conv
                   |
          tensor_conv_dst (virtual)    tensor_biasadd_bias
                   |                  /
                   |                 /
               op_bias_add
                   |
         tensor_bias_add_dst (virtual)
                   |
            op_pointwise_relu
                   |
         tensor_pointwise_relu_dst
*/

int cudnn_cbr() {
  //===-------- Create cuDNN runtime context ----===//
  // create cudnn handle
  cudnnHandle_t handle;
  CHECK_CUDNN_ERR(cudnnCreate(&handle));
  // create cuda stream
  cudaStream_t s;
  CHECK_CUDA_ERR(cudaStreamCreate(&s));
  // bind cuda stream to cudnn handle
  CHECK_CUDNN_ERR(cudnnSetStream(handle, s));

  //===-------- Initialize problem configurations ----===//
  const int64_t batch_size = 8;
  const int64_t src_feature_h = 64;
  const int64_t src_feature_w = 64;
  const int64_t src_channels = 4;
  const int64_t filter_h = 1;
  const int64_t filter_w = 1;
  const int64_t stride = 1;
  const int64_t padding = 0;
  const int64_t dilation = 1;
  const int64_t dst_channels = 16;
  const int64_t dst_feature_h = 64;
  const int64_t dst_feature_w = 64;
  const int64_t alignment = 16;

  //===-------- Create tensors in the CBR graph problem ----===//
  // create convolution src tensor
  cudnnBackendDescriptor_t tensor_conv_src;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_conv_src));
  cudnnDataType_t conv_src_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_src,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &conv_src_dtype));
  // dims are given in NCHW order; the strides describe an NHWC (channels-last)
  // memory layout
  int64_t conv_src_dims[] = {batch_size, src_channels, src_feature_h,
      src_feature_w};
  int64_t conv_src_strides[] = {src_channels * src_feature_h * src_feature_w,
      1, src_feature_w * src_channels, src_channels};
  int64_t conv_src_uid = 0;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_src,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, conv_src_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_src,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, conv_src_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_src,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &conv_src_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_src,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_conv_src));

  // create convolution weight tensor
  cudnnBackendDescriptor_t tensor_conv_wei;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_conv_wei));
  cudnnDataType_t conv_wei_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_wei,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &conv_wei_dtype));
  // filter dims are KCRS; the strides give a channels-last (KRSC) layout to
  // match the NHWC activations
  int64_t conv_wei_dims[] = {dst_channels, src_channels, filter_h, filter_w};
  int64_t conv_wei_strides[] = {src_channels * filter_h * filter_w, 1,
      filter_w *
          src_channels, src_channels};
  int64_t conv_wei_uid = 1;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_wei,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, conv_wei_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_wei,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, conv_wei_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_wei,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &conv_wei_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_wei,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_conv_wei));

  // create convolution dst tensor
  cudnnBackendDescriptor_t tensor_conv_dst;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_conv_dst));
  cudnnDataType_t conv_dst_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &conv_dst_dtype));
  int64_t conv_dst_dims[] = {batch_size, dst_channels, dst_feature_h,
      dst_feature_w};
  int64_t conv_dst_strides[] = {dst_channels * dst_feature_h * dst_feature_w,
      1, dst_feature_w * dst_channels, dst_channels};
  int64_t conv_dst_uid = 2;
  bool conv_dst_is_virtual = true;  // conv dst can be a virtual tensor
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, conv_dst_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, conv_dst_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &conv_dst_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_conv_dst,
      CUDNN_ATTR_TENSOR_IS_VIRTUAL, CUDNN_TYPE_BOOLEAN, 1,
      &conv_dst_is_virtual));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_conv_dst));

  // create biasadd bias tensor
  cudnnBackendDescriptor_t tensor_biasadd_bias;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_biasadd_bias));
  cudnnDataType_t biasadd_bias_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_bias,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1,
      &biasadd_bias_dtype));
  int64_t biasadd_bias_dims[] = {1, dst_channels, 1, 1};
  int64_t biasadd_bias_strides[] = {dst_channels, 1, 1, 1};
  int64_t biasadd_bias_uid = 3;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_bias,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, biasadd_bias_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_bias,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, biasadd_bias_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_bias,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &biasadd_bias_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_bias,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_biasadd_bias));

  // create biasadd dst tensor
  cudnnBackendDescriptor_t tensor_biasadd_dst;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_biasadd_dst));
  cudnnDataType_t biasadd_dst_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1,
      &biasadd_dst_dtype));
  int64_t biasadd_dst_uid = 4;
  bool biasadd_dst_is_virtual = true;  // biasadd dst can be a virtual tensor
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, conv_dst_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, conv_dst_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &biasadd_dst_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_biasadd_dst,
      CUDNN_ATTR_TENSOR_IS_VIRTUAL, CUDNN_TYPE_BOOLEAN, 1,
      &biasadd_dst_is_virtual));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_biasadd_dst));

  // create relu dst tensor
  cudnnBackendDescriptor_t tensor_relu_dst;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor_relu_dst));
  cudnnDataType_t relu_dst_dtype = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_relu_dst,
      CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &relu_dst_dtype));
  int64_t relu_dst_uid = 5;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_relu_dst,
      CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, conv_dst_dims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_relu_dst,
      CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, conv_dst_strides));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_relu_dst,
      CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &relu_dst_uid));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(tensor_relu_dst,
      CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(tensor_relu_dst));

  //===-------- Create operations in the CBR graph problem ----===//
  // create the convolution operation descriptor
  cudnnBackendDescriptor_t conv_fwd_desc;
  int64_t spatial_ndims = 2;
  cudnnDataType_t compute_type = CUDNN_DATA_FLOAT;
  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
  int64_t pad[] = {padding, padding};
  int64_t stri[] = {stride, stride};
  int64_t dila[] = {dilation, dilation};
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &conv_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, CUDNN_TYPE_INT64, 1,
      &spatial_ndims));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1,
      &compute_type));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1,
      &mode));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, CUDNN_TYPE_INT64, spatial_ndims,
      pad));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, CUDNN_TYPE_INT64, spatial_ndims,
      pad));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_TYPE_INT64, spatial_ndims,
      dila));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_desc,
      CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64, spatial_ndims,
      stri));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(conv_fwd_desc));

  // create the convolution operation
  cudnnBackendDescriptor_t conv_fwd_op;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &conv_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_op,
      CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &tensor_conv_src));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_op,
      CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &tensor_conv_wei));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_op,
      CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &tensor_conv_dst));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(conv_fwd_op,
      CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &conv_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(conv_fwd_op));

  // create the pointwise biasadd operation descriptor
  cudnnBackendDescriptor_t biasadd_fwd_desc;
  cudnnPointwiseMode_t biasadd_mode = CUDNN_POINTWISE_ADD;
  cudnnDataType_t biasadd_math_prec = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &biasadd_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_desc,
      CUDNN_ATTR_POINTWISE_MODE, CUDNN_TYPE_POINTWISE_MODE, 1,
      &biasadd_mode));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_desc,
      CUDNN_ATTR_POINTWISE_MATH_PREC, CUDNN_TYPE_DATA_TYPE, 1,
      &biasadd_math_prec));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(biasadd_fwd_desc));

  // create the pointwise biasadd operation
  cudnnBackendDescriptor_t biasadd_fwd_op;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, &biasadd_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &biasadd_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_XDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &tensor_conv_dst));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_BDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &tensor_biasadd_bias));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(biasadd_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_YDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &tensor_biasadd_dst));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(biasadd_fwd_op));

  // create the pointwise relu operation descriptor
  cudnnBackendDescriptor_t relu_fwd_desc;
  cudnnPointwiseMode_t relu_mode = CUDNN_POINTWISE_RELU_FWD;
  cudnnDataType_t relu_math_prec = CUDNN_DATA_FLOAT;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &relu_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(relu_fwd_desc,
      CUDNN_ATTR_POINTWISE_MODE, CUDNN_TYPE_POINTWISE_MODE, 1, &relu_mode));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(relu_fwd_desc,
      CUDNN_ATTR_POINTWISE_MATH_PREC, CUDNN_TYPE_DATA_TYPE, 1,
      &relu_math_prec));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(relu_fwd_desc));

  // create the pointwise relu operation
  cudnnBackendDescriptor_t relu_fwd_op;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, &relu_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(relu_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
      CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &relu_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(relu_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_XDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &tensor_biasadd_dst));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(relu_fwd_op,
      CUDNN_ATTR_OPERATION_POINTWISE_YDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &tensor_relu_dst));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(relu_fwd_op));

  //===-------- Create the operation graph for the CBR graph problem --===//
  // create an operation graph of convolution -> biasadd -> relu
  cudnnBackendDescriptor_t op_graph;
  cudnnBackendDescriptor_t ops[] = {conv_fwd_op, biasadd_fwd_op, relu_fwd_op};
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(op_graph,
      CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 3, ops));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(op_graph,
      CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(op_graph));

  int64_t global_count = 0;
  CHECK_CUDNN_ERR(cudnnBackendGetAttribute(op_graph,
      CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, CUDNN_TYPE_INT64, 1,
      NULL, &global_count));
  printf("Global engine count for the operation graph: %ld\n",
      (long)global_count);

  //===-------- Create an engine ----===//
  cudnnBackendDescriptor_t engine;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(engine,
      CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &op_graph));
  // pick an engine by its global index (here: the last one in the list)
  int64_t global_idx = global_count - 1;
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(engine,
      CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &global_idx));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(engine));

  //===-------- Create an engine config ----===//
  cudnnBackendDescriptor_t engine_cfg;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engine_cfg));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(engine_cfg,
      CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1,
      &engine));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(engine_cfg));

  //===-------- Create a plan ----===//
  cudnnBackendDescriptor_t plan;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(plan,
      CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_TYPE_BACKEND_DESCRIPTOR,
      1, &engine_cfg));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(plan,
      CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(plan));

  int64_t workspace_size = 0;
  CHECK_CUDNN_ERR(cudnnBackendGetAttribute(plan,
      CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL,
      &workspace_size));

  //===-------- Allocate the data buffers on device ----===//
  // conv src
  const size_t conv_src_size =
      batch_size * src_channels * src_feature_h * src_feature_w;
  const size_t conv_src_bsize = conv_src_size * sizeof(float);
  void *conv_src_data = nullptr;
  CHECK_CUDA_ERR(cudaMalloc(&conv_src_data, conv_src_bsize));
  // conv wei
  const size_t conv_wei_size =
      dst_channels * src_channels * filter_h * filter_w;
  const size_t conv_wei_bsize = conv_wei_size * sizeof(float);
  void *conv_wei_data = nullptr;
  CHECK_CUDA_ERR(cudaMalloc(&conv_wei_data, conv_wei_bsize));
  // biasadd bias
  const size_t biasadd_bias_size = dst_channels;
  const size_t biasadd_bias_bsize = dst_channels * sizeof(float);
  void *biasadd_bias_data = nullptr;
  CHECK_CUDA_ERR(cudaMalloc(&biasadd_bias_data, biasadd_bias_bsize));
  // because the conv and biasadd dst tensors are virtual, no device memory
  // needs to be allocated for them
  // relu dst
  const size_t relu_dst_size =
      batch_size * dst_channels * dst_feature_h * dst_feature_w;
  const size_t relu_dst_bsize = relu_dst_size * sizeof(float);
  void *relu_dst_data = nullptr;
  CHECK_CUDA_ERR(cudaMalloc(&relu_dst_data, relu_dst_bsize));
  // workspace
  void *workspace_data = nullptr;
  if (workspace_size)
    CHECK_CUDA_ERR(cudaMalloc(&workspace_data, workspace_size));

  //===-------- Initialize the device data with host buffers ----===//
  // conv src
  std::vector<float> conv_src_data_hst(conv_src_size, 0.f);
  std::generate(conv_src_data_hst.begin(), conv_src_data_hst.end(),
      [n = 0]() mutable {
        return sinf(n++);
      });
  CHECK_CUDA_ERR(cudaMemcpy(conv_src_data, conv_src_data_hst.data(),
      conv_src_bsize, cudaMemcpyHostToDevice));
  // conv weight
  std::vector<float> conv_wei_data_hst(conv_wei_size, 0.f);
  std::generate(conv_wei_data_hst.begin(), conv_wei_data_hst.end(),
      [n = 0]() mutable {
        return cosf(n++);
      });
  CHECK_CUDA_ERR(cudaMemcpy(conv_wei_data, conv_wei_data_hst.data(),
      conv_wei_bsize, cudaMemcpyHostToDevice));
  // biasadd bias
  std::vector<float> biasadd_bias_data_hst(biasadd_bias_size, 0.f);
  std::generate(biasadd_bias_data_hst.begin(), biasadd_bias_data_hst.end(),
      [n = 0]() mutable {
        return tanf(n++);
      });
  CHECK_CUDA_ERR(cudaMemcpy(biasadd_bias_data, biasadd_bias_data_hst.data(),
      biasadd_bias_bsize, cudaMemcpyHostToDevice));
  // relu dst
  std::vector<float> relu_dst_data_hst(relu_dst_size, 0.f);
  CHECK_CUDA_ERR(cudaMemset(relu_dst_data, 0, relu_dst_bsize));
  // workspace
  if (workspace_size)
    CHECK_CUDA_ERR(cudaMemset(workspace_data, 0, workspace_size));

  //===-------- Create a variant pack ----===//
  void *dev_ptrs[4] = {conv_src_data, conv_wei_data, biasadd_bias_data,
      relu_dst_data};
  int64_t uids[4] = {conv_src_uid, conv_wei_uid, biasadd_bias_uid,
      relu_dst_uid};
  cudnnBackendDescriptor_t varpack;
  CHECK_CUDNN_ERR(cudnnBackendCreateDescriptor(
      CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(varpack,
      CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, CUDNN_TYPE_VOID_PTR, 4,
      dev_ptrs));
  CHECK_CUDNN_ERR(cudnnBackendSetAttribute(varpack,
      CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, 4, uids));
  if (workspace_size)
    CHECK_CUDNN_ERR(cudnnBackendSetAttribute(varpack,
        CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1,
        &workspace_data));
  CHECK_CUDNN_ERR(cudnnBackendFinalize(varpack));

  //===-------- Execute the plan with the variant pack ----===//
  CHECK_CUDNN_ERR(cudnnBackendExecute(handle, plan, varpack));
  CHECK_CUDA_ERR(cudaMemcpyAsync(relu_dst_data_hst.data(), relu_dst_data,
      relu_dst_bsize, cudaMemcpyDeviceToHost, s));
  CHECK_CUDA_ERR(cudaStreamSynchronize(s));

  // Free
  if (workspace_data) CHECK_CUDA_ERR(cudaFree(workspace_data));
  CHECK_CUDA_ERR(cudaFree(relu_dst_data));
  CHECK_CUDA_ERR(cudaFree(biasadd_bias_data));
  CHECK_CUDA_ERR(cudaFree(conv_wei_data));
  CHECK_CUDA_ERR(cudaFree(conv_src_data));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(varpack));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(plan));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(engine_cfg));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(engine));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(op_graph));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(relu_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(relu_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(biasadd_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(biasadd_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(conv_fwd_op));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(conv_fwd_desc));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_relu_dst));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_biasadd_dst));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_biasadd_bias));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_conv_dst));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_conv_wei));
  CHECK_CUDNN_ERR(cudnnBackendDestroyDescriptor(tensor_conv_src));
  // release the stream and the cudnn handle created at the top
  CHECK_CUDA_ERR(cudaStreamDestroy(s));
  CHECK_CUDNN_ERR(cudnnDestroy(handle));

  printf("cuDNN CBR sample executed successfully.\n");
  return 0;
}

int main() { return cudnn_cbr(); }
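
// Build sketch (not part of the original listing; the source file name and
// flags are assumptions -- adjust them for your CUDA/cuDNN installation).
// The backend descriptor ("graph") API used above requires cuDNN v8 or newer:
//   nvcc -std=c++14 cudnn_cbr.cpp -o cudnn_cbr -lcudnn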