I tested depthwise convolution with cuDNN in FP16 on Xavier, and I found that FP16 is slower than FP32 for depthwise convolution in both cases below. Is this expected? And does it mean I must use FP32 for inference when the network contains depthwise convolutions, as in MobileNet-v2?
|in_channel|height|width|out_channel|kernel|stride|group|cudnn FP32|cudnn FP16|
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
|480|80|48|480|7|1|480|1.696ms|11.145ms|
|32|400|144|32|7|1|32|1.695ms|5.428ms|
The code is as follows:
// required headers (checkCudaErr / checkCudnnErr are error-checking macros defined elsewhere)
#include <cudnn.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <memory>
#include <cstdio>
#include <cstdlib>
using perf_t = cudnnConvolutionFwdAlgoPerf_t;

int conv_op_process(int& batch, int& in_channel, int& height, int& width, int& kernel_size, int& num_filter, int& stride, int& group, int& iter, bool& half){
int input_size = batch * in_channel * height * width;
float* A = (float*)malloc(sizeof(float) * input_size);
int kernel = kernel_size * kernel_size * in_channel * num_filter;
float* W = (float*)malloc(sizeof(float) * kernel);
int output_size = batch * num_filter * height * width / stride / stride;
float* B = (float*)malloc(sizeof(float) * output_size);
// fill host buffers with deterministic test data
for(int i = 0; i < input_size; i++){
A[i] = i / 10.0f;
}
for(int i = 0; i < kernel; i++){
W[i] = i / 15.0f;
}
for(int i = 0; i < output_size; i++){
B[i] = 1;
}
// alloc cuda memory
void* d_A;
void* d_W;
void* d_B;
checkCudaErr(cudaMalloc(&d_A, sizeof(float) * input_size));
checkCudaErr(cudaMalloc(&d_W, sizeof(float) * kernel));
checkCudaErr(cudaMalloc(&d_B, sizeof(float) * output_size));
// mem for warm up
void* d_A_w;
void* d_W_w;
void* d_B_w;
checkCudaErr(cudaMalloc(&d_A_w, sizeof(float) * input_size));
checkCudaErr(cudaMalloc(&d_W_w, sizeof(float) * kernel));
checkCudaErr(cudaMalloc(&d_B_w, sizeof(float) * output_size));
// memory copy
checkCudaErr(cudaMemcpy(d_A, (void*)A, sizeof(float) * input_size, cudaMemcpyHostToDevice));
checkCudaErr(cudaMemcpy(d_W, (void*)W, sizeof(float) * kernel, cudaMemcpyHostToDevice));
checkCudaErr(cudaMemcpy(d_B, (void*)B, sizeof(float) * output_size, cudaMemcpyHostToDevice));
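// note: the buffers are allocated and copied as FP32 even when the FP16 descriptors below are used,
// so in the half run the device data is just reinterpreted; only the timing is meaningful, not the results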
checkCudaErr(cudaDeviceSynchronize()); // wait for copy finish
struct timeval start, handle_create, end;
// cudnn function init
cudnnHandle_t handle;
cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc;
cudnnFilterDescriptor_t filter_desc;
cudnnConvolutionDescriptor_t conv_op_desc;
// creating the handle is time-consuming, so it is done before the timing starts
checkCudnnErr(cudnnCreate(&handle));
gettimeofday(&start, NULL);
checkCudnnErr(cudnnCreateTensorDescriptor(&input_desc));
checkCudnnErr(cudnnCreateTensorDescriptor(&output_desc));
checkCudnnErr(cudnnCreateFilterDescriptor(&filter_desc));
checkCudnnErr(cudnnCreateConvolutionDescriptor(&conv_op_desc));
checkCudnnErr(cudnnSetConvolutionGroupCount(conv_op_desc, group));
cudnnTensorFormat_t infer_type = CUDNN_TENSOR_NHWC;
int pad = (kernel_size - 1) / 2;
if(half){
printf("Using FP16\n");
// link mem with desc
checkCudnnErr(cudnnSetTensor4dDescriptor(input_desc, infer_type, CUDNN_DATA_HALF, batch, in_channel, height, width));
checkCudnnErr(cudnnSetTensor4dDescriptor(output_desc, infer_type, CUDNN_DATA_HALF, batch, num_filter, int(height/stride), int(width/stride)));
checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_HALF, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
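// note: in the grouped (depthwise) case the convolution compute type below stays CUDNN_DATA_FLOAT,
// so the math is not requested in half precision for that path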
if(group != 1){
checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
}
else{
checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_HALF));
}
// using tensor core
checkCudnnErr( cudnnSetConvolutionMathType(conv_op_desc, CUDNN_TENSOR_OP_MATH));
}
else{
// link mem with desc
checkCudnnErr(cudnnSetTensor4dDescriptor(input_desc, infer_type, CUDNN_DATA_FLOAT, batch, in_channel, height, width));
checkCudnnErr(cudnnSetTensor4dDescriptor(output_desc, infer_type, CUDNN_DATA_FLOAT, batch, num_filter, int(height/stride), int(width/stride)));
checkCudnnErr(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT, infer_type, num_filter, int(in_channel/group), kernel_size, kernel_size));
checkCudnnErr(cudnnSetConvolution2dDescriptor(conv_op_desc, pad, pad, stride, stride, 1, 1, CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
}
printf("conv info %d %d %d %d %d %d %d %d\n", in_channel, height, width, num_filter, kernel_size, pad, stride, group);
// conv op preparation: pick a forward algorithm
// cudnnConvolutionFwdAlgoPerf_t algo_pref;
std::unique_ptr<perf_t[]> perf_algos(new perf_t[8]);
int num_algo = 8;
int num_algo_get = 0;
void* workspace_data_temp;
size_t workspace_data_temp_size = 30 * 1024 * 1024; // 30 MB scratch for the algorithm search
checkCudaErr(cudaMalloc(&workspace_data_temp, workspace_data_temp_size));
cudnnConvolutionFwdAlgo_t algo;
//checkCudnnErr(cudnnGetConvolutionForwardAlgorithm_v7(handle, input_desc, filter_desc, conv_op_desc, output_desc, num_algo, &num_algo_get, perf_algos.get()));
checkCudnnErr(cudnnFindConvolutionForwardAlgorithmEx(handle, input_desc, d_A_w, filter_desc, d_W_w, conv_op_desc, output_desc,d_B_w, num_algo, &num_algo_get, perf_algos.get(), workspace_data_temp, workspace_data_temp_size));
algo = perf_algos[0].algo;
//if(half){
// algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
//}
printf("Algo: %d\n", algo);
size_t workspace_size = 0;
checkCudnnErr(cudnnGetConvolutionForwardWorkspaceSize(handle, input_desc, filter_desc, conv_op_desc, output_desc, algo, &workspace_size));
void* workspace;
checkCudaErr(cudaMalloc(&workspace, workspace_size));
// printf("workspace_size %d\n", (int)workspace_size);
//#define ITER 2
// do conv
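// alpha/beta scaling factors are passed as float even in the FP16 path; cuDNN expects FP32 scaling factors for half tensors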
float alpha = 1;
float beta = 0;
// warm up
for(int i = 0; i < 10; i++){
checkCudnnErr(cudnnConvolutionForward(handle, (void*)&alpha, input_desc, d_A_w, filter_desc, d_W_w, conv_op_desc, algo, workspace, workspace_size, (void*)&beta, output_desc, d_B_w));
checkCudaErr(cudaDeviceSynchronize());
}
gettimeofday(&handle_create, NULL);
for(int i = 0; i < iter; i++){
checkCudnnErr(cudnnConvolutionForward(handle, (void*)&alpha, input_desc, d_A, filter_desc, d_W, conv_op_desc, algo, workspace, workspace_size, (void*)&beta, output_desc, d_B));
checkCudaErr(cudaDeviceSynchronize());
}
gettimeofday(&end, NULL);
int time = 0;
printf("========== Time ==========\n");
//time = 1000000 * (handle_create.tv_sec - start.tv_sec) + (handle_create.tv_usec - start.tv_usec);
//printf("Handle Create: %d us\n", time);
time = 1000000 * (end.tv_sec - handle_create.tv_sec) + (end.tv_usec - handle_create.tv_usec);
printf("Conv Op Run: %d us\n", time/iter);
// copy results back to host
checkCudaErr(cudaMemcpy((void*)B, d_B, sizeof(float) * output_size, cudaMemcpyDeviceToHost));
checkCudaErr(cudaMemcpy((void*)A, d_A, sizeof(float) * input_size, cudaMemcpyDeviceToHost));
// checkCudaErr(cudaMemcpy((void*)B, d_B, sizeof(float) * output_size, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
// for(int i = 0; i < output_size; i++){
// printf("%d -> %f\n", i, B[i]);
// }
// release resources
cudaFree(workspace);
cudaFree(workspace_data_temp);
cudaFree(d_A); cudaFree(d_W); cudaFree(d_B);
cudaFree(d_A_w); cudaFree(d_W_w); cudaFree(d_B_w);
cudnnDestroyTensorDescriptor(input_desc);
cudnnDestroyTensorDescriptor(output_desc);
cudnnDestroyFilterDescriptor(filter_desc);
cudnnDestroyConvolutionDescriptor(conv_op_desc);
cudnnDestroy(handle);
free(A); free(W); free(B);
return 0;
}
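
For completeness, a driver for the two cases in the table could look like this (a minimal sketch, with batch = 1 and iter = 100 assumed):

int main(){
    int batch = 1, iter = 100;
    bool use_half = true; // set to false for the FP32 measurement
    int kernel_size = 7, stride = 1;
    // case 1 from the table: in_channel = 480, 80x48 input, 7x7 depthwise (group == in_channel)
    int in_channel = 480, height = 80, width = 48, num_filter = 480, group = 480;
    conv_op_process(batch, in_channel, height, width, kernel_size, num_filter, stride, group, iter, use_half);
    // case 2 from the table: in_channel = 32, 400x144 input, 7x7 depthwise
    in_channel = 32; height = 400; width = 144; num_filter = 32; group = 32;
    conv_op_process(batch, in_channel, height, width, kernel_size, num_filter, stride, group, iter, use_half);
    return 0;
}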