Strange performance of the TK1 when running Canny edge detection on images.

#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"

#include <math.h>
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "local_timer.h"

/* Frame geometry in pixels; each frame is handled as one 8-bit plane. */
#define PIC_WIDE 720
#define PIC_HIGH 576

/* Host-side buffer holding exactly one PIC_WIDE x PIC_HIGH 8-bit frame. */
uchar image_buf[PIC_WIDE*PIC_HIGH];
/*
 * cv::Mat header constructed over image_buf (no copy is made), so anything
 * written into image_buf is what `image` shows. image_buf must be declared
 * first because this initializer refers to it.
 */
cv::Mat image(PIC_HIGH, PIC_WIDE, CV_8UC1, image_buf);

using namespace cv;

int main(int argc, char **argv)
FILE * video_stream;
int frame_cnt = 200;

#ifdef LINUX
char dir = “./grayx.yuv”;
char dir = “F:\video\grayx.yuv”;

video_stream = fopen(dir,"rb");
uchar *image_d;
uchar *edge_d;
size_t pitch;

cudaMallocPitch((void **)&image_d, &pitch, PIC_WIDE, PIC_HIGH);
cudaMallocPitch((void **)&edge_d, &pitch, PIC_WIDE, PIC_HIGH);

gpu::GpuMat image_device(PIC_HIGH, PIC_WIDE, CV_8UC1, image_d);
gpu::GpuMat edge_device(PIC_HIGH, PIC_WIDE, CV_8UC1, edge_d);
image_device.step = pitch;
edge_device.step = pitch;

local_timer tick;

while (frame_cnt--) {
	fread(image_buf, 1, PIC_WIDE * PIC_HIGH, video_stream);

	if (frame_cnt < 100) {
		cudaMemcpy2D(, image_device.step, image_buf, PIC_WIDE, PIC_WIDE, PIC_HIGH, cudaMemcpyHostToDevice);
		cudaMemcpy2D(image_buf, PIC_WIDE,, edge_device.step, PIC_WIDE, PIC_HIGH, cudaMemcpyDeviceToHost);
		cv::imshow("calculate by gpu", image);
	} else {
		cv::Mat edge;
		cv::Canny(image, edge, 100, 60);
		cv::imshow("calculate by cpu", edge);


return 0;


ubuntu@tegra-ubuntu:/win.doc$ ./open_gpu
cpu used:4.089000
cpu used:7.563000
cpu used:3.510000
cpu used:7.853000
cpu used:12.186000
cpu used:8.828000
cpu used:12.073000
cpu used:6.969000
cpu used:11.691000
cpu used:17.694000
cpu used:20.298000
cpu used:13.972000
cpu used:18.022000
cpu used:8.261000
cpu used:7.643000
cpu used:14.999000
cpu used:14.895000
cpu used:12.109000
cpu used:14.856000
cpu used:15.347000
cpu used:12.306000
cpu used:11.597000
cpu used:12.719000
cpu used:9.762000
cpu used:10.539000
cpu used:11.814000
cpu used:10.585000
cpu used:13.156000
cpu used:13.020000
cpu used:9.084000
cpu used:6.241000
cpu used:7.772000
cpu used:12.249000
cpu used:15.563000
cpu used:19.210000
cpu used:15.026000
cpu used:24.312000
cpu used:12.221000
cpu used:10.420000
cpu used:14.716000
cpu used:13.695000
cpu used:8.812000
cpu used:11.150000
cpu used:11.646000
cpu used:15.473000
cpu used:11.254000
cpu used:12.273000
cpu used:11.653000
cpu used:11.059000
cpu used:10.212000
cpu used:10.790000
cpu used:12.307000
cpu used:14.699000
cpu used:8.389000
cpu used:10.662000
cpu used:9.134000
cpu used:13.741000
cpu used:13.249000
cpu used:12.786000
cpu used:14.688000
cpu used:14.255000
cpu used:11.850000
cpu used:14.063000
cpu used:8.250000
cpu used:10.953000
cpu used:17.525000
cpu used:23.514000
cpu used:18.643000
cpu used:24.222000
cpu used:14.156000
cpu used:12.051000
cpu used:8.943000
cpu used:12.147000
cpu used:13.669000
cpu used:11.916000
cpu used:15.441000
cpu used:11.129000
cpu used:19.107000
cpu used:15.050000
cpu used:14.959000
cpu used:13.670000
cpu used:8.057000
cpu used:9.970000
cpu used:11.532000
cpu used:11.193000
cpu used:11.737000
cpu used:10.739000
cpu used:11.519000
cpu used:11.211000
cpu used:9.933000
cpu used:10.235000
cpu used:13.254000
cpu used:14.557000
cpu used:8.676000
cpu used:11.025000
cpu used:12.146000
cpu used:13.232000
cpu used:9.789000
cpu used:8.878000
cpu used:12.111000
gpu used:46.076000
gpu used:33.636000
gpu used:17.872000
gpu used:29.161000
gpu used:32.852000
gpu used:24.778000
gpu used:21.092000
gpu used:27.290000
gpu used:36.690000
gpu used:35.275000
gpu used:25.791000
gpu used:33.886000
gpu used:48.901000
gpu used:22.725000
gpu used:17.005000
gpu used:24.794000
gpu used:39.010000
gpu used:77.734000
gpu used:34.048000
gpu used:50.446000
gpu used:35.562000
gpu used:53.413000
gpu used:22.575000
gpu used:40.463000
gpu used:35.506000
gpu used:34.397000
gpu used:47.364000
gpu used:31.125000
gpu used:49.176000
gpu used:50.616000
gpu used:22.678000
gpu used:46.243000
gpu used:39.848000
gpu used:22.337000
gpu used:33.471000
gpu used:45.224000
gpu used:22.537000
gpu used:17.752000
gpu used:20.218000
gpu used:41.770000
gpu used:29.653000
gpu used:52.392000
gpu used:32.698000
gpu used:40.081000
gpu used:26.568000
gpu used:42.451000
gpu used:56.393000
gpu used:38.219000
gpu used:69.773000
gpu used:43.150000
gpu used:50.054000
gpu used:23.535000
gpu used:30.788000
gpu used:41.088000
gpu used:33.636000
gpu used:34.332000
gpu used:36.538000
gpu used:51.809000
gpu used:53.773000
gpu used:14.336000
gpu used:16.998000
gpu used:43.380000
gpu used:31.184000
gpu used:54.100000
gpu used:34.882000
gpu used:56.416000
gpu used:40.183000
gpu used:51.487000
gpu used:29.938000
gpu used:39.883000
gpu used:63.509000
gpu used:32.611000
gpu used:60.647000
gpu used:33.345000
gpu used:60.228000
gpu used:34.954000
gpu used:49.390000
gpu used:28.702000
gpu used:36.344000
gpu used:54.155000
gpu used:33.006000
gpu used:53.965000
gpu used:23.096000
gpu used:17.015000
gpu used:21.950000
gpu used:41.598000
gpu used:30.013000
gpu used:33.787000
gpu used:34.660000
gpu used:33.071000
gpu used:21.005000
gpu used:33.828000
gpu used:36.108000
gpu used:34.000000
gpu used:22.926000
gpu used:31.374000
gpu used:55.783000
gpu used:19.838000
gpu used:28.803000
gpu used:28.008000

The GPU is slower than the ARM CPU. Is that right?

Using nvprof, I found that cv::Canny does not use the GPU at all. If I call gpu::Canny instead, the slowest kernel is
edgesHysteresisLocalKernel(PtrStepSzi map, short2* st), which takes about 8 ms; the other kernels take 3 ~ 4 ms,
and finally the transform API takes about 1 ms.

Adding in several unnecessary calls to cudaFree and cudaMalloc, it takes about 25 ms per image.