diff --git a/README.md b/README.md
index 32924cc..d7795e6 100644
--- a/README.md
+++ b/README.md
@@ -13,3 +13,12 @@ RMUA2021战队技术答辩视频链接：[RoboMaster 2021高校人工智能挑
 |robot_sentry|哨岗识别模块|
 |robot_decision|决策模块|
 |robot_detection|装甲识别模块|
+|robot_omni|车端全向感知模块|
+|robot_msgs|自定义的ros消息类型|
+
+## 编译及运行
+哨岗识别模块自成一体，不与其他模块一起编译。
+
+决策模块、装甲识别模块和全向感知模块均依赖`robot_msgs`，编译时务必包含在内。
+
+具体编译步骤参考各个模块的README。
diff --git a/robot_detection/README.md b/robot_detection/README.md
index 0daef6f..67e9603 100755
--- a/robot_detection/README.md
+++ b/robot_detection/README.md
@@ -189,9 +189,11 @@ robot_detection
 单独运行测试：进入工作空间，运行节点，播放rosbag
 
 ```bash
-source devel/setup.bash
 catkin_make
+source devel/setup.bash
 roslaunch robot_detection robot_detection.launch
 rosbag play test.bag
 ```
 
+## 4、网盘链接
+网盘中提供了用于演示功能的rosbag。链接：[https://pan.baidu.com/s/1EDhP30KD0Ao7wxOlbpj_VQ](https://pan.baidu.com/s/1EDhP30KD0Ao7wxOlbpj_VQ?pwd=ft9k)，提取码：ft9k。
diff --git a/robot_msgs/CMakeLists.txt b/robot_msgs/CMakeLists.txt
new file mode 100644
index 0000000..555be98
--- /dev/null
+++ b/robot_msgs/CMakeLists.txt
@@ -0,0 +1,84 @@
+cmake_minimum_required(VERSION 3.1)
+project(robot_msgs)
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR})
+
+find_package(catkin REQUIRED COMPONENTS
+  std_msgs
+  nav_msgs
+  geometry_msgs
+  actionlib
+  message_generation
+)
+
+add_action_files(
+  DIRECTORY action
+  FILES
+)
+
+add_message_files(
+  DIRECTORY msg
+  FILES
+  IMUParam.msg
+  TOFdata.msg
+)
+
+add_message_files(
+  DIRECTORY msg/net
+  FILES
+  
+  ZMQdata.msg
+)
+
+add_message_files(
+  DIRECTORY msg/robot
+  FILES
+  ArmorDetection.msg
+  ChassisParam.msg
+  CornerDetection.msg
+  GimbalAngle.msg
+  OdomGoalMsg.msg
+  RobotFlag.msg
+  RobotPose.msg
+  TwistAccel.msg
+  RobotLocation.msg
+  GlobalMap.msg
+)
+
+add_message_files(
+  DIRECTORY msg/referee
+  FILES
+  GameZone.msg
+  GameBuff.msg
+  GameResult.msg
+  GameRobotHP.msg
+  GameStatus.msg
+  RobotDamage.msg
+  RobotStatus.msg
+)
+
+add_message_files(
+  DIRECTORY msg/wireless
+  FILES
+  SentryData.msg
+  SharePath.msg
+  TeamInfo.msg
+  WirelessParam.msg
+)
+
+add_service_files(
+  DIRECTORY srv
+  FILES
+)
+
+generate_messages(
+  DEPENDENCIES
+  std_msgs
+  geometry_msgs
+  nav_msgs
+)
+
+catkin_package(
+  CATKIN_DEPENDS
+  message_runtime
+)
diff --git a/robot_msgs/msg/IMUParam.msg b/robot_msgs/msg/IMUParam.msg
new file mode 100644
index 0000000..db2fd49
--- /dev/null
+++ b/robot_msgs/msg/IMUParam.msg
@@ -0,0 +1,5 @@
+# imu param
+float32 temperature
+float32 gyro_bias_x
+float32 gyro_bias_y
+float32 gyro_bias_z
\ No newline at end of file
diff --git a/robot_msgs/msg/TOFdata.msg b/robot_msgs/msg/TOFdata.msg
new file mode 100644
index 0000000..c8b8ea4
--- /dev/null
+++ b/robot_msgs/msg/TOFdata.msg
@@ -0,0 +1,6 @@
+uint8 id1
+uint8 id2
+uint8 status1
+uint8 status2
+float64 distance1
+float64 distance2
\ No newline at end of file
diff --git a/robot_msgs/msg/net/ZMQdata.msg b/robot_msgs/msg/net/ZMQdata.msg
new file mode 100644
index 0000000..70185d9
--- /dev/null
+++ b/robot_msgs/msg/net/ZMQdata.msg
@@ -0,0 +1,4 @@
+# ZMQ data
+int16[8] robot_pose_x
+int16[8] robot_pose_y
+  
\ No newline at end of file
diff --git a/robot_msgs/msg/referee/GameBuff.msg b/robot_msgs/msg/referee/GameBuff.msg
new file mode 100644
index 0000000..ff8bcb4
--- /dev/null
+++ b/robot_msgs/msg/referee/GameBuff.msg
@@ -0,0 +1,12 @@
+# game zone array
+GameZone[6] zone
+
+# game robot bullet
+uint16 red1
+uint16 red2
+uint16 blue1
+uint16 blue2
+
+# lurk_mode
+uint8 lurl_mode
+uint8 res
\ No newline at end of file
diff --git a/robot_msgs/msg/referee/GameResult.msg b/robot_msgs/msg/referee/GameResult.msg
new file mode 100644
index 0000000..de3f8d7
--- /dev/null
+++ b/robot_msgs/msg/referee/GameResult.msg
@@ -0,0 +1,5 @@
+#game result
+uint8  DRAW=0
+uint8  RED_WIN=1
+uint8  BLUE_WIN=2
+uint8  result
diff --git a/robot_msgs/msg/referee/GameRobotHP.msg b/robot_msgs/msg/referee/GameRobotHP.msg
new file mode 100644
index 0000000..5183c5f
--- /dev/null
+++ b/robot_msgs/msg/referee/GameRobotHP.msg
@@ -0,0 +1,5 @@
+#game robot hp
+uint16 red1
+uint16 red2
+uint16 blue1
+uint16 blue2
diff --git a/robot_msgs/msg/referee/GameStatus.msg b/robot_msgs/msg/referee/GameStatus.msg
new file mode 100644
index 0000000..1290a41
--- /dev/null
+++ b/robot_msgs/msg/referee/GameStatus.msg
@@ -0,0 +1,12 @@
+#game status
+uint8 PRE_MATCH = 0
+uint8 SETUP = 1
+uint8 INIT = 2
+uint8 FIVE_SEC_CD = 3
+uint8 ROUND = 4
+uint8 CALCULATION = 5
+uint8 game_status
+uint16 remaining_time
+
+
+
diff --git a/robot_msgs/msg/referee/GameZone.msg b/robot_msgs/msg/referee/GameZone.msg
new file mode 100644
index 0000000..9783032
--- /dev/null
+++ b/robot_msgs/msg/referee/GameZone.msg
@@ -0,0 +1,10 @@
+#game zone
+uint8 RED_HP_RECOVERY=1
+uint8 RED_BULLET_SUPPLY=2
+uint8 BLUE_HP_RECOVERY=3
+uint8 BLUE_BULLET_SUPPLY=4
+uint8 DISABLE_SHOOTING=5
+uint8 DISABLE_MOVEMENT=6
+
+uint8 type
+bool active
diff --git a/robot_msgs/msg/referee/RobotDamage.msg b/robot_msgs/msg/referee/RobotDamage.msg
new file mode 100644
index 0000000..020fb7a
--- /dev/null
+++ b/robot_msgs/msg/referee/RobotDamage.msg
@@ -0,0 +1,14 @@
+#robot damage
+uint8 ARMOR = 0
+uint8 OFFLINE = 1
+uint8 EXCEED_SHOOT_SPEED = 2
+uint8 EXCEED_SHOOT_HEAT = 3
+uint8 EXCEED_CHASSIS_POWER = 4
+uint8 OBSTACLE_COLLISION = 5
+uint8 damage_type
+
+uint8 FORWARD = 0
+uint8 LEFT = 1
+uint8 BACKWARD = 2
+uint8 RIGHT = 3
+uint8 damage_source
diff --git a/robot_msgs/msg/referee/RobotStatus.msg b/robot_msgs/msg/referee/RobotStatus.msg
new file mode 100644
index 0000000..b389fdf
--- /dev/null
+++ b/robot_msgs/msg/referee/RobotStatus.msg
@@ -0,0 +1,13 @@
+#robot status
+uint8 id
+uint8 level
+uint16 remain_hp
+uint16 max_hp
+
+uint16 heat_cooling_rate
+uint16 heat_cooling_limit
+uint16 heat_speed_limit
+
+bool gimbal_enable
+bool chassis_enable
+bool shooter_enable
diff --git a/robot_msgs/msg/robot/ArmorDetection.msg b/robot_msgs/msg/robot/ArmorDetection.msg
new file mode 100644
index 0000000..a8e6a85
--- /dev/null
+++ b/robot_msgs/msg/robot/ArmorDetection.msg
@@ -0,0 +1,7 @@
+#armor detected msg
+Header header
+uint8 id
+bool detected_enemy
+float64 enemy_x
+float64 enemy_y
+float64 distance
diff --git a/robot_msgs/msg/robot/ChassisParam.msg b/robot_msgs/msg/robot/ChassisParam.msg
new file mode 100644
index 0000000..e87ff36
--- /dev/null
+++ b/robot_msgs/msg/robot/ChassisParam.msg
@@ -0,0 +1,3 @@
+uint8 robot_id
+uint16 wheel_pwm
+float32 voltage
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/CornerDetection.msg b/robot_msgs/msg/robot/CornerDetection.msg
new file mode 100644
index 0000000..8b03aaa
--- /dev/null
+++ b/robot_msgs/msg/robot/CornerDetection.msg
@@ -0,0 +1,12 @@
+uint8 camera_id    # 相机id号
+uint8 num_find     # 相机找到机器人的数量
+
+uint8 robot_id_1
+float64 position_x_1
+float64 position_y_1
+float64 angular_theta_1
+
+uint8 robot_id_2
+float64 position_x_2
+float64 position_y_2
+float64 angular_theta_2
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/GimbalAngle.msg b/robot_msgs/msg/robot/GimbalAngle.msg
new file mode 100644
index 0000000..e9bcef5
--- /dev/null
+++ b/robot_msgs/msg/robot/GimbalAngle.msg
@@ -0,0 +1,13 @@
+#gimbal feedback angle data
+Header    header
+bool      yaw_mode
+
+float64   yaw_gyro
+float64   yaw_angle
+float64   yaw_ecd_angle
+float64   pitch_angle
+float64   yaw_original
+float64   bullet_speed
+
+uint16    armor_angle
+uint16    distance
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/GlobalMap.msg b/robot_msgs/msg/robot/GlobalMap.msg
new file mode 100644
index 0000000..fe1f8c3
--- /dev/null
+++ b/robot_msgs/msg/robot/GlobalMap.msg
@@ -0,0 +1,5 @@
+#GlobalMap
+Header    header
+RobotLocation[] robots
+RobotLocation[6] msg2decision #0->red1 1->red2 2->blue1 3->blue2 4->gray1 5->gray2
+bool      is_sentry_on
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/OdomGoalMsg.msg b/robot_msgs/msg/robot/OdomGoalMsg.msg
new file mode 100644
index 0000000..a50c3aa
--- /dev/null
+++ b/robot_msgs/msg/robot/OdomGoalMsg.msg
@@ -0,0 +1,4 @@
+float64 x
+float64 y
+float64 yaw
+uint8 mode
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/RobotFlag.msg b/robot_msgs/msg/robot/RobotFlag.msg
new file mode 100644
index 0000000..03b1e6e
--- /dev/null
+++ b/robot_msgs/msg/robot/RobotFlag.msg
@@ -0,0 +1 @@
+bool flag
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/RobotLocation.msg b/robot_msgs/msg/robot/RobotLocation.msg
new file mode 100644
index 0000000..11faf89
--- /dev/null
+++ b/robot_msgs/msg/robot/RobotLocation.msg
@@ -0,0 +1,5 @@
+#Robot_location
+uint8     id
+float32   x   #单位m
+float32   y
+float64   yaw #单位弧度 ==100表示无效
diff --git a/robot_msgs/msg/robot/RobotPose.msg b/robot_msgs/msg/robot/RobotPose.msg
new file mode 100644
index 0000000..0170332
--- /dev/null
+++ b/robot_msgs/msg/robot/RobotPose.msg
@@ -0,0 +1,8 @@
+float32 robot0_x
+float32 robot0_y
+float32 robot1_x
+float32 robot1_y
+float32 robot2_x
+float32 robot2_y
+float32 robot3_x
+float32 robot3_y
\ No newline at end of file
diff --git a/robot_msgs/msg/robot/TwistAccel.msg b/robot_msgs/msg/robot/TwistAccel.msg
new file mode 100644
index 0000000..14b1cca
--- /dev/null
+++ b/robot_msgs/msg/robot/TwistAccel.msg
@@ -0,0 +1,3 @@
+uint8 mode
+geometry_msgs/Twist twist
+geometry_msgs/Accel accel
\ No newline at end of file
diff --git a/robot_msgs/msg/wireless/SentryData.msg b/robot_msgs/msg/wireless/SentryData.msg
new file mode 100644
index 0000000..bcee48d
--- /dev/null
+++ b/robot_msgs/msg/wireless/SentryData.msg
@@ -0,0 +1,7 @@
+# sentry data
+Header     header
+uint8      id
+uint8[4]   lable   
+uint16[4]  x    # mm
+uint16[4]  y    # mm
+int16[4]   yaw  # rad*100
\ No newline at end of file
diff --git a/robot_msgs/msg/wireless/SharePath.msg b/robot_msgs/msg/wireless/SharePath.msg
new file mode 100644
index 0000000..5f4775f
--- /dev/null
+++ b/robot_msgs/msg/wireless/SharePath.msg
@@ -0,0 +1,6 @@
+# 机器人路径规划共享
+Header     header
+uint8      id
+
+uint16[7]  pos_x
+uint16[7]  pos_y
diff --git a/robot_msgs/msg/wireless/TeamInfo.msg b/robot_msgs/msg/wireless/TeamInfo.msg
new file mode 100644
index 0000000..3d9a926
--- /dev/null
+++ b/robot_msgs/msg/wireless/TeamInfo.msg
@@ -0,0 +1,14 @@
+Header     header
+uint8      id
+
+uint8 attack_enemy       #正攻击敌方机器人id
+uint8 being_attacked     #受攻击方向
+uint8 to_buff            #要去的buff点
+uint16 pose_x_teammate
+uint16 pose_y_teammate
+uint16 pose_x_enemy
+uint16 pose_y_enemy
+
+bool teammate_retreat    #队友是否撤退
+uint8 team_mode          #主攻还是辅助的标志
+uint8 decision_enemy_id  #决策选择的进攻目标
diff --git a/robot_msgs/msg/wireless/WirelessParam.msg b/robot_msgs/msg/wireless/WirelessParam.msg
new file mode 100644
index 0000000..9ca801c
--- /dev/null
+++ b/robot_msgs/msg/wireless/WirelessParam.msg
@@ -0,0 +1,14 @@
+# wireless param
+
+uint8 id
+
+# freq range: 0~127
+uint8 freq_tx 
+uint8 freq_rx1
+uint8 freq_rx2
+
+uint8 SPEED_250K = 0
+uint8 SPEED_1M   = 1
+uint8 SPEED_2M   = 2
+uint8 speed
+
diff --git a/robot_msgs/package.xml b/robot_msgs/package.xml
new file mode 100644
index 0000000..fc5fdd3
--- /dev/null
+++ b/robot_msgs/package.xml
@@ -0,0 +1,32 @@
+<package>
+    <name>robot_msgs</name>
+    <version>1.0.0</version>
+    <description>
+        The robot msgs package provides all the messages for RoboMaster AI Robot
+    </description>
+    <author>RoboMaster</author>
+    <maintainer email="robomaster@dji.com">RoboMaster</maintainer>
+    <license>GPL 3.0</license>
+    <url>https://github.com/RoboMaster/RoboRTS</url>
+
+    <buildtool_depend>catkin</buildtool_depend>
+
+    <build_depend>actionlib</build_depend>
+    <build_depend>std_msgs</build_depend>
+    <build_depend>geometry_msgs</build_depend>
+    <build_depend>nav_msgs</build_depend>
+    <build_depend>sensor_msgs</build_depend>
+    <build_depend>message_generation</build_depend>
+
+
+    <run_depend>actionlib</run_depend>
+    <run_depend>std_msgs</run_depend>
+    <run_depend>geometry_msgs</run_depend>
+    <run_depend>nav_msgs</run_depend>
+    <run_depend>sensor_msgs</run_depend>
+    <run_depend>message_runtime</run_depend>
+
+    <export>
+    </export>
+
+</package>
\ No newline at end of file
diff --git a/robot_omni/CMakeLists.txt b/robot_omni/CMakeLists.txt
new file mode 100644
index 0000000..d737d2c
--- /dev/null
+++ b/robot_omni/CMakeLists.txt
@@ -0,0 +1,53 @@
+cmake_minimum_required(VERSION 3.5)
+project(omni_detect)
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_BUILD_TYPE Release)
+
+find_package(catkin REQUIRED COMPONENTS
+  cv_bridge
+  image_transport
+  roscpp
+  sensor_msgs
+  std_msgs
+  roslib
+  robot_msgs
+)
+catkin_package()
+
+include_directories(
+  "/usr/include/eigen3"
+  include/robot_detector
+  ${catkin_INCLUDE_DIRS}
+)
+
+set(CUDA_GEN_CODE "-gencode=arch=compute_75,code=sm_75")
+set(CUDA_TOOLKIT_ROOT_DIR     "/usr/local/cuda-11.3")
+set(CUDNN_DIR    "/usr/local/cuda-11.3/lib64")
+set(TENSORRT_DIR "/home/rmua3/TensorRT-8.0.1.6") #需要修改为本地TensorRT路径
+
+find_package(CUDA REQUIRED)
+find_package(OpenCV 4 REQUIRED)
+
+include_directories(
+    ${OpenCV_INCLUDE_DIRS}
+    ${CUDA_TOOLKIT_ROOT_DIR}/include
+    ${TENSORRT_DIR}/include
+    ${CUDNN_DIR}/include
+)
+
+link_directories(
+    ${TENSORRT_DIR}/lib
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+    ${CUDNN_DIR}/lib
+)
+
+add_subdirectory(src)
+
+add_executable(robot_detector_node 
+               src/robot_detector_node.cpp  
+               )
+target_link_libraries(robot_detector_node
+                      robot_detector_lib
+                      ${catkin_LIBRARIES}
+                      )
+
diff --git a/robot_omni/README.md b/robot_omni/README.md
new file mode 100644
index 0000000..5d9fef9
--- /dev/null
+++ b/robot_omni/README.md
@@ -0,0 +1,82 @@
+# 机器人端全向感知模块
+
+## 目录
+
+- [软件功能介绍](#软件功能介绍)
+- [依赖环境及运行方式](#依赖环境及运行方式)
+
+## 软件功能介绍
+
+机器人端全向感知模块通过安装在机器人上的四个相机，搭配广角镜头可以获取机器人周围的全向视野，通过检测周围的机器人和装甲板，可以辅助规划和决策模块完成动态避障，防绕后等行为。由于通信受限等原因，机器人可能无法稳定地接收来自哨岗的信息，这时候机器人的自主感知能力就显得尤为重要。全向感知模块的检测功能与哨岗类似，但它可以让机器人在不依赖外界信息输入的条件下仍具备一定的自主感知能力，使其更加智能化。另外，由于增加了一个检测信息来源，在机器人接收到的哨岗信息出现明显错误时，可以利用机器人端的全向感知模块及时进行纠正，使整个感知系统更加稳定可靠。
+
+<img src="images/robot.png" style="zoom: 50%;" />
+
+
+
+<img src="images/演示效果.gif" />
+
+### 1 全向感知系统流程
+
+全向感知系统的整个流程可以大致分为两部分：前端目标检测和后端信息整合。在前端，使用4个subscriber分别订阅四个相机图像话题，在这四个回调函数里分别对图像进行检测，使用tensorRT部署YOLO v5s目标检测模型，把当前图像的时间戳和检测结果保存下来。对四个图像的时间戳进行同步判断，保证四个相机的检测结果是近似同一时间获得的。对检测结果中的信息进行整合，提取出真实的机器人结果并对其进行位置估计，最后把多个相机中检测到的重复机器人结果滤除掉，得到最终的自身周围信息并发布。
+
+<img src="images/软件流程图.png" style="zoom: 40%;" />
+
+### 2 YOLO v5检测算法的训练与部署
+
+我们制作了包含600张样本的数据集，标注了机器人和装甲板类别。使用YOLO v5s模型训练，参考了开源项目[tensorRT_Pro](https://github.com/shouxieai/tensorRT_Pro)的部署方案。我们的机载计算设备为nuc幻影峡谷11代i7+2060独显，在实际测试中，对于640×640的输入图片，使用FP16精度tensorRT加速后推理一张图片仅耗时2.8ms，可以满足实时性的要求。
+
+### 3 机器人信息匹配
+
+把检测结果中的robot和armor分离，如果是真实的机器人，那么一定是一个robot框内包含若干个armor框的形式，我们根据这一先验知识对结果进行筛选匹配，获得检测结果中的所有真实机器人。
+
+### 4 目标位置估计
+
+根据前面的目标检测与机器人信息匹配，我们得到了每个机器人在图像中的位置，即边界框。我们取机器人边界框的中心点像素坐标，根据相机坐标系到像素坐标系的公式，相机内参已知，只要获得机器人在相机坐标系下的深度Z，我们就能反推得到其X、Y坐标。
+
+$$
+\left\{\begin{array}{l}
+u=f_{x} \frac{X}{Z}+c_{x} \\
+v=f_{y} \frac{Y}{Z}+c_{y}
+\end{array}\right.
+$$
+
+那这个深度Z如何获得呢，我们采用了一种Rough but Effective的办法：分段线性插值。我们很容易知道检测得到的边界框大小和目标离相机的距离是负相关的，即目标距离越近，边界框越大；目标距离越远，边界框越小。我们取机器人边界框的高度作为自变量，到相机的距离作为输出，在不同距离下进行采样，得到一条分段线性插值曲线，理论上分的段数越多这个曲线拟合越精准，但这里我只采样了5个点。有了这条曲线后，我们输入任意一个边界框高度都能得到一个近似的距离估计值，也就是深度Z。于是由上面这个公式就能得到机器人在相机坐标系下的XYZ坐标，再通过外参转换到Body系下，就获得了近似的目标位置估计。
+
+<img src="images/分段线性插值.png" style="zoom: 50%;" />
+
+## 依赖环境及运行方式
+
+### 1 依赖环境
+
+- CUDA 11.3
+- cuDNN 8.2.1
+- TensorRT 8.0.1.6
+- OpenCV 4.2.0
+- Eigen 3
+
+### 2 运行方式
+
+把omni_detect功能包拷贝至您的工作空间下并编译，可以在config/robot_detector.yaml文件中修改您需要的相机话题名称，也可以直接使用我们提供的rosbag进行测试。
+
+如果您使用我们提供的rosbag：
+
+(1)首先启动roscore，然后设置参数 /use_sim_time
+
+```
+rosparam set /use_sim_time true
+```
+
+(2)然后再启动 rosbag play 节点，加上参数 --clock以发布时钟信息，-l 用于循环播放
+
+```
+rosbag play omni_detect_test.bag --clock -l
+```
+
+(3)启动全向感知节点
+
+```
+roslaunch omni_detect robot_detector_node.launch
+```
+
+## 网盘链接
+网盘中提供了用于演示功能的rosbag。链接：[https://pan.baidu.com/s/1X3mPbuscPGmnNs8ye1JiMA](https://pan.baidu.com/s/1X3mPbuscPGmnNs8ye1JiMA?pwd=neye)，提取码：neye。
diff --git a/robot_omni/YoloModel/RD_final.FP16.trtmodel b/robot_omni/YoloModel/RD_final.FP16.trtmodel
new file mode 100644
index 0000000..97fe6c7
Binary files /dev/null and b/robot_omni/YoloModel/RD_final.FP16.trtmodel differ
diff --git a/robot_omni/YoloModel/RD_final.onnx b/robot_omni/YoloModel/RD_final.onnx
new file mode 100644
index 0000000..39a38f9
Binary files /dev/null and b/robot_omni/YoloModel/RD_final.onnx differ
diff --git a/robot_omni/config/robot_detector.yaml b/robot_omni/config/robot_detector.yaml
new file mode 100644
index 0000000..c27a97b
--- /dev/null
+++ b/robot_omni/config/robot_detector.yaml
@@ -0,0 +1,58 @@
+img1_topic_name: "/mvsua_cam/image_raw2"
+img2_topic_name: "/mvsua_cam/image_raw3"
+img3_topic_name: "/mvsua_cam/image_raw4"
+img4_topic_name: "/mvsua_cam/image_raw5"
+yolo_model_path: "/YoloModel/RD_final"
+cof_threshold: 0.65
+nms_area_threshold: 0.5
+Heights: [58, 66, 84, 118, 236, 400]
+Distances: [5.0, 4.0, 3.0, 2.0, 1.0, 0.5]
+imshow: true
+
+
+In1:
+  [ 551.388706, 0.000000, 622.656311,
+    0.000000, 551.536582, 190.223500,
+    0.000000, 0.000000, 1.000000]
+Dis1: [-0.005405, -0.006993, -0.000466, 0.000419, 0.000000]
+Ex1: 
+    [-0.707107, -0.707107,   0.0000,  -0.0219,
+    0.0000,     0.0000,    -1.0000,   0.1300,
+    0.707107,  -0.707107,   0.0000,  -0.2454,
+    0.0000,     0.0000,     0.0000,   1.0000]
+
+
+In2:
+  [ 551.816816, 0.000000, 569.960497,
+    0.000000, 551.736399, 94.749160,
+    0.000000, 0.000000, 1.000000]
+Dis2: [-0.006035, -0.007303, -0.000733, 0.000238, 0.000000]
+Ex2: 
+    [0.707107,  -0.707107,   0.0000,  -0.0219,
+    0.0000,     0.0000,    -1.0000,   0.1300,
+    0.707107,   0.707107,   0.0000,  -0.2454,
+    0.0000,     0.0000,     0.0000,   1.0000]
+
+
+In3:
+  [ 549.305354, 0.000000, 640.951863,
+    0.000000, 549.178482, 146.327399,
+    0.000000, 0.000000, 1.000000]
+Dis3: [-0.005144, -0.007835, 0.000271, -0.000686, 0.000000]
+Ex3: 
+    [0.707107,   0.707107,   0.0000,  -0.0219,
+    0.0000,     0.0000,    -1.0000,   0.1300,
+    -0.707107,   0.707107,   0.0000,  -0.2454,
+    0.0000,     0.0000,     0.0000,   1.0000]
+
+
+In4:
+  [ 550.911025, 0.000000, 629.604206,
+    0.000000, 550.845787, 165.814740,
+    0.000000, 0.000000, 1.000000]
+Dis4: [-0.006915, -0.006944, -0.000547, -0.000318, 0.000000]
+Ex4: 
+    [-0.707107,  0.707107,   0.0000,  -0.0219,
+    0.0000,     0.0000,    -1.0000,   0.1300,
+    -0.707107,  -0.707107,   0.0000,  -0.2454,
+    0.0000,     0.0000,     0.0000,   1.0000]
\ No newline at end of file
diff --git a/robot_omni/images/robot.png b/robot_omni/images/robot.png
new file mode 100644
index 0000000..080e063
Binary files /dev/null and b/robot_omni/images/robot.png differ
diff --git "a/robot_omni/images/\345\210\206\346\256\265\347\272\277\346\200\247\346\217\222\345\200\274.png" "b/robot_omni/images/\345\210\206\346\256\265\347\272\277\346\200\247\346\217\222\345\200\274.png"
new file mode 100644
index 0000000..0abfbbd
Binary files /dev/null and "b/robot_omni/images/\345\210\206\346\256\265\347\272\277\346\200\247\346\217\222\345\200\274.png" differ
diff --git "a/robot_omni/images/\346\274\224\347\244\272\346\225\210\346\236\234.gif" "b/robot_omni/images/\346\274\224\347\244\272\346\225\210\346\236\234.gif"
new file mode 100644
index 0000000..872212c
Binary files /dev/null and "b/robot_omni/images/\346\274\224\347\244\272\346\225\210\346\236\234.gif" differ
diff --git "a/robot_omni/images/\350\275\257\344\273\266\346\265\201\347\250\213\345\233\276.png" "b/robot_omni/images/\350\275\257\344\273\266\346\265\201\347\250\213\345\233\276.png"
new file mode 100644
index 0000000..ef1fbe5
Binary files /dev/null and "b/robot_omni/images/\350\275\257\344\273\266\346\265\201\347\250\213\345\233\276.png" differ
diff --git a/robot_omni/include/robot_detector/RobotDetector.h b/robot_omni/include/robot_detector/RobotDetector.h
new file mode 100644
index 0000000..06c6d27
--- /dev/null
+++ b/robot_omni/include/robot_detector/RobotDetector.h
@@ -0,0 +1,90 @@
+#ifndef ROBOT_DETECTOR_H
+#define ROBOT_DETECTOR_H
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+#include <opencv2/opencv.hpp>
+#include "simple_yolo.hpp"
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <cstring>
+#include <opencv2/dnn/dnn.hpp>
+#include <opencv2/core/eigen.hpp>
+
+namespace robot_detector
+{
+    using namespace std;
+
+    class RobotDetector
+    {
+    public:
+        typedef struct
+        {
+            float confidence;
+            int class_id;
+            cv::Rect box; //left_top(x,y), width, height
+        } Object;
+
+        struct Robot
+        {
+            cv::Rect m_rect;
+            int m_number;
+            int m_color;
+            int m_armornums;
+            float m_color_prb;
+            float m_number_prb;
+            int m_id; //1-red1 2-red2 3-blue1 4-blue2 5-grey 0-unknown
+            cv::Point3f XYZ_camera;
+            cv::Point3f XYZ_body;
+            Robot()
+            {
+                m_number = 0;
+                m_color = 0;
+                m_armornums = 0;
+                m_color_prb = 0.0;
+                m_number_prb = 0.0;
+                m_id = -1;
+                XYZ_camera = {0.0, 0.0, 0.0};
+                XYZ_body = {0.0, 0.0, 0.0};
+            }
+        };
+
+        RobotDetector();
+        ~RobotDetector();
+
+        bool init(const string &model_name, SimpleYolo::Mode precision, const double cof_threshold, const double nms_threshold,
+                  const std::vector<float> heights, const std::vector<float> distances);
+        void load_camera_params(const std::vector<std::vector<double>> intrinsics,
+                                const std::vector<std::vector<double>> extrinsics,
+                                const std::vector<std::vector<double>> discoeffs);
+        bool process_frame(cv::Mat inframe, vector<Object> &detected_objects, const int camera_id);
+        void RobotMatch(const int camera_id, const std::vector<Object> &results, std::vector<Robot> &Robots);
+
+    private:
+        cv::Mat letterBox(cv::Mat src);
+        cv::Rect detect2origin(const cv::Rect &dete_rect, float rate_to, int top, int left);
+        void location_estimation(const int camera_id, std::vector<Robot> &Robots, bool is_robot_from_armors = false);
+
+        //params
+        double _cof_threshold;
+        double _nms_area_threshold;
+        float _ratio;
+        int _topPad;
+        int _btmPad;
+        int _leftPad;
+        int _rightPad;
+
+        std::shared_ptr<SimpleYolo::Infer> _yolo;
+
+        std::vector<cv::Mat> _intrinsic_cvs;
+        std::vector<cv::Mat> _extrinsic_cvs;
+        std::vector<cv::Mat> _discoff_cvs;
+
+        std::vector<float> _distances;
+        std::vector<float> _heights;
+    };
+
+} //namespace
+#endif
\ No newline at end of file
diff --git a/robot_omni/include/robot_detector/simple_yolo.cu b/robot_omni/include/robot_detector/simple_yolo.cu
new file mode 100644
index 0000000..43240e7
--- /dev/null
+++ b/robot_omni/include/robot_detector/simple_yolo.cu
@@ -0,0 +1,2349 @@
+
+#include "simple_yolo.hpp"
+#include <NvInfer.h>
+#include <NvOnnxParser.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <future>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <queue>
+#include <functional>
+
+#if defined(_WIN32)
+#	include <Windows.h>
+#   include <wingdi.h>
+#	include <Shlwapi.h>
+#	pragma comment(lib, "shlwapi.lib")
+#	undef min
+#	undef max
+#else
+#	include <dirent.h>
+#	include <sys/types.h>
+#	include <sys/stat.h>
+#	include <unistd.h>
+#   include <stdarg.h>
+#endif
+
+namespace SimpleYolo{
+
+    using namespace nvinfer1;
+    using namespace std;
+    using namespace cv;
+
+    #define CURRENT_DEVICE_ID   -1
+    #define GPU_BLOCK_THREADS  512
+    #define KernelPositionBlock											\
+        int position = (blockDim.x * blockIdx.x + threadIdx.x);		    \
+        if (position >= (edge)) return;
+
+    #define checkCudaRuntime(call) check_runtime(call, #call, __LINE__, __FILE__)
+    static bool check_runtime(cudaError_t e, const char* call, int line, const char *file);
+
+    #define checkCudaKernel(...)                                                                         \
+        __VA_ARGS__;                                                                                     \
+        do{cudaError_t cudaStatus = cudaPeekAtLastError();                                               \
+        if (cudaStatus != cudaSuccess){                                                                  \
+            INFOE("launch failed: %s", cudaGetErrorString(cudaStatus));                                  \
+        }} while(0);
+
+    #define Assert(op)					 \
+        do{                              \
+            bool cond = !(!(op));        \
+            if(!cond){                   \
+                INFOF("Assert failed, " #op);  \
+            }                                  \
+        }while(false)
+
+    /* 修改这个level来实现修改日志输出级别 */
+    #define CURRENT_LOG_LEVEL       LogLevel::Info
+    #define INFOD(...)			__log_func(__FILE__, __LINE__, LogLevel::Debug, __VA_ARGS__)
+    #define INFOV(...)			__log_func(__FILE__, __LINE__, LogLevel::Verbose, __VA_ARGS__)
+    #define INFO(...)			__log_func(__FILE__, __LINE__, LogLevel::Info, __VA_ARGS__)
+    #define INFOW(...)			__log_func(__FILE__, __LINE__, LogLevel::Warning, __VA_ARGS__)
+    #define INFOE(...)			__log_func(__FILE__, __LINE__, LogLevel::Error, __VA_ARGS__)
+    #define INFOF(...)			__log_func(__FILE__, __LINE__, LogLevel::Fatal, __VA_ARGS__)
+
+    enum class NormType : int{
+        None      = 0,
+        MeanStd   = 1,
+        AlphaBeta = 2
+    };
+
+    enum class ChannelType : int{
+        None          = 0,
+        SwapRB        = 1
+    };
+
+    /* 归一化操作，可以支持均值标准差，alpha beta，和swap RB */
+    struct Norm{
+        float mean[3];
+        float std[3];
+        float alpha, beta;
+        NormType type = NormType::None;
+        ChannelType channel_type = ChannelType::None;
+
+        // out = (x * alpha - mean) / std
+        static Norm mean_std(const float mean[3], const float std[3], float alpha = 1/255.0f, ChannelType channel_type=ChannelType::None);
+
+        // out = x * alpha + beta
+        static Norm alpha_beta(float alpha, float beta = 0, ChannelType channel_type=ChannelType::None);
+
+        // None
+        static Norm None();
+    };
+
+    Norm Norm::mean_std(const float mean[3], const float std[3], float alpha, ChannelType channel_type){
+
+        Norm out;
+        out.type  = NormType::MeanStd;
+        out.alpha = alpha;
+        out.channel_type = channel_type;
+        memcpy(out.mean, mean, sizeof(out.mean));
+        memcpy(out.std,  std,  sizeof(out.std));
+        return out;
+    }
+
+    Norm Norm::alpha_beta(float alpha, float beta, ChannelType channel_type){
+
+        Norm out;
+        out.type = NormType::AlphaBeta;
+        out.alpha = alpha;
+        out.beta = beta;
+        out.channel_type = channel_type;
+        return out;
+    }
+
+    Norm Norm::None(){
+        return Norm();
+    }
+
+    /* 构造时设置当前gpuid，析构时修改为原来的gpuid */
+    class AutoDevice{
+    public:
+        AutoDevice(int device_id = 0){
+            cudaGetDevice(&old_);
+            checkCudaRuntime(cudaSetDevice(device_id));
+        }
+
+        virtual ~AutoDevice(){
+            checkCudaRuntime(cudaSetDevice(old_));
+        }
+    
+    private:
+        int old_ = -1;
+    };
+    
+    enum class LogLevel : int{
+        Debug   = 5,
+        Verbose = 4,
+        Info    = 3,
+        Warning = 2,
+        Error   = 1,
+        Fatal   = 0
+    };
+
+    static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...);
+    inline int upbound(int n, int align = 32){return (n + align - 1) / align * align;}
+
+    static bool check_runtime(cudaError_t e, const char* call, int line, const char *file){
+        if (e != cudaSuccess) {
+            INFOE("CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d", call, cudaGetErrorString(e), cudaGetErrorName(e), e, file, line);
+            return false;
+        }
+        return true;
+    }
+
+    #define TRT_STR(v)  #v
+    #define TRT_VERSION_STRING(major, minor, patch, build)   TRT_STR(major) "." TRT_STR(minor) "." TRT_STR(patch) "." TRT_STR(build)
+    const char* trt_version(){
+        return TRT_VERSION_STRING(NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, NV_TENSORRT_BUILD);
+    }
+
+    static bool check_device_id(int device_id){
+        int device_count = -1;
+        checkCudaRuntime(cudaGetDeviceCount(&device_count));
+        if(device_id < 0 || device_id >= device_count){
+            INFOE("Invalid device id: %d, count = %d", device_id, device_count);
+            return false;
+        }
+        return true;
+    }
+
+    static bool exists(const string& path){
+
+    #ifdef _WIN32
+        return ::PathFileExistsA(path.c_str());
+    #else
+        return access(path.c_str(), R_OK) == 0;
+    #endif
+    }
+
+    static const char* level_string(LogLevel level){
+        switch (level){
+            case LogLevel::Debug: return "debug";
+            case LogLevel::Verbose: return "verbo";
+            case LogLevel::Info: return "info";
+            case LogLevel::Warning: return "warn";
+            case LogLevel::Error: return "error";
+            case LogLevel::Fatal: return "fatal";
+            default: return "unknow";
+        }
+    }
+
+    template<typename _T>
+    static string join_dims(const vector<_T>& dims){
+        stringstream output;
+        char buf[64];
+        const char* fmts[] = {"%d", " x %d"};
+        for(int i = 0; i < dims.size(); ++i){
+            snprintf(buf, sizeof(buf), fmts[i != 0], dims[i]);
+            output << buf;
+        }
+        return output.str();
+    }
+
+    static bool save_file(const string& file, const void* data, size_t length){
+
+        FILE* f = fopen(file.c_str(), "wb");
+        if (!f) return false;
+
+        if (data && length > 0){
+            if (fwrite(data, 1, length, f) != length){
+                fclose(f);
+                return false;
+            }
+        }
+        fclose(f);
+        return true;
+    }
+
+    static bool save_file(const string& file, const vector<uint8_t>& data){
+        return save_file(file, data.data(), data.size());
+    }
+
+    static string file_name(const string& path, bool include_suffix){
+
+        if (path.empty()) return "";
+
+        int p = path.rfind('/');
+
+#ifdef U_OS_WINDOWS
+        int e = path.rfind('\\');
+        p = std::max(p, e);
+#endif
+        p += 1;
+
+        //include suffix
+        if (include_suffix)
+            return path.substr(p);
+
+        int u = path.rfind('.');
+        if (u == -1)
+            return path.substr(p);
+
+        if (u <= p) u = path.size();
+        return path.substr(p, u - p);
+    }
+
+    vector<string> glob_image_files(const string& directory){
+
+        /* 检索目录下的所有图像："*.jpg;*.png;*.bmp;*.jpeg;*.tiff" */
+        vector<string> files, output;
+        set<string> pattern_set{"jpg", "png", "bmp", "jpeg", "tiff"};
+
+        if(directory.empty()){
+            INFOE("Glob images from folder failed, folder is empty");
+            return output;
+        }
+
+        try{
+			vector<cv::String> files_;
+			files_.reserve(10000);
+            cv::glob(directory + "/*", files_, true);
+			files.insert(files.end(), files_.begin(), files_.end());
+        }catch(...){
+            INFOE("Glob %s failed", directory.c_str());
+            return output;
+        }
+
+        for(int i = 0; i < files.size(); ++i){
+            auto& file = files[i];
+            int p = file.rfind(".");
+            if(p == -1) continue;
+
+            auto suffix = file.substr(p+1);
+            std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](char c){
+                if(c >= 'A' && c <= 'Z')
+                    c -= 'A' + 'a';
+                return c;
+            });
+            if(pattern_set.find(suffix) != pattern_set.end())
+                output.push_back(file);
+        }
+        return output;
+    }
+
+    static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...){
+
+        if(level > CURRENT_LOG_LEVEL)
+            return;
+
+        va_list vl;
+        va_start(vl, fmt);
+        
+        char buffer[2048];
+        string filename = file_name(file, true);
+        int n = snprintf(buffer, sizeof(buffer), "[%s][%s:%d]:", level_string(level), filename.c_str(), line);
+        vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl);
+
+        fprintf(stdout, "%s\n", buffer);
+        if (level == LogLevel::Fatal) {
+            fflush(stdout);
+            abort();
+        }
+    }
+
+    static dim3 grid_dims(int numJobs) {
+        int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+        return dim3(((numJobs + numBlockThreads - 1) / (float)numBlockThreads));
+    }
+
+    static dim3 block_dims(int numJobs) {
+        return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
+    }
+
+    static int get_device(int device_id){
+        if(device_id != CURRENT_DEVICE_ID){
+            check_device_id(device_id);
+            return device_id;
+        }
+
+        checkCudaRuntime(cudaGetDevice(&device_id));
+        return device_id;
+    }
+
+    void set_device(int device_id) {
+        if (device_id == -1)
+            return;
+
+        checkCudaRuntime(cudaSetDevice(device_id));
+    }
+
+    /////////////////////////////CUDA kernels////////////////////////////////////////////////
+
+    const int NUM_BOX_ELEMENT = 7;      // left, top, right, bottom, confidence, class, keepflag
+    static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
+        *ox = matrix[0] * x + matrix[1] * y + matrix[2];
+        *oy = matrix[3] * x + matrix[4] * y + matrix[5];
+    }
+
+    static __global__ void decode_kernel(float* predict, int num_bboxes, int num_classes, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects){  
+
+        int position = blockDim.x * blockIdx.x + threadIdx.x;
+        if (position >= num_bboxes) return;
+
+        float* pitem     = predict + (5 + num_classes) * position;
+        float objectness = pitem[4];
+        if(objectness < confidence_threshold)
+            return;
+
+        float* class_confidence = pitem + 5;
+        float confidence        = *class_confidence++;
+        int label               = 0;
+        for(int i = 1; i < num_classes; ++i, ++class_confidence){
+            if(*class_confidence > confidence){
+                confidence = *class_confidence;
+                label      = i;
+            }
+        }
+
+        confidence *= objectness;
+        if(confidence < confidence_threshold)
+            return;
+
+        int index = atomicAdd(parray, 1);
+        if(index >= max_objects)
+            return;
+
+        float cx         = *pitem++;
+        float cy         = *pitem++;
+        float width      = *pitem++;
+        float height     = *pitem++;
+        float left   = cx - width * 0.5f;
+        float top    = cy - height * 0.5f;
+        float right  = cx + width * 0.5f;
+        float bottom = cy + height * 0.5f;
+        affine_project(invert_affine_matrix, left,  top,    &left,  &top);
+        affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
+
+        float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
+        *pout_item++ = left;
+        *pout_item++ = top;
+        *pout_item++ = right;
+        *pout_item++ = bottom;
+        *pout_item++ = confidence;
+        *pout_item++ = label;
+        *pout_item++ = 1; // 1 = keep, 0 = ignore
+    }
+
+    static __device__ float box_iou(
+        float aleft, float atop, float aright, float abottom, 
+        float bleft, float btop, float bright, float bbottom
+    ){
+
+        float cleft 	= max(aleft, bleft);
+        float ctop 		= max(atop, btop);
+        float cright 	= min(aright, bright);
+        float cbottom 	= min(abottom, bbottom);
+        
+        float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
+        if(c_area == 0.0f)
+            return 0.0f;
+        
+        float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
+        float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
+        return c_area / (a_area + b_area - c_area);
+    }
+    
+    static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold){
+
+        int position = (blockDim.x * blockIdx.x + threadIdx.x);
+        int count = min((int)*bboxes, max_objects);
+        if (position >= count) 
+            return;
+        
+        // left, top, right, bottom, confidence, class, keepflag
+        float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
+        for(int i = 0; i < count; ++i){
+            float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
+            if(i == position || pcurrent[5] != pitem[5]) continue;
+
+            if(pitem[4] >= pcurrent[4]){
+                if(pitem[4] == pcurrent[4] && i < position)
+                    continue;
+
+                float iou = box_iou(
+                    pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
+                    pitem[0],    pitem[1],    pitem[2],    pitem[3]
+                );
+
+                if(iou > threshold){
+                    pcurrent[6] = 0;  // 1=keep, 0=ignore
+                    return;
+                }
+            }
+        }
+    } 
+
+    static void decode_kernel_invoker(float* predict, int num_bboxes, int num_classes, float confidence_threshold, float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, cudaStream_t stream){
+        
+        auto grid = grid_dims(num_bboxes);
+        auto block = block_dims(num_bboxes);
+
+        /* 如果核函数有波浪线，没关系，他是正常的，你只是看不顺眼罢了 */
+        checkCudaKernel(decode_kernel<<<grid, block, 0, stream>>>(predict, num_bboxes, num_classes, confidence_threshold, invert_affine_matrix, parray, max_objects));
+
+        grid = grid_dims(max_objects);
+        block = block_dims(max_objects);
+        checkCudaKernel(fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold));
+    }
+
+    static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, 
+        uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge){
+
+        int position = blockDim.x * blockIdx.x + threadIdx.x;
+        if (position >= edge) return;
+
+        float m_x1 = warp_affine_matrix_2_3[0];
+        float m_y1 = warp_affine_matrix_2_3[1];
+        float m_z1 = warp_affine_matrix_2_3[2];
+        float m_x2 = warp_affine_matrix_2_3[3];
+        float m_y2 = warp_affine_matrix_2_3[4];
+        float m_z2 = warp_affine_matrix_2_3[5];
+
+        int dx      = position % dst_width;
+        int dy      = position / dst_width;
+        float src_x = m_x1 * dx + m_y1 * dy + m_z1;
+        float src_y = m_x2 * dx + m_y2 * dy + m_z2;
+        float c0, c1, c2;
+
+        if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
+            // out of range
+            c0 = const_value_st;
+            c1 = const_value_st;
+            c2 = const_value_st;
+        }else{
+            int y_low = floorf(src_y);
+            int x_low = floorf(src_x);
+            int y_high = y_low + 1;
+            int x_high = x_low + 1;
+
+            uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
+            float ly    = src_y - y_low;
+            float lx    = src_x - x_low;
+            float hy    = 1 - ly;
+            float hx    = 1 - lx;
+            float w1    = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+            uint8_t* v1 = const_value;
+            uint8_t* v2 = const_value;
+            uint8_t* v3 = const_value;
+            uint8_t* v4 = const_value;
+            if(y_low >= 0){
+                if (x_low >= 0)
+                    v1 = src + y_low * src_line_size + x_low * 3;
+
+                if (x_high < src_width)
+                    v2 = src + y_low * src_line_size + x_high * 3;
+            }
+            
+            if(y_high < src_height){
+                if (x_low >= 0)
+                    v3 = src + y_high * src_line_size + x_low * 3;
+
+                if (x_high < src_width)
+                    v4 = src + y_high * src_line_size + x_high * 3;
+            }
+
+            // same to opencv
+            c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
+            c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
+            c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
+        }
+
+        if(norm.channel_type == ChannelType::SwapRB){
+            float t = c2;
+            c2 = c0;  c0 = t;
+        }
+
+        if(norm.type == NormType::MeanStd){
+            c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
+            c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
+            c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
+        }else if(norm.type == NormType::AlphaBeta){
+            c0 = c0 * norm.alpha + norm.beta;
+            c1 = c1 * norm.alpha + norm.beta;
+            c2 = c2 * norm.alpha + norm.beta;
+        }
+
+        int area = dst_width * dst_height;
+        float* pdst_c0 = dst + dy * dst_width + dx;
+        float* pdst_c1 = pdst_c0 + area;
+        float* pdst_c2 = pdst_c1 + area;
+        *pdst_c0 = c0;
+        *pdst_c1 = c1;
+        *pdst_c2 = c2;
+    }
+
+    static void warp_affine_bilinear_and_normalize_plane(
+        uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
+        float* matrix_2_3, uint8_t const_value, const Norm& norm,
+        cudaStream_t stream) {
+        
+        int jobs   = dst_width * dst_height;
+        auto grid  = grid_dims(jobs);
+        auto block = block_dims(jobs);
+        
+        checkCudaKernel(warp_affine_bilinear_and_normalize_plane_kernel << <grid, block, 0, stream >> > (
+            src, src_line_size,
+            src_width, src_height, dst,
+            dst_width, dst_height, const_value, matrix_2_3, norm, jobs
+        ));
+    }
+
+
+    //////////////////////////////class MixMemory/////////////////////////////////////////////////
+    /* gpu/cpu内存管理
+        自动对gpu和cpu内存进行分配和释放
+        这里的cpu使用的是pinned memory，当对gpu做内存复制时，性能比较好
+        因为是cudaMallocHost分配的，因此他与cuda context有关联
+    */
+    class MixMemory {
+    public:
+        MixMemory(int device_id = CURRENT_DEVICE_ID);
+        MixMemory(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size);
+        virtual ~MixMemory();
+        void* gpu(size_t size);
+        void* cpu(size_t size);
+        void release_gpu();
+        void release_cpu();
+        void release_all();
+
+        inline bool owner_gpu() const{return owner_gpu_;}
+        inline bool owner_cpu() const{return owner_cpu_;}
+
+        inline size_t cpu_size() const{return cpu_size_;}
+        inline size_t gpu_size() const{return gpu_size_;}
+        inline int device_id() const{return device_id_;}
+
+        inline void* gpu() const { return gpu_; }
+
+        // Pinned Memory
+        inline void* cpu() const { return cpu_; }
+
+        void reference_data(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size);
+
+    private:
+        void* cpu_ = nullptr;
+        size_t cpu_size_ = 0;
+        bool owner_cpu_ = true;
+        int device_id_ = 0;
+
+        void* gpu_ = nullptr;
+        size_t gpu_size_ = 0;
+        bool owner_gpu_ = true;
+    };
+
+    MixMemory::MixMemory(int device_id){
+        device_id_ = get_device(device_id);
+    }
+
+    MixMemory::MixMemory(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size){
+        reference_data(cpu, cpu_size, gpu, gpu_size);		
+    }
+
+    void MixMemory::reference_data(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size){
+        release_all();
+        
+        if(cpu == nullptr || cpu_size == 0){
+            cpu = nullptr;
+            cpu_size = 0;
+        }
+
+        if(gpu == nullptr || gpu_size == 0){
+            gpu = nullptr;
+            gpu_size = 0;
+        }
+
+        this->cpu_ = cpu;
+        this->cpu_size_ = cpu_size;
+        this->gpu_ = gpu;
+        this->gpu_size_ = gpu_size;
+
+        this->owner_cpu_ = !(cpu && cpu_size > 0);
+        this->owner_gpu_ = !(gpu && gpu_size > 0);
+        checkCudaRuntime(cudaGetDevice(&device_id_));
+    }
+
+    MixMemory::~MixMemory() {
+        release_all();
+    }
+
+    void* MixMemory::gpu(size_t size) {
+
+        if (gpu_size_ < size) {
+            release_gpu();
+
+            gpu_size_ = size;
+            AutoDevice auto_device_exchange(device_id_);
+            checkCudaRuntime(cudaMalloc(&gpu_, size));
+            checkCudaRuntime(cudaMemset(gpu_, 0, size));
+        }
+        return gpu_;
+    }
+
+    void* MixMemory::cpu(size_t size) {
+
+        if (cpu_size_ < size) {
+            release_cpu();
+
+            cpu_size_ = size;
+            AutoDevice auto_device_exchange(device_id_);
+            checkCudaRuntime(cudaMallocHost(&cpu_, size));
+            Assert(cpu_ != nullptr);
+            memset(cpu_, 0, size);
+        }
+        return cpu_;
+    }
+
+    void MixMemory::release_cpu() {
+        if (cpu_) {
+            if(owner_cpu_){
+                AutoDevice auto_device_exchange(device_id_);
+                checkCudaRuntime(cudaFreeHost(cpu_));
+            }
+            cpu_ = nullptr;
+        }
+        cpu_size_ = 0;
+    }
+
+    void MixMemory::release_gpu() {
+        if (gpu_) {
+            if(owner_gpu_){
+                AutoDevice auto_device_exchange(device_id_);
+                checkCudaRuntime(cudaFree(gpu_));
+            }
+            gpu_ = nullptr;
+        }
+        gpu_size_ = 0;
+    }
+
+    void MixMemory::release_all() {
+        release_cpu();
+        release_gpu();
+    }
+
+    /////////////////////////////////class Tensor////////////////////////////////////////////////
+    /* Tensor类，实现张量的管理
+        由于NN多用张量，必须有个类进行管理才方便，实现内存自动分配，计算索引等等
+        如果要调试，可以执行save_to_file，储存为文件后，在python中加载并查看
+    */
+    enum class DataHead : int{
+        Init   = 0,
+        Device = 1,
+        Host   = 2
+    };
+
+    class Tensor {
+    public:
+        Tensor(const Tensor& other) = delete;
+        Tensor& operator = (const Tensor& other) = delete;
+
+        explicit Tensor(std::shared_ptr<MixMemory> data = nullptr, int device_id = CURRENT_DEVICE_ID);
+        explicit Tensor(int n, int c, int h, int w, std::shared_ptr<MixMemory> data = nullptr, int device_id = CURRENT_DEVICE_ID);
+        explicit Tensor(int ndims, const int* dims, std::shared_ptr<MixMemory> data = nullptr, int device_id = CURRENT_DEVICE_ID);
+        explicit Tensor(const std::vector<int>& dims, std::shared_ptr<MixMemory> data = nullptr, int device_id = CURRENT_DEVICE_ID);
+        virtual ~Tensor();
+
+        int numel() const;
+        inline int ndims() const{return shape_.size();}
+        inline int size(int index)  const{return shape_[index];}
+        inline int shape(int index) const{return shape_[index];}
+
+        inline int batch()   const{return shape_[0];}
+        inline int channel() const{return shape_[1];}
+        inline int height()  const{return shape_[2];}
+        inline int width()   const{return shape_[3];}
+
+        inline const std::vector<int>& dims() const { return shape_; }
+        inline int bytes()                    const { return bytes_; }
+        inline int bytes(int start_axis)      const { return count(start_axis) * element_size(); }
+        inline int element_size()             const { return sizeof(float); }
+        inline DataHead head()                const { return head_; }
+
+        std::shared_ptr<Tensor> clone() const;
+        Tensor& release();
+        Tensor& set_to(float value);
+        bool empty() const;
+
+        template<typename ... _Args>
+        int offset(int index, _Args ... index_args) const{
+            const int index_array[] = {index, index_args...};
+            return offset_array(sizeof...(index_args) + 1, index_array);
+        }
+
+        int offset_array(const std::vector<int>& index) const;
+        int offset_array(size_t size, const int* index_array) const;
+
+        template<typename ... _Args>
+        Tensor& resize(int dim_size, _Args ... dim_size_args){
+            const int dim_size_array[] = {dim_size, dim_size_args...};
+            return resize(sizeof...(dim_size_args) + 1, dim_size_array);
+        }
+
+        Tensor& resize(int ndims, const int* dims);
+        Tensor& resize(const std::vector<int>& dims);
+        Tensor& resize_single_dim(int idim, int size);
+        int  count(int start_axis = 0) const;
+        int device() const{return device_id_;}
+
+        Tensor& to_gpu(bool copy=true);
+        Tensor& to_cpu(bool copy=true);
+
+        inline void* cpu() const { ((Tensor*)this)->to_cpu(); return data_->cpu(); }
+        inline void* gpu() const { ((Tensor*)this)->to_gpu(); return data_->gpu(); }
+        
+        template<typename DType> inline const DType* cpu() const { return (DType*)cpu(); }
+        template<typename DType> inline DType* cpu()             { return (DType*)cpu(); }
+
+        template<typename DType, typename ... _Args> 
+        inline DType* cpu(int i, _Args&& ... args) { return cpu<DType>() + offset(i, args...); }
+
+
+        template<typename DType> inline const DType* gpu() const { return (DType*)gpu(); }
+        template<typename DType> inline DType* gpu()             { return (DType*)gpu(); }
+
+        template<typename DType, typename ... _Args> 
+        inline DType* gpu(int i, _Args&& ... args) { return gpu<DType>() + offset(i, args...); }
+
+        template<typename DType, typename ... _Args> 
+        inline DType& at(int i, _Args&& ... args) { return *(cpu<DType>() + offset(i, args...)); }
+        
+        std::shared_ptr<MixMemory> get_data()             const {return data_;}
+        std::shared_ptr<MixMemory> get_workspace()        const {return workspace_;}
+        Tensor& set_workspace(std::shared_ptr<MixMemory> workspace) {workspace_ = workspace; return *this;}
+
+        cudaStream_t get_stream() const{return stream_;}
+        Tensor& set_stream(cudaStream_t stream){stream_ = stream; return *this;}
+
+        Tensor& set_mat     (int n, const cv::Mat& image);
+        Tensor& set_norm_mat(int n, const cv::Mat& image, float mean[3], float std[3]);
+        cv::Mat at_mat(int n = 0, int c = 0) { return cv::Mat(height(), width(), CV_32F, cpu<float>(n, c)); }
+
+        Tensor& synchronize();
+        const char* shape_string() const{return shape_string_;}
+        const char* descriptor() const;
+
+        Tensor& copy_from_gpu(size_t offset, const void* src, size_t num_element, int device_id = CURRENT_DEVICE_ID);
+
+        /**
+        
+        # 以下代码是python中加载Tensor
+        import numpy as np
+
+        def load_tensor(file):
+            
+            with open(file, "rb") as f:
+                binary_data = f.read()
+
+            magic_number, ndims, dtype = np.frombuffer(binary_data, np.uint32, count=3, offset=0)
+            assert magic_number == 0xFCCFE2E2, f"{file} not a tensor file."
+            
+            dims = np.frombuffer(binary_data, np.uint32, count=ndims, offset=3 * 4)
+
+            if dtype == 0:
+                np_dtype = np.float32
+            elif dtype == 1:
+                np_dtype = np.float16
+            else:
+                assert False, f"Unsupport dtype = {dtype}, can not convert to numpy dtype"
+                
+            return np.frombuffer(binary_data, np_dtype, offset=(ndims + 3) * 4).reshape(*dims)
+
+         **/
+        bool save_to_file(const std::string& file) const;
+
+    private:
+        Tensor& compute_shape_string();
+        Tensor& adajust_memory_by_update_dims_or_type();
+        void setup_data(std::shared_ptr<MixMemory> data);
+
+    private:
+        std::vector<int> shape_;
+        size_t bytes_    = 0;
+        DataHead head_   = DataHead::Init;
+        cudaStream_t stream_ = nullptr;
+        int device_id_   = 0;
+        char shape_string_[100];
+        char descriptor_string_[100];
+        std::shared_ptr<MixMemory> data_;
+        std::shared_ptr<MixMemory> workspace_;
+    };
+
+    Tensor::Tensor(int n, int c, int h, int w, shared_ptr<MixMemory> data, int device_id) {
+        this->device_id_ = get_device(device_id);
+        descriptor_string_[0] = 0;
+        setup_data(data);
+        resize(n, c, h, w);
+    }
+
+    Tensor::Tensor(const std::vector<int>& dims, shared_ptr<MixMemory> data, int device_id){
+        this->device_id_ = get_device(device_id);
+        descriptor_string_[0] = 0;
+        setup_data(data);
+        resize(dims);
+    }
+
+    Tensor::Tensor(int ndims, const int* dims, shared_ptr<MixMemory> data, int device_id) {
+        this->device_id_ = get_device(device_id);
+        descriptor_string_[0] = 0;
+        setup_data(data);
+        resize(ndims, dims);
+    }
+
+    Tensor::Tensor(shared_ptr<MixMemory> data, int device_id){
+        shape_string_[0] = 0;
+        descriptor_string_[0] = 0;
+        this->device_id_ = get_device(device_id);
+        setup_data(data);
+    }
+
+    Tensor::~Tensor() {
+        release();
+    }
+
+    const char* Tensor::descriptor() const{
+        
+        char* descriptor_ptr = (char*)descriptor_string_;
+        int device_id = device();
+        snprintf(descriptor_ptr, sizeof(descriptor_string_), 
+            "Tensor:%p, %s, CUDA:%d", 
+            data_.get(),
+            shape_string_, 
+            device_id
+        );
+        return descriptor_ptr;
+    }
+
+    Tensor& Tensor::compute_shape_string(){
+
+        // clean string
+        shape_string_[0] = 0;
+
+        char* buffer = shape_string_;
+        size_t buffer_size = sizeof(shape_string_);
+        for(int i = 0; i < shape_.size(); ++i){
+
+            int size = 0;
+            if(i < shape_.size() - 1)
+                size = snprintf(buffer, buffer_size, "%d x ", shape_[i]);
+            else
+                size = snprintf(buffer, buffer_size, "%d", shape_[i]);
+
+            buffer += size;
+            buffer_size -= size;
+        }
+        return *this;
+    }
+
+    void Tensor::setup_data(shared_ptr<MixMemory> data){
+        
+        data_ = data;
+        if(data_ == nullptr){
+            data_ = make_shared<MixMemory>(device_id_);
+        }else{
+            device_id_ = data_->device_id();
+        }
+
+        head_ = DataHead::Init;
+        if(data_->cpu()){
+            head_ = DataHead::Host;
+        }
+
+        if(data_->gpu()){
+            head_ = DataHead::Device;
+        }
+    }
+
+    Tensor& Tensor::copy_from_gpu(size_t offset, const void* src, size_t num_element, int device_id){
+
+        if(head_ == DataHead::Init)
+            to_gpu(false);
+
+        size_t offset_location = offset * element_size();
+        if(offset_location >= bytes_){
+            INFOE("Offset location[%lld] >= bytes_[%lld], out of range", offset_location, bytes_);
+            return *this;
+        }
+
+        size_t copyed_bytes = num_element * element_size();
+        size_t remain_bytes = bytes_ - offset_location;
+        if(copyed_bytes > remain_bytes){
+            INFOE("Copyed bytes[%lld] > remain bytes[%lld], out of range", copyed_bytes, remain_bytes);
+            return *this;
+        }
+        
+        if(head_ == DataHead::Device){
+            int current_device_id = get_device(device_id);
+            int gpu_device_id = device();
+            if(current_device_id != gpu_device_id){
+                checkCudaRuntime(cudaMemcpyPeerAsync(gpu<unsigned char>() + offset_location, gpu_device_id, src, current_device_id, copyed_bytes, stream_));
+                //checkCudaRuntime(cudaMemcpyAsync(gpu<unsigned char>() + offset_location, src, copyed_bytes, cudaMemcpyDeviceToDevice, stream_));
+            }
+            else{
+                checkCudaRuntime(cudaMemcpyAsync(gpu<unsigned char>() + offset_location, src, copyed_bytes, cudaMemcpyDeviceToDevice, stream_));
+            }
+        }else if(head_ == DataHead::Host){
+            AutoDevice auto_device_exchange(this->device());
+            checkCudaRuntime(cudaMemcpyAsync(cpu<unsigned char>() + offset_location, src, copyed_bytes, cudaMemcpyDeviceToHost, stream_));
+        }else{
+            INFOE("Unsupport head type %d", head_);
+        }
+        return *this;
+    }
+
+    Tensor& Tensor::release() {
+        data_->release_all();
+        shape_.clear();
+        bytes_ = 0;
+        head_ = DataHead::Init;
+        return *this;
+    }
+
+    bool Tensor::empty() const{
+        return data_->cpu() == nullptr && data_->gpu() == nullptr;
+    }
+
+    int Tensor::count(int start_axis) const {
+
+        if(start_axis >= 0 && start_axis < shape_.size()){
+            int size = 1;
+            for (int i = start_axis; i < shape_.size(); ++i) 
+                size *= shape_[i];
+            return size;
+        }else{
+            return 0;
+        }
+    }
+
+    Tensor& Tensor::resize(const std::vector<int>& dims) {
+        return resize(dims.size(), dims.data());
+    }
+
+    int Tensor::numel() const{
+        int value = shape_.empty() ? 0 : 1;
+        for(int i = 0; i < shape_.size(); ++i){
+            value *= shape_[i];
+        }
+        return value;
+    }
+
+    Tensor& Tensor::resize_single_dim(int idim, int size){
+
+        Assert(idim >= 0 && idim < shape_.size());
+
+        auto new_shape = shape_;
+        new_shape[idim] = size;
+        return resize(new_shape);
+    }
+
+    Tensor& Tensor::resize(int ndims, const int* dims) {
+
+        vector<int> setup_dims(ndims);
+        for(int i = 0; i < ndims; ++i){
+            int dim = dims[i];
+            if(dim == -1){
+                Assert(ndims == shape_.size());
+                dim = shape_[i];
+            }
+            setup_dims[i] = dim;
+        }
+        this->shape_ = setup_dims;
+        this->adajust_memory_by_update_dims_or_type();
+        this->compute_shape_string();
+        return *this;
+    }
+
+    Tensor& Tensor::adajust_memory_by_update_dims_or_type(){
+        
+        int needed_size = this->numel() * element_size();
+        if(needed_size > this->bytes_){
+            head_ = DataHead::Init;
+        }
+        this->bytes_ = needed_size;
+        return *this;
+    }
+
+    Tensor& Tensor::synchronize(){ 
+        AutoDevice auto_device_exchange(this->device());
+        checkCudaRuntime(cudaStreamSynchronize(stream_));
+        return *this;
+    }
+
+    Tensor& Tensor::to_gpu(bool copy) {
+
+        if (head_ == DataHead::Device)
+            return *this;
+
+        head_ = DataHead::Device;
+        data_->gpu(bytes_);
+
+        if (copy && data_->cpu() != nullptr) {
+            AutoDevice auto_device_exchange(this->device());
+            checkCudaRuntime(cudaMemcpyAsync(data_->gpu(), data_->cpu(), bytes_, cudaMemcpyHostToDevice, stream_));
+        }
+        return *this;
+    }
+    
+    Tensor& Tensor::to_cpu(bool copy) {
+
+        if (head_ == DataHead::Host)
+            return *this;
+
+        head_ = DataHead::Host;
+        data_->cpu(bytes_);
+
+        if (copy && data_->gpu() != nullptr) {
+            AutoDevice auto_device_exchange(this->device());
+            checkCudaRuntime(cudaMemcpyAsync(data_->cpu(), data_->gpu(), bytes_, cudaMemcpyDeviceToHost, stream_));
+            checkCudaRuntime(cudaStreamSynchronize(stream_));
+        }
+        return *this;
+    }
+
+    int Tensor::offset_array(size_t size, const int* index_array) const{
+
+        Assert(size <= shape_.size());
+        int value = 0;
+        for(int i = 0; i < shape_.size(); ++i){
+
+            if(i < size)
+                value += index_array[i];
+
+            if(i + 1 < shape_.size())
+                value *= shape_[i+1];
+        }
+        return value;
+    }
+
+    int Tensor::offset_array(const std::vector<int>& index_array) const{
+        return offset_array(index_array.size(), index_array.data());
+    }
+
+    bool Tensor::save_to_file(const std::string& file) const{
+
+        if(empty()) return false;
+
+        FILE* f = fopen(file.c_str(), "wb");
+        if(f == nullptr) return false;
+
+        int ndims = this->ndims();
+        int dtype_ = 0;
+        unsigned int head[3] = {0xFCCFE2E2, ndims, static_cast<unsigned int>(dtype_)};
+        fwrite(head, 1, sizeof(head), f);
+        fwrite(shape_.data(), 1, sizeof(shape_[0]) * shape_.size(), f);
+        fwrite(cpu(), 1, bytes_, f);
+        fclose(f);
+        return true;
+    }
+
+    /////////////////////////////////class TRTInferImpl////////////////////////////////////////////////
+    class Logger : public ILogger {
+    public:
+        virtual void log(Severity severity, const char* msg) noexcept override {
+
+            if (severity == Severity::kINTERNAL_ERROR) {
+                INFOE("NVInfer INTERNAL_ERROR: %s", msg);
+                abort();
+            }else if (severity == Severity::kERROR) {
+                INFOE("NVInfer: %s", msg);
+            }
+            else  if (severity == Severity::kWARNING) {
+                INFOW("NVInfer: %s", msg);
+            }
+            else  if (severity == Severity::kINFO) {
+                INFOD("NVInfer: %s", msg);
+            }
+            else {
+                INFOD("%s", msg);
+            }
+        }
+    };
+    static Logger gLogger;
+
+    template<typename _T>
+    static void destroy_nvidia_pointer(_T* ptr) {
+        if (ptr) ptr->destroy();
+    }
+
+    class EngineContext {
+    public:
+        virtual ~EngineContext() { destroy(); }
+
+        void set_stream(cudaStream_t stream){
+
+            if(owner_stream_){
+                if (stream_) {cudaStreamDestroy(stream_);}
+                owner_stream_ = false;
+            }
+            stream_ = stream;
+        }
+
+        bool build_model(const void* pdata, size_t size) {
+            destroy();
+
+            if(pdata == nullptr || size == 0)
+                return false;
+
+            owner_stream_ = true;
+            checkCudaRuntime(cudaStreamCreate(&stream_));
+            if(stream_ == nullptr)
+                return false;
+
+            runtime_ = shared_ptr<IRuntime>(createInferRuntime(gLogger), destroy_nvidia_pointer<IRuntime>);
+            if (runtime_ == nullptr)
+                return false;
+
+            engine_ = shared_ptr<ICudaEngine>(runtime_->deserializeCudaEngine(pdata, size, nullptr), destroy_nvidia_pointer<ICudaEngine>);
+            if (engine_ == nullptr)
+                return false;
+
+            //runtime_->setDLACore(0);
+            context_ = shared_ptr<IExecutionContext>(engine_->createExecutionContext(), destroy_nvidia_pointer<IExecutionContext>);
+            return context_ != nullptr;
+        }
+
+    private:
+        void destroy() {
+            context_.reset();
+            engine_.reset();
+            runtime_.reset();
+
+            if(owner_stream_){
+                if (stream_) {cudaStreamDestroy(stream_);}
+            }
+            stream_ = nullptr;
+        }
+
+    public:
+        cudaStream_t stream_ = nullptr;
+        bool owner_stream_ = false;
+        shared_ptr<IExecutionContext> context_;
+        shared_ptr<ICudaEngine> engine_;
+        shared_ptr<IRuntime> runtime_ = nullptr;
+    };
+
+    class TRTInferImpl{
+    public:
+        virtual ~TRTInferImpl();
+        bool load(const std::string& file);
+        bool load_from_memory(const void* pdata, size_t size);
+        void destroy();
+        void forward(bool sync);
+        int get_max_batch_size();
+        cudaStream_t get_stream();
+        void set_stream(cudaStream_t stream);
+        void synchronize();
+        size_t get_device_memory_size();
+        std::shared_ptr<MixMemory> get_workspace();
+        std::shared_ptr<Tensor> input(int index = 0);
+        std::string get_input_name(int index = 0);
+        std::shared_ptr<Tensor> output(int index = 0);
+        std::string get_output_name(int index = 0);
+        std::shared_ptr<Tensor> tensor(const std::string& name);
+        bool is_output_name(const std::string& name);
+        bool is_input_name(const std::string& name);
+        void set_input (int index, std::shared_ptr<Tensor> tensor);
+        void set_output(int index, std::shared_ptr<Tensor> tensor);
+        std::shared_ptr<std::vector<uint8_t>> serial_engine();
+
+        void print();
+
+        int num_output();
+        int num_input();
+        int device();
+
+    private:
+        void build_engine_input_and_outputs_mapper();
+
+    private:
+        std::vector<std::shared_ptr<Tensor>> inputs_;
+        std::vector<std::shared_ptr<Tensor>> outputs_;
+        std::vector<int> inputs_map_to_ordered_index_;
+        std::vector<int> outputs_map_to_ordered_index_;
+        std::vector<std::string> inputs_name_;
+        std::vector<std::string> outputs_name_;
+        std::vector<std::shared_ptr<Tensor>> orderdBlobs_;
+        std::map<std::string, int> blobsNameMapper_;
+        std::shared_ptr<EngineContext> context_;
+        std::vector<void*> bindingsPtr_;
+        std::shared_ptr<MixMemory> workspace_;
+        int device_ = 0;
+    };
+
+    ////////////////////////////////////////////////////////////////////////////////////
+    TRTInferImpl::~TRTInferImpl(){
+        destroy();
+    }
+
+    void TRTInferImpl::destroy() {
+
+        int old_device = 0;
+        checkCudaRuntime(cudaGetDevice(&old_device));
+        checkCudaRuntime(cudaSetDevice(device_));
+        this->context_.reset();
+        this->blobsNameMapper_.clear();
+        this->outputs_.clear();
+        this->inputs_.clear();
+        this->inputs_name_.clear();
+        this->outputs_name_.clear();
+        checkCudaRuntime(cudaSetDevice(old_device));
+    }
+
+    void TRTInferImpl::print(){
+        if(!context_){
+            INFOW("Infer print, nullptr.");
+            return;
+        }
+
+        INFO("Infer %p detail", this);
+        INFO("\tMax Batch Size: %d", this->get_max_batch_size());
+        INFO("\tInputs: %d", inputs_.size());
+        for(int i = 0; i < inputs_.size(); ++i){
+            auto& tensor = inputs_[i];
+            auto& name = inputs_name_[i];
+            INFO("\t\t%d.%s : shape {%s}", i, name.c_str(), tensor->shape_string());
+        }
+
+        INFO("\tOutputs: %d", outputs_.size());
+        for(int i = 0; i < outputs_.size(); ++i){
+            auto& tensor = outputs_[i];
+            auto& name = outputs_name_[i];
+            INFO("\t\t%d.%s : shape {%s}", i, name.c_str(), tensor->shape_string());
+        }
+    }
+
+    std::shared_ptr<std::vector<uint8_t>> TRTInferImpl::serial_engine() {
+        auto memory = this->context_->engine_->serialize();
+        auto output = make_shared<std::vector<uint8_t>>((uint8_t*)memory->data(), (uint8_t*)memory->data()+memory->size());
+        memory->destroy();
+        return output;
+    }
+
+    bool TRTInferImpl::load_from_memory(const void* pdata, size_t size) {
+
+        if (pdata == nullptr || size == 0)
+            return false;
+
+        context_.reset(new EngineContext());
+
+        //build model
+        if (!context_->build_model(pdata, size)) {
+            context_.reset();
+            return false;
+        }
+
+        workspace_.reset(new MixMemory());
+        cudaGetDevice(&device_);
+        build_engine_input_and_outputs_mapper();
+        return true;
+    }
+
+    static std::vector<uint8_t> load_file(const string& file){
+
+        ifstream in(file, ios::in | ios::binary);
+        if (!in.is_open())
+            return {};
+
+        in.seekg(0, ios::end);
+        size_t length = in.tellg();
+
+        std::vector<uint8_t> data;
+        if (length > 0){
+            in.seekg(0, ios::beg);
+            data.resize(length);
+
+            in.read((char*)&data[0], length);
+        }
+        in.close();
+        return data;
+    }
+
+    bool TRTInferImpl::load(const std::string& file) {
+
+        auto data = load_file(file);
+        if (data.empty())
+            return false;
+
+        context_.reset(new EngineContext());
+
+        //build model
+        if (!context_->build_model(data.data(), data.size())) {
+            context_.reset();
+            return false;
+        }
+
+        workspace_.reset(new MixMemory());
+        cudaGetDevice(&device_);
+        build_engine_input_and_outputs_mapper();
+        return true;
+    }
+
+    size_t TRTInferImpl::get_device_memory_size() {
+        EngineContext* context = (EngineContext*)this->context_.get();
+        return context->context_->getEngine().getDeviceMemorySize();
+    }
+
+    void TRTInferImpl::build_engine_input_and_outputs_mapper() {
+        
+        EngineContext* context = (EngineContext*)this->context_.get();
+        int nbBindings = context->engine_->getNbBindings();
+        int max_batchsize = context->engine_->getMaxBatchSize();
+
+        inputs_.clear();
+        inputs_name_.clear();
+        outputs_.clear();
+        outputs_name_.clear();
+        orderdBlobs_.clear();
+        bindingsPtr_.clear();
+        blobsNameMapper_.clear();
+        for (int i = 0; i < nbBindings; ++i) {
+
+            auto dims = context->engine_->getBindingDimensions(i);
+            auto type = context->engine_->getBindingDataType(i);
+            const char* bindingName = context->engine_->getBindingName(i);
+            dims.d[0] = max_batchsize;
+            auto newTensor = make_shared<Tensor>(dims.nbDims, dims.d);
+            newTensor->set_stream(this->context_->stream_);
+            newTensor->set_workspace(this->workspace_);
+            if (context->engine_->bindingIsInput(i)) {
+                //if is input
+                inputs_.push_back(newTensor);
+                inputs_name_.push_back(bindingName);
+                inputs_map_to_ordered_index_.push_back(orderdBlobs_.size());
+            }
+            else {
+                //if is output
+                outputs_.push_back(newTensor);
+                outputs_name_.push_back(bindingName);
+                outputs_map_to_ordered_index_.push_back(orderdBlobs_.size());
+            }
+            blobsNameMapper_[bindingName] = i;
+            orderdBlobs_.push_back(newTensor);
+        }
+        bindingsPtr_.resize(orderdBlobs_.size());
+    }
+
+    void TRTInferImpl::set_stream(cudaStream_t stream){
+        this->context_->set_stream(stream);
+
+        for(auto& t : orderdBlobs_)
+            t->set_stream(stream);
+    }
+
+    cudaStream_t TRTInferImpl::get_stream() {
+        return this->context_->stream_;
+    }
+
+    int TRTInferImpl::device() {
+        return device_;
+    }
+
+    void TRTInferImpl::synchronize() {
+        checkCudaRuntime(cudaStreamSynchronize(context_->stream_));
+    }
+
+    bool TRTInferImpl::is_output_name(const std::string& name){
+        return std::find(outputs_name_.begin(), outputs_name_.end(), name) != outputs_name_.end();
+    }
+
+    bool TRTInferImpl::is_input_name(const std::string& name){
+        return std::find(inputs_name_.begin(), inputs_name_.end(), name) != inputs_name_.end();
+    }
+
+    void TRTInferImpl::forward(bool sync) {
+
+        EngineContext* context = (EngineContext*)context_.get();
+        int inputBatchSize = inputs_[0]->size(0);
+        for(int i = 0; i < context->engine_->getNbBindings(); ++i){
+            auto dims = context->engine_->getBindingDimensions(i);
+            auto type = context->engine_->getBindingDataType(i);
+            dims.d[0] = inputBatchSize;
+            if(context->engine_->bindingIsInput(i)){
+                context->context_->setBindingDimensions(i, dims);
+            }
+        }
+
+        for (int i = 0; i < outputs_.size(); ++i) {
+            outputs_[i]->resize_single_dim(0, inputBatchSize);
+            outputs_[i]->to_gpu(false);
+        }
+
+        for (int i = 0; i < orderdBlobs_.size(); ++i)
+            bindingsPtr_[i] = orderdBlobs_[i]->gpu();
+
+        void** bindingsptr = bindingsPtr_.data();
+        //bool execute_result = context->context_->enqueue(inputBatchSize, bindingsptr, context->stream_, nullptr);
+        bool execute_result = context->context_->enqueueV2(bindingsptr, context->stream_, nullptr);
+        if(!execute_result){
+            auto code = cudaGetLastError();
+            INFOF("execute fail, code %d[%s], message %s", code, cudaGetErrorName(code), cudaGetErrorString(code));
+        }
+
+        if (sync) {
+            synchronize();
+        }
+    }
+
+    std::shared_ptr<MixMemory> TRTInferImpl::get_workspace() {
+        return workspace_;
+    }
+
+    int TRTInferImpl::num_input() {
+        return this->inputs_.size();
+    }
+
+    int TRTInferImpl::num_output() {
+        return this->outputs_.size();
+    }
+
+    void TRTInferImpl::set_input (int index, std::shared_ptr<Tensor> tensor){
+        Assert(index >= 0 && index < inputs_.size());
+        this->inputs_[index] = tensor;
+
+        int order_index = inputs_map_to_ordered_index_[index];
+        this->orderdBlobs_[order_index] = tensor;
+    }
+
+    void TRTInferImpl::set_output(int index, std::shared_ptr<Tensor> tensor){
+        Assert(index >= 0 && index < outputs_.size());
+        this->outputs_[index] = tensor;
+
+        int order_index = outputs_map_to_ordered_index_[index];
+        this->orderdBlobs_[order_index] = tensor;
+    }
+
+    std::shared_ptr<Tensor> TRTInferImpl::input(int index) {
+        Assert(index >= 0 && index < inputs_name_.size());
+        return this->inputs_[index];
+    }
+
+    std::string TRTInferImpl::get_input_name(int index){
+        Assert(index >= 0 && index < inputs_name_.size());
+        return inputs_name_[index];
+    }
+
+    std::shared_ptr<Tensor> TRTInferImpl::output(int index) {
+        Assert(index >= 0 && index < outputs_.size());
+        return outputs_[index];
+    }
+
+    std::string TRTInferImpl::get_output_name(int index){
+        Assert(index >= 0 && index < outputs_name_.size());
+        return outputs_name_[index];
+    }
+
+    int TRTInferImpl::get_max_batch_size() {
+        Assert(this->context_ != nullptr);
+        return this->context_->engine_->getMaxBatchSize();
+    }
+
+    std::shared_ptr<Tensor> TRTInferImpl::tensor(const std::string& name) {
+        Assert(this->blobsNameMapper_.find(name) != this->blobsNameMapper_.end());
+        return orderdBlobs_[blobsNameMapper_[name]];
+    }
+
+    std::shared_ptr<TRTInferImpl> load_infer(const string& file) {
+        
+        std::shared_ptr<TRTInferImpl> infer(new TRTInferImpl());
+        if (!infer->load(file))
+            infer.reset();
+        return infer;
+    }
+
+    //////////////////////////////class MonopolyAllocator//////////////////////////////////////
+    /* 独占分配器
+       通过对tensor做独占管理，具有max_batch * 2个tensor，通过query获取一个
+       当推理结束后，该tensor释放使用权，即可交给下一个图像使用，内存实现复用
+    */
+    template<class _ItemType>
+    class MonopolyAllocator{
+    public:
+        class MonopolyData{
+        public:
+            std::shared_ptr<_ItemType>& data(){ return data_; }
+            void release(){manager_->release_one(this);}
+
+        private:
+            MonopolyData(MonopolyAllocator* pmanager){manager_ = pmanager;}
+
+        private:
+            friend class MonopolyAllocator;
+            MonopolyAllocator* manager_ = nullptr;
+            std::shared_ptr<_ItemType> data_;
+            bool available_ = true;
+        };
+        typedef std::shared_ptr<MonopolyData> MonopolyDataPointer;
+
+        MonopolyAllocator(int size){
+            capacity_ = size;
+            num_available_ = size;
+            datas_.resize(size);
+
+            for(int i = 0; i < size; ++i)
+                datas_[i] = std::shared_ptr<MonopolyData>(new MonopolyData(this));
+        }
+
+        virtual ~MonopolyAllocator(){
+            run_ = false;
+            cv_.notify_all();
+            
+            std::unique_lock<std::mutex> l(lock_);
+            cv_exit_.wait(l, [&](){
+                return num_wait_thread_ == 0;
+            });
+        }
+
+        MonopolyDataPointer query(int timeout = 10000){
+
+            std::unique_lock<std::mutex> l(lock_);
+            if(!run_) return nullptr;
+            
+            if(num_available_ == 0){
+                num_wait_thread_++;
+
+                auto state = cv_.wait_for(l, std::chrono::milliseconds(timeout), [&](){
+                    return num_available_ > 0 || !run_;
+                });
+
+                num_wait_thread_--;
+                cv_exit_.notify_one();
+
+                // timeout, no available, exit program
+                if(!state || num_available_ == 0 || !run_)
+                    return nullptr;
+            }
+
+            auto item = std::find_if(datas_.begin(), datas_.end(), [](MonopolyDataPointer& item){return item->available_;});
+            if(item == datas_.end())
+                return nullptr;
+            
+            (*item)->available_ = false;
+            num_available_--;
+            return *item;
+        }
+
+        int num_available(){
+            return num_available_;
+        }
+
+        int capacity(){
+            return capacity_;
+        }
+
+    private:
+        void release_one(MonopolyData* prq){
+            std::unique_lock<std::mutex> l(lock_);
+            if(!prq->available_){
+                prq->available_ = true;
+                num_available_++;
+                cv_.notify_one();
+            }
+        }
+
+    private:
+        std::mutex lock_;
+        std::condition_variable cv_;
+        std::condition_variable cv_exit_;
+        std::vector<MonopolyDataPointer> datas_;
+        int capacity_ = 0;
+        volatile int num_available_ = 0;
+        volatile int num_wait_thread_ = 0;
+        volatile bool run_ = true;
+    };
+
+
+    /////////////////////////////////////////class ThreadSafedAsyncInfer/////////////////////////////////////////////
+    /* 异步线程安全的推理器
+       通过异步线程启动，使得调用方允许任意线程调用把图像做输入，并通过future来获取异步结果
+    */
+    template<class Input, class Output, class StartParam=std::tuple<std::string, int>, class JobAdditional=int>
+    class ThreadSafedAsyncInfer{
+    public:
+        struct Job{
+            Input input;
+            Output output;
+            JobAdditional additional;
+            MonopolyAllocator<Tensor>::MonopolyDataPointer mono_tensor;
+            std::shared_ptr<std::promise<Output>> pro;
+        };
+
+        virtual ~ThreadSafedAsyncInfer(){
+            stop();
+        }
+
+        void stop(){
+            run_ = false;
+            cond_.notify_all();
+
+            ////////////////////////////////////////// cleanup jobs
+            {
+                std::unique_lock<std::mutex> l(jobs_lock_);
+                while(!jobs_.empty()){
+                    auto& item = jobs_.front();
+                    if(item.pro)
+                        item.pro->set_value(Output());
+                    jobs_.pop();
+                }
+            };
+
+            if(worker_){
+                worker_->join();
+                worker_.reset();
+            }
+        }
+
+        bool startup(const StartParam& param){
+            run_ = true;
+
+            std::promise<bool> pro;
+            start_param_ = param;
+            worker_      = std::make_shared<std::thread>(&ThreadSafedAsyncInfer::worker, this, std::ref(pro));
+            return pro.get_future().get();
+        }
+
+        virtual std::shared_future<Output> commit(const Input& input){
+
+            Job job;
+            job.pro = std::make_shared<std::promise<Output>>();
+            if(!preprocess(job, input)){
+                job.pro->set_value(Output());
+                return job.pro->get_future();
+            }
+            
+            ///////////////////////////////////////////////////////////
+            {
+                std::unique_lock<std::mutex> l(jobs_lock_);
+                jobs_.push(job);
+            };
+            cond_.notify_one();
+            return job.pro->get_future();
+        }
+
+        virtual std::vector<std::shared_future<Output>> commits(const std::vector<Input>& inputs){
+
+            int batch_size = std::min((int)inputs.size(), this->tensor_allocator_->capacity());
+            std::vector<Job> jobs(inputs.size());
+            std::vector<std::shared_future<Output>> results(inputs.size());
+
+            int nepoch = (inputs.size() + batch_size - 1) / batch_size;
+            for(int epoch = 0; epoch < nepoch; ++epoch){
+                int begin = epoch * batch_size;
+                int end   = std::min((int)inputs.size(), begin + batch_size);
+
+                for(int i = begin; i < end; ++i){
+                    Job& job = jobs[i];
+                    job.pro = std::make_shared<std::promise<Output>>();
+                    if(!preprocess(job, inputs[i])){
+                        job.pro->set_value(Output());
+                    }
+                    results[i] = job.pro->get_future();
+                }
+
+                ///////////////////////////////////////////////////////////
+                {
+                    std::unique_lock<std::mutex> l(jobs_lock_);
+                    for(int i = begin; i < end; ++i){
+                        jobs_.emplace(std::move(jobs[i]));
+                    };
+                }
+                cond_.notify_one();
+            }
+            return results;
+        }
+
+    protected:
+        virtual void worker(std::promise<bool>& result) = 0;
+        virtual bool preprocess(Job& job, const Input& input) = 0;
+        
+        virtual bool get_jobs_and_wait(std::vector<Job>& fetch_jobs, int max_size){
+
+            std::unique_lock<std::mutex> l(jobs_lock_);
+            cond_.wait(l, [&](){
+                return !run_ || !jobs_.empty();
+            });
+
+            if(!run_) return false;
+            
+            fetch_jobs.clear();
+            for(int i = 0; i < max_size && !jobs_.empty(); ++i){
+                fetch_jobs.emplace_back(std::move(jobs_.front()));
+                jobs_.pop();
+            }
+            return true;
+        }
+
+        virtual bool get_job_and_wait(Job& fetch_job){
+
+            std::unique_lock<std::mutex> l(jobs_lock_);
+            cond_.wait(l, [&](){
+                return !run_ || !jobs_.empty();
+            });
+
+            if(!run_) return false;
+            
+            fetch_job = std::move(jobs_.front());
+            jobs_.pop();
+            return true;
+        }
+
+    protected:
+        StartParam start_param_;
+        std::atomic<bool> run_;
+        std::mutex jobs_lock_;
+        std::queue<Job> jobs_;
+        std::shared_ptr<std::thread> worker_;
+        std::condition_variable cond_;
+        std::shared_ptr<MonopolyAllocator<Tensor>> tensor_allocator_;
+    };
+
+
+    ///////////////////////////////////class YoloTRTInferImpl//////////////////////////////////////
+    /* Yolo的具体实现
+        通过上述类的特性，实现预处理的计算重叠、异步垮线程调用，最终拼接为多个图为一个batch进行推理。最大化的利用
+        显卡性能，实现高性能高可用好用的yolo推理
+    */
+    const char* type_name(Type type){
+        switch(type){
+        case Type::V5: return "YoloV5";
+        case Type::X: return "YoloX";
+        default: return "Unknow";
+        }
+    }
+
+    struct AffineMatrix{
+        float i2d[6];       // image to dst(network), 2x3 matrix
+        float d2i[6];       // dst to image, 2x3 matrix
+
+        void compute(const cv::Size& from, const cv::Size& to){
+            float scale_x = to.width / (float)from.width;
+            float scale_y = to.height / (float)from.height;
+            float scale = std::min(scale_x, scale_y);
+            i2d[0] = scale;  i2d[1] = 0;  i2d[2] = -scale * from.width  * 0.5  + to.width * 0.5 + scale * 0.5 - 0.5;
+            i2d[3] = 0;  i2d[4] = scale;  i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5;
+
+            cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);
+            cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);
+            cv::invertAffineTransform(m2x3_i2d, m2x3_d2i);
+        }
+
+        cv::Mat i2d_mat(){
+            return cv::Mat(2, 3, CV_32F, i2d);
+        }
+    };
+
+    using ThreadSafedAsyncInferImpl = ThreadSafedAsyncInfer
+    <
+        cv::Mat,                    // input
+        BoxArray,                   // output
+        tuple<string, int>,         // start param
+        AffineMatrix                // additional
+    >;
+    class YoloTRTInferImpl : public Infer, public ThreadSafedAsyncInferImpl{
+    public:
+
+        /** 要求在TRTInferImpl里面执行stop，而不是在基类执行stop **/
+        virtual ~YoloTRTInferImpl(){
+            stop();
+        }
+
+        virtual bool startup(const string& file, Type type, int gpuid, float confidence_threshold, float nms_threshold){
+
+            if(type == Type::V5){
+                normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
+            }else if(type == Type::X){
+                //float mean[] = {0.485, 0.456, 0.406};
+                //float std[]  = {0.229, 0.224, 0.225};
+                //normalize_ = Norm::mean_std(mean, std, 1/255.0f, ChannelType::Invert);
+                normalize_ = Norm::None();
+            }else{
+                INFOE("Unsupport type %d", type);
+            }
+            
+            confidence_threshold_ = confidence_threshold;
+            nms_threshold_        = nms_threshold;
+            return ThreadSafedAsyncInferImpl::startup(make_tuple(file, gpuid));
+        }
+
+        virtual void worker(promise<bool>& result) override{
+
+            string file = get<0>(start_param_);
+            int gpuid   = get<1>(start_param_);
+
+            set_device(gpuid);
+            auto engine = load_infer(file);
+            if(engine == nullptr){
+                INFOE("Engine %s load failed", file.c_str());
+                result.set_value(false);
+                return;
+            }
+
+            engine->print();
+
+            const int MAX_IMAGE_BBOX  = 1024;
+            const int NUM_BOX_ELEMENT = 7;      // left, top, right, bottom, confidence, class, keepflag
+            Tensor affin_matrix_device;
+            Tensor output_array_device;
+            int max_batch_size = engine->get_max_batch_size();
+            auto input         = engine->tensor("images");
+            auto output        = engine->tensor("output");
+            int num_classes    = output->size(2) - 5;
+
+            input_width_       = input->size(3);
+            input_height_      = input->size(2);
+            tensor_allocator_  = make_shared<MonopolyAllocator<Tensor>>(max_batch_size * 2);
+            stream_            = engine->get_stream();
+            gpu_               = gpuid;
+            result.set_value(true);
+
+            input->resize_single_dim(0, max_batch_size).to_gpu();
+            affin_matrix_device.set_stream(stream_);
+
+            // 这里8个值的目的是保证 8 * sizeof(float) % 32 == 0
+            affin_matrix_device.resize(max_batch_size, 8).to_gpu();
+
+            // 这里的 1 + MAX_IMAGE_BBOX结构是，counter + bboxes ...
+            output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu();
+
+            vector<Job> fetch_jobs;
+            while(get_jobs_and_wait(fetch_jobs, max_batch_size)){
+
+                int infer_batch_size = fetch_jobs.size();
+                input->resize_single_dim(0, infer_batch_size);
+
+                for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
+                    auto& job  = fetch_jobs[ibatch];
+                    auto& mono = job.mono_tensor->data();
+                    affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), mono->get_workspace()->gpu(), 6);
+                    input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count());
+                    job.mono_tensor->release();
+                }
+
+                engine->forward(false);
+                output_array_device.to_gpu(false);
+                for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
+                    
+                    auto& job                 = fetch_jobs[ibatch];
+                    float* image_based_output = output->gpu<float>(ibatch);
+                    float* output_array_ptr   = output_array_device.gpu<float>(ibatch);
+                    auto affine_matrix        = affin_matrix_device.gpu<float>(ibatch);
+                    checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_));
+                    decode_kernel_invoker(image_based_output, output->size(1), num_classes, confidence_threshold_, nms_threshold_, affine_matrix, output_array_ptr, MAX_IMAGE_BBOX, stream_);
+                }
+
+                output_array_device.to_cpu();
+                for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch){
+                    float* parray = output_array_device.cpu<float>(ibatch);
+                    int count     = min(MAX_IMAGE_BBOX, (int)*parray);
+                    auto& job     = fetch_jobs[ibatch];
+                    auto& image_based_boxes   = job.output;
+                    for(int i = 0; i < count; ++i){
+                        float* pbox  = parray + 1 + i * NUM_BOX_ELEMENT;
+                        int label    = pbox[5];
+                        int keepflag = pbox[6];
+                        if(keepflag == 1){
+                            image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], label);
+                        }
+                    }
+                    job.pro->set_value(image_based_boxes);
+                }
+                fetch_jobs.clear();
+            }
+            stream_ = nullptr;
+            tensor_allocator_.reset();
+            INFO("Engine destroy.");
+        }
+
+        virtual bool preprocess(Job& job, const Mat& image) override{
+
+            if(tensor_allocator_ == nullptr){
+                INFOE("tensor_allocator_ is nullptr");
+                return false;
+            }
+
+            job.mono_tensor = tensor_allocator_->query();
+            if(job.mono_tensor == nullptr){
+                INFOE("Tensor allocator query failed.");
+                return false;
+            }
+
+            AutoDevice auto_device(gpu_);
+            auto& tensor = job.mono_tensor->data();
+            if(tensor == nullptr){
+                // not init
+                tensor = make_shared<Tensor>();
+                tensor->set_workspace(make_shared<MixMemory>());
+            }
+
+            Size input_size(input_width_, input_height_);
+            job.additional.compute(image.size(), input_size);
+            
+            tensor->set_stream(stream_);
+            tensor->resize(1, 3, input_height_, input_width_);
+
+            size_t size_image      = image.cols * image.rows * 3;
+            size_t size_matrix     = upbound(sizeof(job.additional.d2i), 32);
+            auto workspace         = tensor->get_workspace();
+            uint8_t* gpu_workspace        = (uint8_t*)workspace->gpu(size_matrix + size_image);
+            float*   affine_matrix_device = (float*)gpu_workspace;
+            uint8_t* image_device         = size_matrix + gpu_workspace;
+
+            uint8_t* cpu_workspace        = (uint8_t*)workspace->cpu(size_matrix + size_image);
+            float* affine_matrix_host     = (float*)cpu_workspace;
+            uint8_t* image_host           = size_matrix + cpu_workspace;
+
+            //checkCudaRuntime(cudaMemcpyAsync(image_host,   image.data, size_image, cudaMemcpyHostToHost,   stream_));
+            // speed up
+            memcpy(image_host, image.data, size_image);
+            memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i));
+            checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_));
+            checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(job.additional.d2i), cudaMemcpyHostToDevice, stream_));
+
+            warp_affine_bilinear_and_normalize_plane(
+                image_device,         image.cols * 3,       image.cols,       image.rows, 
+                tensor->gpu<float>(), input_width_,         input_height_, 
+                affine_matrix_device, 114, 
+                normalize_, stream_
+            );
+            return true;
+        }
+
+        virtual vector<shared_future<BoxArray>> commits(const vector<Mat>& images) override{
+            return ThreadSafedAsyncInferImpl::commits(images);
+        }
+
+        virtual std::shared_future<BoxArray> commit(const Mat& image) override{
+            return ThreadSafedAsyncInferImpl::commit(image);
+        }
+
+    private:
+        int input_width_            = 0;
+        int input_height_           = 0;
+        int gpu_                    = 0;
+        float confidence_threshold_ = 0;
+        float nms_threshold_        = 0;
+        cudaStream_t stream_       = nullptr;
+        Norm normalize_;
+    };
+
+    void image_to_tensor(const cv::Mat& image, shared_ptr<Tensor>& tensor, Type type, int ibatch){
+
+        Norm normalize;
+        if(type == Type::V5){
+            normalize = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB);
+        }else if(type == Type::X){
+            //float mean[] = {0.485, 0.456, 0.406};
+            //float std[]  = {0.229, 0.224, 0.225};
+            //normalize_ = CUDAKernel::Norm::mean_std(mean, std, 1/255.0f, CUDAKernel::ChannelType::Invert);
+            normalize = Norm::None();
+        }else{
+            INFOE("Unsupport type %d", type);
+        }
+        
+        Size input_size(tensor->size(3), tensor->size(2));
+        AffineMatrix affine;
+        affine.compute(image.size(), input_size);
+
+        size_t size_image      = image.cols * image.rows * 3;
+        size_t size_matrix     = upbound(sizeof(affine.d2i), 32);
+        auto workspace         = tensor->get_workspace();
+        uint8_t* gpu_workspace        = (uint8_t*)workspace->gpu(size_matrix + size_image);
+        float*   affine_matrix_device = (float*)gpu_workspace;
+        uint8_t* image_device         = size_matrix + gpu_workspace;
+
+        uint8_t* cpu_workspace        = (uint8_t*)workspace->cpu(size_matrix + size_image);
+        float* affine_matrix_host     = (float*)cpu_workspace;
+        uint8_t* image_host           = size_matrix + cpu_workspace;
+        auto stream                   = tensor->get_stream();
+
+        memcpy(image_host, image.data, size_image);
+        memcpy(affine_matrix_host, affine.d2i, sizeof(affine.d2i));
+        checkCudaRuntime(cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream));
+        checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, sizeof(affine.d2i), cudaMemcpyHostToDevice, stream));
+
+        warp_affine_bilinear_and_normalize_plane(
+            image_device,               image.cols * 3,       image.cols,       image.rows, 
+            tensor->gpu<float>(ibatch), input_size.width,     input_size.height, 
+            affine_matrix_device, 114, 
+            normalize, stream
+        );
+    }
+
+    shared_ptr<Infer> create_infer(const string& engine_file, Type type, int gpuid, float confidence_threshold, float nms_threshold){
+        shared_ptr<YoloTRTInferImpl> instance(new YoloTRTInferImpl());
+        if(!instance->startup(engine_file, type, gpuid, confidence_threshold, nms_threshold)){
+            instance.reset();
+        }
+        return instance;
+    }
+
+    //////////////////////////////////////Compile Model/////////////////////////////////////////////////////////////
+
+    const char* mode_string(Mode type) {
+        switch (type) {
+        case Mode::FP32:
+            return "FP32";
+        case Mode::FP16:
+            return "FP16";
+        case Mode::INT8:
+            return "INT8";
+        default:
+            return "UnknowCompileMode";
+        }
+    }
+
+    typedef std::function<void(int current, int count, const std::vector<std::string>& files, std::shared_ptr<Tensor>& tensor)> Int8Process;
+
+    class Int8EntropyCalibrator : public IInt8EntropyCalibrator2{
+    public:
+        Int8EntropyCalibrator(const vector<string>& imagefiles, nvinfer1::Dims dims, const Int8Process& preprocess) {
+
+            Assert(preprocess != nullptr);
+            this->dims_ = dims;
+            this->allimgs_ = imagefiles;
+            this->preprocess_ = preprocess;
+            this->fromCalibratorData_ = false;
+            files_.resize(dims.d[0]);
+            checkCudaRuntime(cudaStreamCreate(&stream_));
+        }
+
+        Int8EntropyCalibrator(const vector<uint8_t>& entropyCalibratorData, nvinfer1::Dims dims, const Int8Process& preprocess) {
+            Assert(preprocess != nullptr);
+
+            this->dims_ = dims;
+            this->entropyCalibratorData_ = entropyCalibratorData;
+            this->preprocess_ = preprocess;
+            this->fromCalibratorData_ = true;
+            files_.resize(dims.d[0]);
+            checkCudaRuntime(cudaStreamCreate(&stream_));
+        }
+
+        virtual ~Int8EntropyCalibrator(){
+            checkCudaRuntime(cudaStreamDestroy(stream_));
+        }
+
+        int getBatchSize() const noexcept {
+            return dims_.d[0];
+        }
+
+        bool next() {
+            int batch_size = dims_.d[0];
+            if (cursor_ + batch_size > allimgs_.size())
+                return false;
+
+            int old_cursor = cursor_;
+            for(int i = 0; i < batch_size; ++i)
+                files_[i] = allimgs_[cursor_++];
+
+            if (!tensor_){
+                tensor_.reset(new Tensor(dims_.nbDims, dims_.d));
+                tensor_->set_stream(stream_);
+                tensor_->set_workspace(make_shared<MixMemory>());
+            }
+
+            preprocess_(old_cursor, allimgs_.size(), files_, tensor_);
+            return true;
+        }
+
+        bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept {
+            if (!next()) return false;
+            bindings[0] = tensor_->gpu();
+            return true;
+        }
+
+        const vector<uint8_t>& getEntropyCalibratorData() {
+            return entropyCalibratorData_;
+        }
+
+        const void* readCalibrationCache(size_t& length) noexcept {
+            if (fromCalibratorData_) {
+                length = this->entropyCalibratorData_.size();
+                return this->entropyCalibratorData_.data();
+            }
+
+            length = 0;
+            return nullptr;
+        }
+
+        virtual void writeCalibrationCache(const void* cache, size_t length) noexcept {
+            entropyCalibratorData_.assign((uint8_t*)cache, (uint8_t*)cache + length);
+        }
+
+    private:
+        Int8Process preprocess_;
+        vector<string> allimgs_;
+        size_t batchCudaSize_ = 0;
+        int cursor_ = 0;
+        nvinfer1::Dims dims_;
+        vector<string> files_;
+        shared_ptr<Tensor> tensor_;
+        vector<uint8_t> entropyCalibratorData_;
+        bool fromCalibratorData_ = false;
+        cudaStream_t stream_ = nullptr;
+    };
+
+    bool compile(
+        Mode mode, Type type,
+        unsigned int max_batch_size,
+        const string& source_onnx,
+        const string& saveto,
+        size_t max_workspace_size,
+        const std::string& int8_images_folder,
+        const std::string& int8_entropy_calibrator_cache_file) {
+
+        bool hasEntropyCalibrator = false;
+        vector<uint8_t> entropyCalibratorData;
+        vector<string> entropyCalibratorFiles;
+
+        auto int8process = [=](int current, int count, const vector<string>& files, shared_ptr<Tensor>& tensor){
+
+            for(int i = 0; i < files.size(); ++i){
+
+                auto& file = files[i];
+                INFO("Int8 load %d / %d, %s", current + i + 1, count, file.c_str());
+
+                auto image = cv::imread(file);
+                if(image.empty()){
+                    INFOE("Load image failed, %s", file.c_str());
+                    continue;
+                }
+                image_to_tensor(image, tensor, type, i);
+            }
+            tensor->synchronize();
+        };
+
+        if (mode == Mode::INT8) {
+            if (!int8_entropy_calibrator_cache_file.empty()) {
+                if (exists(int8_entropy_calibrator_cache_file)) {
+                    entropyCalibratorData = load_file(int8_entropy_calibrator_cache_file);
+                    if (entropyCalibratorData.empty()) {
+                        INFOE("entropyCalibratorFile is set as: %s, but we read is empty.", int8_entropy_calibrator_cache_file.c_str());
+                        return false;
+                    }
+                    hasEntropyCalibrator = true;
+                }
+            }
+            
+            if (hasEntropyCalibrator) {
+                if (!int8_images_folder.empty()) {
+                    INFOW("int8_images_folder is ignore, when int8_entropy_calibrator_cache_file is set");
+                }
+            }
+            else {
+                entropyCalibratorFiles = glob_image_files(int8_images_folder);
+                if (entropyCalibratorFiles.empty()) {
+                    INFOE("Can not find any images(jpg/png/bmp/jpeg/tiff) from directory: %s", int8_images_folder.c_str());
+                    return false;
+                }
+
+                if(entropyCalibratorFiles.size() < max_batch_size){
+                    INFOW("Too few images provided, %d[provided] < %d[max batch size], image copy will be performed", entropyCalibratorFiles.size(), max_batch_size);
+                    
+                    int old_size = entropyCalibratorFiles.size();
+                    for(int i = old_size; i < max_batch_size; ++i)
+                        entropyCalibratorFiles.push_back(entropyCalibratorFiles[i % old_size]);
+                }
+            }
+        }
+        else {
+            if (hasEntropyCalibrator) {
+                INFOW("int8_entropy_calibrator_cache_file is ignore, when Mode is '%s'", mode_string(mode));
+            }
+        }
+
+        INFO("Compile %s %s.", mode_string(mode), source_onnx.c_str());
+        shared_ptr<IBuilder> builder(createInferBuilder(gLogger), destroy_nvidia_pointer<IBuilder>);
+        if (builder == nullptr) {
+            INFOE("Can not create builder.");
+            return false;
+        }
+
+        shared_ptr<IBuilderConfig> config(builder->createBuilderConfig(), destroy_nvidia_pointer<IBuilderConfig>);
+        if (mode == Mode::FP16) {
+            if (!builder->platformHasFastFp16()) {
+                INFOW("Platform not have fast fp16 support");
+            }
+            config->setFlag(BuilderFlag::kFP16);
+        }
+        else if (mode == Mode::INT8) {
+            if (!builder->platformHasFastInt8()) {
+                INFOW("Platform not have fast int8 support");
+            }
+            config->setFlag(BuilderFlag::kINT8);
+        }
+
+        shared_ptr<INetworkDefinition> network;
+        shared_ptr<nvonnxparser::IParser> onnxParser;
+        const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
+        network = shared_ptr<INetworkDefinition>(builder->createNetworkV2(explicitBatch), destroy_nvidia_pointer<INetworkDefinition>);
+
+        //from onnx is not markOutput
+        onnxParser.reset(nvonnxparser::createParser(*network, gLogger), destroy_nvidia_pointer<nvonnxparser::IParser>);
+        if (onnxParser == nullptr) {
+            INFOE("Can not create parser.");
+            return false;
+        }
+
+        if (!onnxParser->parseFromFile(source_onnx.c_str(), 1)) {
+            INFOE("Can not parse OnnX file: %s", source_onnx.c_str());
+            return false;
+        }
+        
+        auto inputTensor = network->getInput(0);
+        auto inputDims = inputTensor->getDimensions();
+
+        shared_ptr<Int8EntropyCalibrator> int8Calibrator;
+        if (mode == Mode::INT8) {
+            auto calibratorDims = inputDims;
+            calibratorDims.d[0] = max_batch_size;
+
+            if (hasEntropyCalibrator) {
+                INFO("Using exist entropy calibrator data[%d bytes]: %s", entropyCalibratorData.size(), int8_entropy_calibrator_cache_file.c_str());
+                int8Calibrator.reset(new Int8EntropyCalibrator(
+                    entropyCalibratorData, calibratorDims, int8process
+                ));
+            }
+            else {
+                INFO("Using image list[%d files]: %s", entropyCalibratorFiles.size(), int8_images_folder.c_str());
+                int8Calibrator.reset(new Int8EntropyCalibrator(
+                    entropyCalibratorFiles, calibratorDims, int8process
+                ));
+            }
+            config->setInt8Calibrator(int8Calibrator.get());
+        }
+
+        INFO("Input shape is %s", join_dims(vector<int>(inputDims.d, inputDims.d + inputDims.nbDims)).c_str());
+        INFO("Set max batch size = %d", max_batch_size);
+        INFO("Set max workspace size = %.2f MB", max_workspace_size / 1024.0f / 1024.0f);
+
+        int net_num_input = network->getNbInputs();
+        INFO("Network has %d inputs:", net_num_input);
+        vector<string> input_names(net_num_input);
+        for(int i = 0; i < net_num_input; ++i){
+            auto tensor = network->getInput(i);
+            auto dims = tensor->getDimensions();
+            auto dims_str = join_dims(vector<int>(dims.d, dims.d+dims.nbDims));
+            INFO("      %d.[%s] shape is %s", i, tensor->getName(), dims_str.c_str());
+
+            input_names[i] = tensor->getName();
+        }
+
+        int net_num_output = network->getNbOutputs();
+        INFO("Network has %d outputs:", net_num_output);
+        for(int i = 0; i < net_num_output; ++i){
+            auto tensor = network->getOutput(i);
+            auto dims = tensor->getDimensions();
+            auto dims_str = join_dims(vector<int>(dims.d, dims.d+dims.nbDims));
+            INFO("      %d.[%s] shape is %s", i, tensor->getName(), dims_str.c_str());
+        }
+
+        int net_num_layers = network->getNbLayers();
+        INFO("Network has %d layers", net_num_layers);		
+        builder->setMaxBatchSize(max_batch_size);
+        config->setMaxWorkspaceSize(max_workspace_size);
+
+        auto profile = builder->createOptimizationProfile();
+        for(int i = 0; i < net_num_input; ++i){
+            auto input = network->getInput(i);
+            auto input_dims = input->getDimensions();
+            input_dims.d[0] = 1;
+            profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
+            profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
+            input_dims.d[0] = max_batch_size;
+            profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
+        }
+        config->addOptimizationProfile(profile);
+
+        INFO("Building engine...");
+        auto time_start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
+        shared_ptr<ICudaEngine> engine(builder->buildEngineWithConfig(*network, *config), destroy_nvidia_pointer<ICudaEngine>);
+        if (engine == nullptr) {
+            INFOE("engine is nullptr");
+            return false;
+        }
+
+        if (mode == Mode::INT8) {
+            if (!hasEntropyCalibrator) {
+                if (!int8_entropy_calibrator_cache_file.empty()) {
+                    INFO("Save calibrator to: %s", int8_entropy_calibrator_cache_file.c_str());
+                    save_file(int8_entropy_calibrator_cache_file, int8Calibrator->getEntropyCalibratorData());
+                }
+                else {
+                    INFO("No set entropyCalibratorFile, and entropyCalibrator will not save.");
+                }
+            }
+        }
+
+        auto time_end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
+        INFO("Build done %lld ms !", time_end - time_start);
+        
+        // serialize the engine, then close everything down
+        shared_ptr<IHostMemory> seridata(engine->serialize(), destroy_nvidia_pointer<IHostMemory>);
+        return save_file(saveto, seridata->data(), seridata->size());
+    }
+};
\ No newline at end of file
diff --git a/robot_omni/include/robot_detector/simple_yolo.hpp b/robot_omni/include/robot_detector/simple_yolo.hpp
new file mode 100644
index 0000000..f13e28a
--- /dev/null
+++ b/robot_omni/include/robot_detector/simple_yolo.hpp
@@ -0,0 +1,76 @@
+#ifndef SIMPLE_YOLO_HPP
+#define SIMPLE_YOLO_HPP
+
+/*
+  简单的yolo接口，容易集成但是高性能
+*/
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <future>
+#include <opencv2/opencv.hpp>
+
+namespace SimpleYolo{
+
+    using namespace std;
+
+    enum class Type : int{
+        V5 = 0,
+        X  = 1
+    };
+
+    enum class Mode : int {
+        FP32,
+        FP16,
+        INT8
+    };
+
+    struct Box{
+        float left, top, right, bottom, confidence;
+        int class_label;
+
+        Box() = default;
+
+        Box(float left, float top, float right, float bottom, float confidence, int class_label)
+        :left(left), top(top), right(right), bottom(bottom), confidence(confidence), class_label(class_label){}
+    };
+
+    typedef std::vector<Box> BoxArray;
+
+    class Infer{
+    public:
+        virtual shared_future<BoxArray> commit(const cv::Mat& image) = 0;
+        virtual vector<shared_future<BoxArray>> commits(const vector<cv::Mat>& images) = 0;
+    };
+
+    const char* trt_version();
+    const char* type_name(Type type);
+    const char* mode_string(Mode type);
+    void set_device(int device_id);
+
+    /* 
+        模型编译
+        max batch size：为最大可以允许的batch数量
+        source_onnx：仅仅只支持onnx格式输入
+        saveto：储存的tensorRT模型，用于后续的加载
+        max workspace size：最大工作空间大小，一般给1GB，在嵌入式可以改为256MB，单位是byte
+        int8 images folder：对于Mode为INT8时，需要提供图像数据进行标定，请提供文件夹，会自动检索下面的jpg/jpeg/tiff/png/bmp
+        int8_entropy_calibrator_cache_file：对于int8模式下，熵文件可以缓存，避免二次加载数据，可以跨平台使用，是一个txt文件
+     */
+    // 1GB = 1<<30
+    bool compile(
+        Mode mode, Type type,
+        unsigned int max_batch_size,
+        const string& source_onnx,
+        const string& saveto,
+        size_t max_workspace_size = 1<<30,
+        const std::string& int8_images_folder = "",
+        const std::string& int8_entropy_calibrator_cache_file = ""
+    );
+
+    shared_ptr<Infer> create_infer(const string& engine_file, Type type, int gpuid, float confidence_threshold=0.25f, float nms_threshold=0.5f);
+
+}; // namespace SimpleYolo
+
+#endif // SIMPLE_YOLO_HPP
\ No newline at end of file
diff --git a/robot_omni/launch/robot_detector_node.launch b/robot_omni/launch/robot_detector_node.launch
new file mode 100644
index 0000000..b9b1c55
--- /dev/null
+++ b/robot_omni/launch/robot_detector_node.launch
@@ -0,0 +1,5 @@
+<launch>
+    <node name="robot_detector_node" pkg="omni_detect" type="robot_detector_node" output="screen"> 
+        <rosparam command="load" file="$(find omni_detect)/config/robot_detector.yaml"/>
+    </node>
+</launch>
diff --git a/robot_omni/maps/around.png b/robot_omni/maps/around.png
new file mode 100644
index 0000000..d99b7af
Binary files /dev/null and b/robot_omni/maps/around.png differ
diff --git a/robot_omni/package.xml b/robot_omni/package.xml
new file mode 100644
index 0000000..812f126
--- /dev/null
+++ b/robot_omni/package.xml
@@ -0,0 +1,78 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>omni_detect</name>
+  <version>0.0.0</version>
+  <description>The omni_detect package</description>
+
+  <!-- One maintainer tag required, multiple allowed, one person per tag -->
+  <!-- Example:  -->
+  <!-- <maintainer email="jane.doe@example.com">Jane Doe</maintainer> -->
+  <maintainer email="hitxjf@todo.todo">hitxjf</maintainer>
+
+
+  <!-- One license tag required, multiple allowed, one license per tag -->
+  <!-- Commonly used license strings: -->
+  <!--   BSD, MIT, Boost Software License, GPLv2, GPLv3, LGPLv2.1, LGPLv3 -->
+  <license>TODO</license>
+
+
+  <!-- Url tags are optional, but multiple are allowed, one per tag -->
+  <!-- Optional attribute type can be: website, bugtracker, or repository -->
+  <!-- Example: -->
+  <!-- <url type="website">http://wiki.ros.org/omni_detect</url> -->
+
+
+  <!-- Author tags are optional, multiple are allowed, one per tag -->
+  <!-- Authors do not have to be maintainers, but could be -->
+  <!-- Example: -->
+  <!-- <author email="jane.doe@example.com">Jane Doe</author> -->
+
+
+  <!-- The *depend tags are used to specify dependencies -->
+  <!-- Dependencies can be catkin packages or system dependencies -->
+  <!-- Examples: -->
+  <!-- Use depend as a shortcut for packages that are both build and exec dependencies -->
+  <!--   <depend>roscpp</depend> -->
+  <!--   Note that this is equivalent to the following: -->
+  <!--   <build_depend>roscpp</build_depend> -->
+  <!--   <exec_depend>roscpp</exec_depend> -->
+  <!-- Use build_depend for packages you need at compile time: -->
+  <!--   <build_depend>message_generation</build_depend> -->
+  <!-- Use build_export_depend for packages you need in order to build against this package: -->
+  <!--   <build_export_depend>message_generation</build_export_depend> -->
+  <!-- Use buildtool_depend for build tool packages: -->
+  <!--   <buildtool_depend>catkin</buildtool_depend> -->
+  <!-- Use exec_depend for packages you need at runtime: -->
+  <!--   <exec_depend>message_runtime</exec_depend> -->
+  <!-- Use test_depend for packages you need only for testing: -->
+  <!--   <test_depend>gtest</test_depend> -->
+  <!-- Use doc_depend for packages you need only for building documentation: -->
+  <!--   <doc_depend>doxygen</doc_depend> -->
+  <buildtool_depend>catkin</buildtool_depend>
+  <build_depend>cv_bridge</build_depend>
+  <build_depend>image_transport</build_depend>
+  <build_depend>message_filters</build_depend>
+  <build_depend>message_generation</build_depend>
+  <build_depend>roscpp</build_depend>
+  <build_depend>sensor_msgs</build_depend>
+  <build_depend>std_msgs</build_depend>
+  <build_export_depend>cv_bridge</build_export_depend>
+  <build_export_depend>image_transport</build_export_depend>
+  <build_export_depend>message_filters</build_export_depend>
+  <build_export_depend>roscpp</build_export_depend>
+  <build_export_depend>sensor_msgs</build_export_depend>
+  <build_export_depend>std_msgs</build_export_depend>
+  <exec_depend>cv_bridge</exec_depend>
+  <exec_depend>image_transport</exec_depend>
+  <exec_depend>message_filters</exec_depend>
+  <exec_depend>roscpp</exec_depend>
+  <exec_depend>sensor_msgs</exec_depend>
+  <exec_depend>std_msgs</exec_depend>
+  <build_depend>robot_msgs</build_depend>
+
+  <!-- The export tag contains other, unspecified, tags -->
+  <export>
+    <!-- Other tools can request additional information be placed here -->
+
+  </export>
+</package>
diff --git a/robot_omni/src/CMakeLists.txt b/robot_omni/src/CMakeLists.txt
new file mode 100644
index 0000000..9f05072
--- /dev/null
+++ b/robot_omni/src/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(robot_detector)
\ No newline at end of file
diff --git a/robot_omni/src/robot_detector/CMakeLists.txt b/robot_omni/src/robot_detector/CMakeLists.txt
new file mode 100644
index 0000000..1224d73
--- /dev/null
+++ b/robot_omni/src/robot_detector/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -O0 -Xcompiler -fPIC -g -w ${CUDA_GEN_CODE}")
+file(GLOB_RECURSE cuda_srcs ${PROJECT_SOURCE_DIR}/include/robot_detector/*.cu)
+
+cuda_add_library(cucodes SHARED ${cuda_srcs})
+target_link_libraries(cucodes nvinfer nvonnxparser)
+target_link_libraries(cucodes cuda cublas cudart cudnn)
+
+add_library (robot_detector_lib SHARED RobotDetector.cpp)
+
+target_link_libraries(robot_detector_lib  
+                       ${OpenCV_LIBS}
+                       cucodes)
+
diff --git a/robot_omni/src/robot_detector/RobotDetector.cpp b/robot_omni/src/robot_detector/RobotDetector.cpp
new file mode 100644
index 0000000..4fe0752
--- /dev/null
+++ b/robot_omni/src/robot_detector/RobotDetector.cpp
@@ -0,0 +1,363 @@
+#include "RobotDetector.h"
+
+namespace robot_detector
+{
+    using namespace std;
+
+    static bool exists(const string &path)
+    {
+        return access(path.c_str(), R_OK) == 0;
+    }
+
+    RobotDetector::RobotDetector() {}
+
+    RobotDetector::~RobotDetector() {}
+
+    bool RobotDetector::init(const string &model_name, SimpleYolo::Mode precision, const double cof_threshold, const double nms_threshold,
+                             const std::vector<float> heights, const std::vector<float> distances)
+    {
+        int device_id = 0;
+        SimpleYolo::set_device(device_id);
+        auto type = SimpleYolo::Type::V5;
+        const char *name = model_name.c_str();
+        string onnx_file = cv::format("%s.onnx", name);
+        string model_file = cv::format("%s.%s.trtmodel", name, SimpleYolo::mode_string(precision));
+        int test_batch_size = 6;
+        if (!exists(model_file) && !SimpleYolo::compile(
+                                       precision,
+                                       type,
+                                       test_batch_size,
+                                       onnx_file,
+                                       model_file,
+                                       1 << 30,
+                                       ""))
+        {
+            printf("Can't find trtmodel and compile trtmodel failed\n");
+            return false;
+        }
+
+        _cof_threshold = cof_threshold;
+        _nms_area_threshold = nms_threshold;
+
+        _yolo = SimpleYolo::create_infer(model_file, type, device_id, cof_threshold, nms_threshold);
+        if (_yolo == nullptr)
+        {
+            printf("Yolo infer is nullptr\n");
+            return false;
+        }
+
+        _distances = distances;
+        _heights = heights;
+
+        return true;
+    }
+
+    void RobotDetector::load_camera_params(const std::vector<std::vector<double>> intrinsics,
+                                           const std::vector<std::vector<double>> extrinsics,
+                                           const std::vector<std::vector<double>> discoeffs)
+    {
+        for (int i = 0; i < intrinsics.size(); i++)
+        {
+            cv::Mat intrinsic_temp = cv::Mat(3, 3, CV_64FC1);
+            memcpy(intrinsic_temp.data, intrinsics[i].data(), intrinsics[i].size() * sizeof(double));
+            _intrinsic_cvs.push_back(intrinsic_temp);
+        }
+        for (int i = 0; i < extrinsics.size(); i++)
+        {
+            cv::Mat extrinsic_temp = cv::Mat(4, 4, CV_64FC1);
+            memcpy(extrinsic_temp.data, extrinsics[i].data(), extrinsics[i].size() * sizeof(double));
+            _extrinsic_cvs.push_back(extrinsic_temp);
+        }
+        for (int i = 0; i < discoeffs.size(); i++)
+        {
+            _discoff_cvs.push_back(cv::Mat(discoeffs[i]).t());
+        }
+    }
+
+    bool RobotDetector::process_frame(cv::Mat inframe,
+                                      std::vector<Object> &detected_objects,
+                                      const int camera_id)
+    {
+        if (inframe.empty())
+        {
+            std::cout << "Invalid Picture Input" << std::endl;
+            return false;
+        }
+
+        cv::Mat resize_img = letterBox(inframe);
+
+        auto objs = _yolo->commit(resize_img).get();
+
+        std::vector<cv::Rect> origin_rect;
+        std::vector<float> origin_rect_cof;
+        std::vector<int> origin_classId;
+        for (auto &obj : objs)
+        {
+            cv::Rect resize_rect(obj.left, obj.top, obj.right - obj.left, obj.bottom - obj.top);
+            origin_rect.push_back(resize_rect);
+            origin_rect_cof.push_back(obj.confidence);
+            origin_classId.push_back(obj.class_label);
+        }
+
+        std::vector<int> final_id;
+        cv::dnn::NMSBoxes(origin_rect, origin_rect_cof, _cof_threshold, _nms_area_threshold, final_id);
+        for (int i = 0; i < final_id.size(); ++i)
+        {
+            cv::Rect resize_rect = origin_rect[final_id[i]];
+            cv::Rect rawrect = detect2origin(resize_rect, _ratio, _topPad, _leftPad);
+            detected_objects.push_back(Object{
+                origin_rect_cof[final_id[i]],
+                origin_classId[final_id[i]],
+                rawrect});
+        }
+
+        return true;
+    }
+
+    cv::Mat RobotDetector::letterBox(cv::Mat src)
+    {
+        if (src.empty())
+        {
+            std::cout << "input image invalid" << std::endl;
+            return cv::Mat();
+        }
+        int in_w = src.cols;
+        int in_h = src.rows;
+        int tar_w = 640;
+        int tar_h = 640;
+        _ratio = std::min(float(tar_h) / in_h, float(tar_w) / in_w);
+        int inside_w = std::round(in_w * _ratio);
+        int inside_h = std::round(in_h * _ratio);
+        int pad_w = tar_w - inside_w;
+        int pad_h = tar_h - inside_h;
+
+        cv::Mat resize_img;
+        cv::resize(src, resize_img, cv::Size(inside_w, inside_h));
+        pad_w = pad_w % 32;
+        pad_h = pad_h % 32;
+        pad_w = pad_w / 2;
+        pad_h = pad_h / 2;
+
+        _topPad = int(std::round(pad_h - 0.1));
+        _btmPad = int(std::round(pad_h + 0.1));
+        _leftPad = int(std::round(pad_w - 0.1));
+        _rightPad = int(std::round(pad_w + 0.1));
+
+        cv::copyMakeBorder(resize_img, resize_img, _topPad, _btmPad, _leftPad, _rightPad, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
+        return resize_img;
+    }
+
+    cv::Rect RobotDetector::detect2origin(const cv::Rect &det_rect, float rate_to, int top, int left)
+    {
+
+        int inside_x = det_rect.x - left;
+        int inside_y = det_rect.y - top;
+        int ox = std::round(float(inside_x) / rate_to);
+        int oy = std::round(float(inside_y) / rate_to);
+        int ow = std::round(float(det_rect.width) / rate_to);
+        int oh = std::round(float(det_rect.height) / rate_to);
+        cv::Rect origin_rect(ox, oy, ow, oh);
+        return origin_rect;
+    }
+
+    float linear_inter(const std::vector<float> X_Arr, const std::vector<float> Y_Arr, const int length, const float x)
+    {
+        int j;
+        float y = 0;
+
+        if (x <= X_Arr[0])
+        {
+            y = Y_Arr[0];
+            return y;
+        }
+        else if (x >= X_Arr[length - 1])
+        {
+            y = Y_Arr[length - 1];
+            return y;
+        }
+        else
+        {
+            for (j = 1; j < length; j++)
+            {
+                if (x <= X_Arr[j])
+                {
+                    j--;
+                    break;
+                }
+            }
+        }
+
+        y = Y_Arr[j] * ((x - X_Arr[j + 1]) / (X_Arr[j] - X_Arr[j + 1])) + Y_Arr[j + 1] * ((x - X_Arr[j]) / (X_Arr[j + 1] - X_Arr[j]));
+
+        return y;
+    }
+
+    void RobotDetector::location_estimation(const int camera_id, std::vector<Robot> &Robots, bool is_robot_from_armors)
+    {
+
+        for (int i = 0; i < Robots.size(); ++i)
+        {
+            float fx = _intrinsic_cvs[camera_id].at<double>(0, 0);
+            float fy = _intrinsic_cvs[camera_id].at<double>(1, 1);
+            float cx = _intrinsic_cvs[camera_id].at<double>(0, 2);
+            float cy = _intrinsic_cvs[camera_id].at<double>(1, 2);
+            float center_u = Robots[i].m_rect.x + Robots[i].m_rect.width / 2;
+            float center_v = Robots[i].m_rect.y + Robots[i].m_rect.height / 2;
+
+            if (!is_robot_from_armors)
+            {
+                Robots[i].XYZ_camera.z = linear_inter(_heights, _distances, _heights.size(), Robots[i].m_rect.height);
+            }
+            else
+            {
+                Robots[i].XYZ_camera.z = 0.35;
+            }
+            Robots[i].XYZ_camera.x = (center_u - cx) * Robots[i].XYZ_camera.z / fx;
+            Robots[i].XYZ_camera.y = (center_v - cy) * Robots[i].XYZ_camera.z / fy;
+
+            Eigen::Matrix4f extrinsic_temp;
+            cv::cv2eigen(_extrinsic_cvs[camera_id], extrinsic_temp);
+            Eigen::Vector3f P_camera;
+            P_camera << Robots[i].XYZ_camera.x, Robots[i].XYZ_camera.y, Robots[i].XYZ_camera.z;
+            Eigen::Matrix3f Rbc = extrinsic_temp.topLeftCorner(3, 3).inverse();
+            Eigen::Vector3f tbc = -Rbc * extrinsic_temp.topRightCorner(3, 1);
+            Eigen::Vector3f P_body = Rbc * P_camera + tbc;
+            Robots[i].XYZ_body.x = P_body[0];
+            Robots[i].XYZ_body.y = P_body[1];
+            Robots[i].XYZ_body.z = 0;
+            //std::cout<<"In Camera "<< camera_id << " Find "<< "Robot "<< Robots[i].m_id <<" x: "<< Robots[i].XYZ_body.x << " y: "<< Robots[i].XYZ_body.y << " z: "<< Robots[i].XYZ_body.z <<std::endl;
+        }
+    }
+
+    void RobotDetector::RobotMatch(const int camera_id, const std::vector<Object> &results, std::vector<Robot> &Robots)
+    {
+        //split Robot and Armor
+        std::vector<Object> armors;
+        for (int i = 0; i < results.size(); ++i)
+        {
+            if (results[i].class_id == 0)
+            {
+                Robot Robot_item;
+                Robot_item.m_rect = results[i].box;
+                Robots.push_back(Robot_item);
+            }
+            else
+            {
+                armors.push_back(results[i]);
+            }
+        }
+
+        std::vector<int> matched_armor_indexes;
+
+        for (int i = 0; i < Robots.size(); ++i)
+        {
+            for (int j = 0; j < armors.size(); ++j)
+            {
+                if (armors[j].box.x >= Robots[i].m_rect.x - 3 &&
+                    armors[j].box.y > Robots[i].m_rect.y &&
+                    (armors[j].box.width + armors[j].box.x) <= (Robots[i].m_rect.width + Robots[i].m_rect.x + 5) && 
+                    (armors[j].box.height + armors[j].box.y) < (Robots[i].m_rect.height + Robots[i].m_rect.y) &&
+                    (armors[j].box.y + armors[j].box.height / 2) > (Robots[i].m_rect.height / 2 + Robots[i].m_rect.y))
+                {
+                    Robots[i].m_armornums += 1;
+                    switch (armors[j].class_id)
+                    {
+                    case 1: //red_1
+                        Robots[i].m_color_prb += armors[j].confidence;
+                        Robots[i].m_number_prb += armors[j].confidence;
+                        break;
+                    case 2: //red_2
+                        Robots[i].m_color_prb += armors[j].confidence;
+                        Robots[i].m_number_prb -= armors[j].confidence;
+                        break;
+                    case 3: //blue_1
+                        Robots[i].m_color_prb -= armors[j].confidence;
+                        Robots[i].m_number_prb += armors[j].confidence;
+                        break;
+                    case 4: //blue_2
+                        Robots[i].m_color_prb -= armors[j].confidence;
+                        Robots[i].m_number_prb -= armors[j].confidence;
+                        break;
+                    case 5: //grey
+                        // Robots[i].m_color = 3; 
+                        break;
+                    default:
+                        break;
+                    }
+                    matched_armor_indexes.push_back(j);
+                }
+            }
+            if (Robots[i].m_armornums > 0)
+            {
+                if (Robots[i].m_color_prb > 0.0)
+                {                          
+                    Robots[i].m_color = 1; //red
+                }
+                else if (Robots[i].m_color_prb < 0.0)
+                {
+                    Robots[i].m_color = 2; //blue
+                }
+                else
+                {
+                    Robots[i].m_color = 3; //grey
+                    Robots[i].m_id = 5;
+                }
+                if (Robots[i].m_number_prb > 0.0)
+                {
+                    Robots[i].m_number = 1;
+                }
+                else if (Robots[i].m_number_prb < 0.0)
+                {
+                    Robots[i].m_number = 2;
+                }
+                if (Robots[i].m_number == 1 && Robots[i].m_color == 1)
+                {
+                    Robots[i].m_id = 1;
+                }
+                else if (Robots[i].m_number == 2 && Robots[i].m_color == 1)
+                {
+                    Robots[i].m_id = 2;
+                }
+                else if (Robots[i].m_number == 1 && Robots[i].m_color == 2)
+                {
+                    Robots[i].m_id = 3;
+                }
+                else if (Robots[i].m_number == 2 && Robots[i].m_color == 2)
+                {
+                    Robots[i].m_id = 4;
+                }
+            }
+            else
+            {
+                Robots[i].m_id = 0; //unknown
+            }
+        }
+
+        location_estimation(camera_id, Robots);
+
+        std::vector<Object> unmatched_armors;
+        for (int i = 0; i < armors.size(); i++)
+        {
+            std::vector<int>::iterator it = std::find(matched_armor_indexes.begin(), matched_armor_indexes.end(), i);
+            if (it == matched_armor_indexes.end())
+            {
+                unmatched_armors.push_back(armors[i]);
+            }
+        }
+        std::vector<Robot> robots_by_armors;
+        for (int i = 0; i < unmatched_armors.size(); i++)
+        {
+            //The distance is too close, and only the armor is detected
+            if (unmatched_armors[i].box.height > 150 || unmatched_armors[i].box.height * unmatched_armors[i].box.width > 30000)
+            {
+                Robot Robot_item;
+                Robot_item.m_rect = unmatched_armors[i].box;
+                Robot_item.m_id = unmatched_armors[i].class_id;
+                robots_by_armors.push_back(Robot_item);
+            }
+        }
+        location_estimation(camera_id, robots_by_armors, true);
+        Robots.insert(Robots.end(), robots_by_armors.begin(), robots_by_armors.end());
+    }
+
+} //namespace
diff --git a/robot_omni/src/robot_detector_node.cpp b/robot_omni/src/robot_detector_node.cpp
new file mode 100644
index 0000000..5c1e004
--- /dev/null
+++ b/robot_omni/src/robot_detector_node.cpp
@@ -0,0 +1,371 @@
+#include <ros/ros.h>
+#include <csignal>
+#include <ros/package.h>
+#include <cmath>
+#include <image_transport/image_transport.h>
+#include "cv_bridge/cv_bridge.h"
+#include "std_msgs/String.h"
+#include "RobotDetector.h"
+#include "robot_msgs/GlobalMap.h"
+
+namespace robot_detector
+{
+    typedef RobotDetector::Object Object;
+    typedef RobotDetector::Robot Robot;
+    static const char *labels[] = {"robot", "red1", "red2", "blue1", "blue2", "grey"};
+    const std::vector<cv::Scalar> colors = {cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(0, 0, 255),
+                                            cv::Scalar(255, 0, 0), cv::Scalar(255, 0, 0), cv::Scalar(128, 128, 128)};
+
+    RobotDetector *detector = new RobotDetector;
+    bool showResult;
+    bool flg_exit = false;
+    std::mutex mtx_buffer;
+
+    struct MeasureGroup
+    {
+        double img1_time;
+        double img2_time;
+        double img3_time;
+        double img4_time;
+
+        std::vector<Object> cam1_objects;
+        std::vector<Object> cam2_objects;
+        std::vector<Object> cam3_objects;
+        std::vector<Object> cam4_objects;
+    };
+    MeasureGroup Measures;
+
+    void SigHandle(int sig)
+    {
+        flg_exit = true;
+        ROS_WARN("catch sig %d", sig);
+    }
+
+    bool sync_packages(MeasureGroup meas, std::vector<std::vector<Object>> &detected_objects)
+    {
+        std::vector<std::vector<Object>> detected_objects_temp;
+        detected_objects_temp.push_back(meas.cam1_objects);
+        detected_objects_temp.push_back(meas.cam2_objects);
+        detected_objects_temp.push_back(meas.cam3_objects);
+        detected_objects_temp.push_back(meas.cam4_objects);
+
+        std::vector<double> src_times;
+        src_times.push_back(meas.img1_time);
+        src_times.push_back(meas.img2_time);
+        src_times.push_back(meas.img3_time);
+        src_times.push_back(meas.img4_time);
+
+        bool syncFlag = false;
+
+        for (int i = 0; i < src_times.size(); i++)
+        {
+            if (src_times[i] != 0 && (ros::Time::now().toSec() - src_times[i]) < 0.5)
+            {
+                detected_objects.push_back(detected_objects_temp[i]);
+                syncFlag = true;
+            }
+            else
+            {
+                std::vector<Object> emptyObject;
+                detected_objects.push_back(emptyObject);
+                ROS_WARN("Camera [%d] get image failed", i + 1);
+            }
+        }
+        if (!syncFlag)
+        {
+            ROS_WARN("ALL Camera Failed, please check camera connection");
+        }
+        return syncFlag;
+    }
+
+    void PublishDetectRobots(ros::Publisher robot_detector_pub, const std::vector<Robot> robot_results)
+    {
+        robot_msgs::RobotLocation robots_temp;
+        robot_msgs::GlobalMap submap_msg;
+        submap_msg.header.stamp = ros::Time::now();
+
+        for (int i = 0; i < robot_results.size(); ++i)
+        {
+            robots_temp.id = robot_results[i].m_id;
+            robots_temp.x = robot_results[i].XYZ_body.x;
+            robots_temp.y = robot_results[i].XYZ_body.y;
+            robots_temp.yaw = 0;
+            submap_msg.robots.push_back(robots_temp);
+        }
+
+        robot_detector_pub.publish(submap_msg);
+    }
+
+    void im_show(cv::Mat &src, std::string window_name, const std::vector<Object> detected_objects)
+    {
+        //Display at half the resolution of the original image
+        cv::Mat srcHalf;
+        cv::resize(src, srcHalf, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
+        for (int i = 0; i < detected_objects.size(); ++i)
+        {
+            int xmin = detected_objects[i].box.x / 2;
+            int ymin = detected_objects[i].box.y / 2;
+            int width = detected_objects[i].box.width / 2;
+            int height = detected_objects[i].box.height / 2;
+            cv::Rect rect(xmin, ymin, width, height);
+            int classId = detected_objects[i].class_id;
+            cv::rectangle(srcHalf, rect, colors[classId], 2);
+            auto name = labels[detected_objects[i].class_id];
+            auto caption = cv::format("%s %.2f", name, detected_objects[i].confidence);
+            int name_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width - 50;
+            cv::rectangle(srcHalf, cv::Point(xmin - 3, ymin - 20), cv::Point(xmin + name_width, ymin), colors[classId], -1);
+            cv::putText(srcHalf, caption, cv::Point(xmin, ymin - 5), 0, 0.6, cv::Scalar::all(0), 2, 16);
+        }
+        cv::imshow(window_name, srcHalf);
+        //cv::waitKey(1);
+    }
+
+    void im_show_around(cv::Mat &src, const std::vector<Robot> robot_results)
+    {
+        for (int i = 0; i < robot_results.size(); ++i)
+        {
+            int x_around = -robot_results[i].XYZ_body.y * 100 + 300;
+            int y_around = -robot_results[i].XYZ_body.x * 100 + 400;
+            if (robot_results[i].m_id == 1)
+            {
+                cv::putText(src, "1", cv::Point(x_around, y_around), 0, 1.5, cv::Scalar(0, 0, 255), 3, 8);
+            }
+            else if (robot_results[i].m_id == 2)
+            {
+                cv::putText(src, "2", cv::Point(x_around, y_around), 0, 1.5, cv::Scalar(0, 0, 255), 3, 8);
+            }
+            else if (robot_results[i].m_id == 3)
+            {
+                cv::putText(src, "1", cv::Point(x_around, y_around), 0, 1.5, cv::Scalar(255, 0, 0), 3, 8);
+            }
+            else if (robot_results[i].m_id == 4)
+            {
+                cv::putText(src, "2", cv::Point(x_around, y_around), 0, 1.5, cv::Scalar(255, 0, 0), 3, 8);
+            }
+            else if (robot_results[i].m_id == 5)
+            {
+                cv::putText(src, "0", cv::Point(x_around, y_around), 0, 1.5, cv::Scalar(100, 100, 100), 3, 8);
+            }
+        }
+        cv::imshow("around", src);
+        cv::waitKey(1);
+    }
+
+    void Image1Callback(const sensor_msgs::ImageConstPtr &image1)
+    {
+        cv::Mat src_image1 = cv_bridge::toCvShare(image1, "bgr8")->image;
+        double img1_time = image1->header.stamp.toSec();
+        ;
+
+        std::vector<Object> cam1_objects;
+        detector->process_frame(src_image1, cam1_objects, 1);
+
+        mtx_buffer.lock();
+        Measures.cam1_objects = cam1_objects;
+        Measures.img1_time = img1_time;
+        mtx_buffer.unlock();
+        if (showResult)
+        {
+            im_show(src_image1, "cam1", cam1_objects);
+        }
+    }
+
+    void Image2Callback(const sensor_msgs::ImageConstPtr &image2)
+    {
+        cv::Mat src_image2 = cv_bridge::toCvShare(image2, "bgr8")->image;
+        double img2_time = image2->header.stamp.toSec();
+        ;
+
+        std::vector<Object> cam2_objects;
+        detector->process_frame(src_image2, cam2_objects, 2);
+
+        mtx_buffer.lock();
+        Measures.cam2_objects = cam2_objects;
+        Measures.img2_time = img2_time;
+        mtx_buffer.unlock();
+        if (showResult)
+        {
+            im_show(src_image2, "cam2", cam2_objects);
+        }
+    }
+
+    void Image3Callback(const sensor_msgs::ImageConstPtr &image3)
+    {
+        cv::Mat src_image3 = cv_bridge::toCvShare(image3, "bgr8")->image;
+        double img3_time = image3->header.stamp.toSec();
+        ;
+
+        std::vector<Object> cam3_objects;
+        detector->process_frame(src_image3, cam3_objects, 3);
+
+        mtx_buffer.lock();
+        Measures.cam3_objects = cam3_objects;
+        Measures.img3_time = img3_time;
+        mtx_buffer.unlock();
+        if (showResult)
+        {
+            im_show(src_image3, "cam3", cam3_objects);
+        }
+    }
+
+    void Image4Callback(const sensor_msgs::ImageConstPtr &image4)
+    {
+        cv::Mat src_image4 = cv_bridge::toCvShare(image4, "bgr8")->image;
+        double img4_time = image4->header.stamp.toSec();
+        ;
+
+        std::vector<Object> cam4_objects;
+        detector->process_frame(src_image4, cam4_objects, 4);
+
+        mtx_buffer.lock();
+        Measures.cam4_objects = cam4_objects;
+        Measures.img4_time = img4_time;
+        mtx_buffer.unlock();
+        if (showResult)
+        {
+            im_show(src_image4, "cam4", cam4_objects);
+        }
+    }
+
+} //namespace robot_detector
+
+int main(int argc, char **argv)
+{
+    using namespace robot_detector;
+    ros::init(argc, argv, "robot_detector_node");
+    ros::NodeHandle ros_nh;
+    std::string img1_topic_name, img2_topic_name, img3_topic_name, img4_topic_name;
+    double cof_threshold, nms_area_threshold;
+    std::string model_path, around_png_path;
+    float distance_thresh1, distance_thresh2;
+    int camera_num;
+    std::vector<std::vector<double>> intrinsics(4), extrinsics(4), discoeffs(4);
+    std::vector<float> distances, heights;
+
+    ros_nh.param<std::string>("/robot_detector_node/yolo_model_path", model_path, "/YoloModel/0430");
+    ros_nh.param<std::string>("/robot_detector_node/img1_topic_name", img1_topic_name, "/mvsua_cam/image_raw2");
+    ros_nh.param<std::string>("/robot_detector_node/img2_topic_name", img2_topic_name, "/mvsua_cam/image_raw3");
+    ros_nh.param<std::string>("/robot_detector_node/img3_topic_name", img3_topic_name, "/mvsua_cam/image_raw4");
+    ros_nh.param<std::string>("/robot_detector_node/img4_topic_name", img4_topic_name, "/mvsua_cam/image_raw5");
+    ros_nh.param<double>("/robot_detector_node/cof_threshold", cof_threshold, 0.5);
+    ros_nh.param<double>("/robot_detector_node/nms_area_threshold", nms_area_threshold, 0.5);
+    ros_nh.param<bool>("/robot_detector_node/imshow", showResult, false);
+    ros_nh.param("/robot_detector_node/CamNums", camera_num, 4);
+    for (int i = 1; i <= camera_num; i++)
+    {
+        ros_nh.getParam("/robot_detector_node/In" + std::to_string(i), intrinsics[i - 1]);
+        ros_nh.getParam("/robot_detector_node/Ex" + std::to_string(i), extrinsics[i - 1]);
+        ros_nh.getParam("/robot_detector_node/Dis" + std::to_string(i), discoeffs[i - 1]);
+    }
+    ros_nh.getParam("/robot_detector_node/Heights", heights);
+    ros_nh.getParam("/robot_detector_node/Distances", distances);
+
+    cv::Mat around_png = cv::imread(ros::package::getPath("omni_detect") + "/maps/around.png");
+    model_path = ros::package::getPath("omni_detect") + model_path;
+
+    image_transport::ImageTransport it(ros_nh);
+    image_transport::Subscriber img1_sub, img2_sub, img3_sub, img4_sub;
+    img1_sub = it.subscribe(img1_topic_name, 1, Image1Callback);
+    img2_sub = it.subscribe(img2_topic_name, 1, Image2Callback);
+    img3_sub = it.subscribe(img3_topic_name, 1, Image3Callback);
+    img4_sub = it.subscribe(img4_topic_name, 1, Image4Callback);
+
+    ros::Publisher robot_detector_pub = ros_nh.advertise<robot_msgs::GlobalMap>("/omni_detector", 1, true);
+
+    detector->init(model_path, SimpleYolo::Mode::FP16, cof_threshold, nms_area_threshold, heights, distances);
+    detector->load_camera_params(intrinsics, extrinsics, discoeffs);
+    ROS_INFO("\033[1;32m----> Omni Robot Detector Init Successfully.\033[0m");
+    //------------------------------------------------------------------------------------------------------
+    signal(SIGINT, SigHandle);
+    ros::Rate rate(30);
+    bool status = ros::ok();
+    while (status)
+    {
+        if (flg_exit)
+            break;
+        ros::Time strat_time = ros::Time::now();
+
+        ros::spinOnce();
+
+        std::vector<std::vector<Object>> detected_objects;
+        std::vector<Robot> robot_results;       // All detected robots
+        std::vector<Robot> final_robot_results; // Final results after filtering out duplicate robots
+        int robot_nums = 0;
+
+        if (sync_packages(Measures, detected_objects))
+        {
+            for (int camera_id = 0; camera_id < detected_objects.size(); camera_id++)
+            {
+                std::vector<Object> obj = detected_objects[camera_id];
+                std::vector<Robot> robots_temp;
+                detector->RobotMatch(camera_id, obj, robots_temp);
+                for (int j = 0; j < robots_temp.size(); j++)
+                {
+                    if (robots_temp[j].m_id != 0)
+                    {
+                        robot_results.push_back(robots_temp[j]);
+                    }
+                }
+
+                // The following code is for measuring the data used for interpolation
+                // for(int j = 0; j < robots_temp.size(); j++){
+                //         //if(robots_temp[j].m_id == 4){
+                //         int area = robots_temp[j].m_rect.height * robots_temp[j].m_rect.width;
+                //         std::cout<< "Robot area: "<< area << " height: "<<robots_temp[j].m_rect.height<< " width: "<< robots_temp[j].m_rect.width<<std::endl;
+                //     }
+                // }
+                // for(int k = 0; k < obj.size(); k++){
+                //     if(obj[k].class_id == 3){
+                //         int area = obj[k].box.height * obj[k].box.width;
+                //         std::cout<< "Armor area: "<< area << " height: "<<obj[k].box.height << " width: "<< obj[k].box.width<<std::endl;
+                //     }
+                // }
+            }
+
+            // Filter out duplicate results.
+            std::vector<int> repeat_indexes;
+            for (int j = 0; j < robot_results.size(); ++j)
+            {
+                std::vector<int>::iterator it = std::find(repeat_indexes.begin(), repeat_indexes.end(), j);
+                if (it != repeat_indexes.end())
+                    continue;
+
+                bool repeat_flag = false;
+                for (int k = j + 1; k < robot_results.size(); ++k)
+                {
+                    if (robot_results[j].m_id == robot_results[k].m_id)
+                    {
+                        float dis = sqrt((robot_results[j].XYZ_body.x - robot_results[k].XYZ_body.x) * (robot_results[j].XYZ_body.x - robot_results[k].XYZ_body.x) +
+                                         (robot_results[j].XYZ_body.y - robot_results[k].XYZ_body.y) * (robot_results[j].XYZ_body.y - robot_results[k].XYZ_body.y));
+                        if (dis < 0.5)
+                        {
+                            repeat_flag = true;
+                            repeat_indexes.push_back(k);
+                            Robot robot_temp = robot_results[j];
+                            robot_temp.XYZ_body.x = (robot_results[j].XYZ_body.x + robot_results[k].XYZ_body.x) / 2;
+                            robot_temp.XYZ_body.y = (robot_results[j].XYZ_body.y + robot_results[k].XYZ_body.y) / 2;
+                            final_robot_results.push_back(robot_temp);
+                            continue;
+                        }
+                    }
+                }
+                if (!repeat_flag)
+                    final_robot_results.push_back(robot_results[j]);
+            }
+
+            if (showResult)
+            {
+                cv::Mat around = around_png.clone();
+                im_show_around(around, final_robot_results);
+            }
+            PublishDetectRobots(robot_detector_pub, final_robot_results);
+        }
+
+        std::cout << "total time: " << (ros::Time::now() - strat_time).toSec() * 1000 << "ms"
+                  << " Find " << final_robot_results.size() << " robots" << std::endl;
+        status = ros::ok();
+        rate.sleep();
+    }
+
+    return 0;
+}