From 697280b0158ae609cf601c1e7f84e4985fba75cc Mon Sep 17 00:00:00 2001
From: Li Zhang <lzhang329@gmail.com>
Date: Tue, 31 Jan 2023 11:24:24 +0800
Subject: [PATCH 01/11] optimize demos

---
 csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h |   2 +-
 demo/csrc/cpp/classifier.cxx                 |  22 +-
 demo/csrc/cpp/det_pose.cpp                   | 137 +++------
 demo/csrc/cpp/detector.cxx                   |  93 +++---
 demo/csrc/cpp/pose_tracker.cpp               | 185 ++---------
 demo/csrc/cpp/pose_tracker_utils.hpp         |  61 ++++
 demo/csrc/cpp/restorer.cxx                   |  55 ++--
 demo/csrc/cpp/rotated_detector.cxx           |  66 ++--
 demo/csrc/cpp/segmentor.cxx                  |  75 ++---
 demo/csrc/cpp/text_ocr.cxx                   |  63 ++--
 demo/csrc/cpp/utils/argparse.h               | 255 +++++++++++++++
 demo/csrc/cpp/utils/mediaio.h                | 308 +++++++++++++++++++
 demo/csrc/cpp/utils/skeleton.h               |  88 ++++++
 demo/csrc/cpp/utils/visualize.h              | 165 ++++++++++
 14 files changed, 1140 insertions(+), 435 deletions(-)
 create mode 100644 demo/csrc/cpp/pose_tracker_utils.hpp
 create mode 100644 demo/csrc/cpp/utils/argparse.h
 create mode 100644 demo/csrc/cpp/utils/mediaio.h
 create mode 100644 demo/csrc/cpp/utils/skeleton.h
 create mode 100644 demo/csrc/cpp/utils/visualize.h
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h
index f895af4150..4b27fbab8a 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h
@@ -36,7 +36,7 @@ typedef struct mmdeploy_pose_tracker_param_t {
   int32_t pose_max_num_bboxes;
   // threshold for visible key-points, default = 0.5
   float pose_kpt_thr;
-  // min number of key-points for valid poses, default = -1
+  // min number of key-points for valid poses (-1 indicates ceil(n_kpts/2)), default = -1
   int32_t pose_min_keypoints;
   // scale for expanding key-points to bbox, default = 1.25
   float pose_bbox_scale;
diff --git a/demo/csrc/cpp/classifier.cxx b/demo/csrc/cpp/classifier.cxx
index 2d8b2d1e25..a8976fa506 100644
--- a/demo/csrc/cpp/classifier.cxx
+++ b/demo/csrc/cpp/classifier.cxx
@@ -4,29 +4,29 @@
 #include <string>
 
 #include "opencv2/imgcodecs/imgcodecs.hpp"
+#include "utils/argparse.h"
+
+DEFINE_ARG_string(model, "Model path");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
 
 int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  image_classification device_name model_path image_path\n");
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto image_path = argv[3];
-  cv::Mat img = cv::imread(image_path);
+  cv::Mat img = cv::imread(ARGS_image);
   if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
     return 1;
   }
 
-  mmdeploy::Model model(model_path);
-  mmdeploy::Classifier classifier(model, mmdeploy::Device{device_name, 0});
+  mmdeploy::Model model(ARGS_model);
+  mmdeploy::Classifier classifier(model, mmdeploy::Device{FLAGS_device, 0});
 
   auto res = classifier.Apply(img);
 
   for (const auto& cls : res) {
     fprintf(stderr, "label: %d, score: %.4f\n", cls.label_id, cls.score);
   }
-
   return 0;
 }
diff --git a/demo/csrc/cpp/det_pose.cpp b/demo/csrc/cpp/det_pose.cpp
index a12dd3c0b1..983a6c598a 100644
--- a/demo/csrc/cpp/det_pose.cpp
+++ b/demo/csrc/cpp/det_pose.cpp
@@ -4,110 +4,69 @@
 
 #include "mmdeploy/detector.hpp"
 #include "mmdeploy/pose_detector.hpp"
-#include "opencv2/imgcodecs/imgcodecs.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
 
-using std::vector;
+DEFINE_ARG_string(det_model, "Object detection model path");
+DEFINE_ARG_string(pose_model, "Pose estimation model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
 
-cv::Mat Visualize(cv::Mat frame, const std::vector<mmdeploy_pose_detection_t>& poses, int size);
+DEFINE_string(device, "cpu", "Device name, e.g. \"cpu\", \"cuda\"");
+DEFINE_string(output, "det_pose_%04d.jpg",
+              "Output image, video path or format string; use `cv::imshow` if empty");
+DEFINE_int32(output_size, 1024, "ong-edge of output frames");
+DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
 
-int main(int argc, char* argv[]) {
-  const auto device_name = argv[1];
-  const auto det_model_path = argv[2];
-  const auto pose_model_path = argv[3];
-  const auto image_path = argv[4];
+DEFINE_int32(det_label, 0, "Detection label use for pose estimation");
+DEFINE_double(det_thr, 0.5, "Detection score threshold");
+DEFINE_double(det_min_bbox_size, -1, "Detection minimum bbox size");
 
-  if (argc != 5) {
-    std::cerr << "usage:\n\tpose_tracker device_name det_model_path pose_model_path image_path\n";
-    return -1;
-  }
-  auto img = cv::imread(image_path);
-  if (!img.data) {
-    std::cerr << "failed to load image: " << image_path << "\n";
+DEFINE_double(pose_thr, 0, "Pose key-point threshold");
+
+int main(int argc, char* argv[]) {
+  if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
 
-  using namespace mmdeploy;
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  Context context(Device{device_name});  // create context for holding the device handle
-  Detector detector(Model(det_model_path), context);   // create object detector
-  PoseDetector pose(Model(pose_model_path), context);  // create pose detector
+  utils::Visualize v(FLAGS_output_size);
 
-  // apply detector
-  auto dets = detector.Apply(img);
+  mmdeploy::Device device{FLAGS_device};
+  mmdeploy::Detector detector(mmdeploy::Model(ARGS_det_model), device);   // create object detector
+  mmdeploy::PoseDetector pose(mmdeploy::Model(ARGS_pose_model), device);  // create pose detector
 
-  // filter detections and extract bboxes for pose model
-  std::vector<mmdeploy_rect_t> bboxes;
-  for (const auto& det : dets) {
-    if (det.label_id == 0 && det.score > .6f) {
-      bboxes.push_back(det.bbox);
+  for (const cv::Mat& img : input) {
+    // apply detector
+    mmdeploy::Detector::Result dets = detector.Apply(img);
+
+    // filter detections and extract bboxes for pose model
+    std::vector<mmdeploy_rect_t> bboxes;
+    for (const mmdeploy_detection_t& det : dets) {
+      if (det.label_id == FLAGS_det_label && det.score > FLAGS_det_thr) {
+        bboxes.push_back(det.bbox);
+      }
     }
-  }
-  // apply pose detector
-  auto poses = pose.Apply(img, bboxes);
 
-  // visualize
-  auto vis = Visualize(img, {poses.begin(), poses.end()}, 1280);
-  cv::imwrite("det_pose_output.jpg", vis);
+    // apply pose detector
+    mmdeploy::PoseDetector::Result poses = pose.Apply(img, bboxes);
 
-  return 0;
-}
-
-struct Skeleton {
-  vector<std::pair<int, int>> skeleton;
-  vector<cv::Scalar> palette;
-  vector<int> link_color;
-  vector<int> point_color;
-};
-
-const Skeleton& gCocoSkeleton() {
-  static const Skeleton inst{
-      {
-          {15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12},
-          {5, 6},   {5, 7},   {6, 8},   {7, 9},   {8, 10},  {1, 2},  {0, 1},
-          {0, 2},   {1, 3},   {2, 4},   {3, 5},   {4, 6},
-      },
-      {
-          {255, 128, 0},   {255, 153, 51},  {255, 178, 102}, {230, 230, 0},   {255, 153, 255},
-          {153, 204, 255}, {255, 102, 255}, {255, 51, 255},  {102, 178, 255}, {51, 153, 255},
-          {255, 153, 153}, {255, 102, 102}, {255, 51, 51},   {153, 255, 153}, {102, 255, 102},
-          {51, 255, 51},   {0, 255, 0},     {0, 0, 255},     {255, 0, 0},     {255, 255, 255},
-      },
-      {0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16},
-      {16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0},
-  };
-  return inst;
-}
+    assert(bboxes.size() == poses.size());
 
-cv::Mat Visualize(cv::Mat frame, const vector<mmdeploy_pose_detection_t>& poses, int size) {
-  auto& [skeleton, palette, link_color, point_color] = gCocoSkeleton();
-  auto scale = (float)size / (float)std::max(frame.cols, frame.rows);
-  if (scale != 1) {
-    cv::resize(frame, frame, {}, scale, scale);
-  } else {
-    frame = frame.clone();
-  }
-  for (const auto& pose : poses) {
-    vector<float> kpts(&pose.point[0].x, &pose.point[pose.length - 1].y + 1);
-    vector<float> scores(pose.score, pose.score + pose.length);
-    std::for_each(kpts.begin(), kpts.end(), [&](auto& x) { x *= scale; });
-    constexpr auto score_thr = .5f;
-    vector<int> used(kpts.size());
-    for (size_t i = 0; i < skeleton.size(); ++i) {
-      auto [u, v] = skeleton[i];
-      if (scores[u] > score_thr && scores[v] > score_thr) {
-        used[u] = used[v] = 1;
-        cv::Point p_u(kpts[u * 2], kpts[u * 2 + 1]);
-        cv::Point p_v(kpts[v * 2], kpts[v * 2 + 1]);
-        cv::line(frame, p_u, p_v, palette[link_color[i]], 1, cv::LINE_AA);
-      }
+    // visualize
+    auto sess = v.get_session(img);
+    for (size_t i = 0; i < bboxes.size(); ++i) {
+      // draw bounding boxes
+      sess.add_bbox(bboxes[i], -1, -1);
+      // draw pose key-points
+      sess.add_pose(poses[i].point, poses[i].score, poses[i].length, FLAGS_pose_thr);
     }
-    for (size_t i = 0; i < kpts.size(); i += 2) {
-      if (used[i / 2]) {
-        cv::Point p(kpts[i], kpts[i + 1]);
-        cv::circle(frame, p, 1, palette[point_color[i / 2]], 2, cv::LINE_AA);
-      }
+
+    if (!output.write(sess.get())) {
+      break;
     }
   }
-  return frame;
+  return 0;
 }
diff --git a/demo/csrc/cpp/detector.cxx b/demo/csrc/cpp/detector.cxx
index 7009d2fd21..f4f8407a83 100644
--- a/demo/csrc/cpp/detector.cxx
+++ b/demo/csrc/cpp/detector.cxx
@@ -1,69 +1,56 @@
 #include "mmdeploy/detector.hpp"
 
-#include <opencv2/imgcodecs/imgcodecs.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
 #include <string>
 
-int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  object_detection device_name model_path image_path\n");
-    return 1;
-  }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto image_path = argv[3];
-  cv::Mat img = cv::imread(image_path);
-  if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
-    return 1;
-  }
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
 
-  mmdeploy::Model model(model_path);
-  mmdeploy::Detector detector(model, mmdeploy::Device{device_name, 0});
+DEFINE_ARG_string(model, "Object detection model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
 
-  auto dets = detector.Apply(img);
+DEFINE_string(output, "detection_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
 
-  fprintf(stdout, "bbox_count=%d\n", (int)dets.size());
+DEFINE_double(det_thr, 0.5, "Detection score threshold");
 
-  for (int i = 0; i < dets.size(); ++i) {
-    const auto& box = dets[i].bbox;
-    const auto& mask = dets[i].mask;
-
-    fprintf(stdout, "box %d, left=%.2f, top=%.2f, right=%.2f, bottom=%.2f, label=%d, score=%.4f\n",
-            i, box.left, box.top, box.right, box.bottom, dets[i].label_id, dets[i].score);
-
-    // skip detections with invalid bbox size (bbox height or width < 1)
-    if ((box.right - box.left) < 1 || (box.bottom - box.top) < 1) {
-      continue;
-    }
+int main(int argc, char* argv[]) {
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
+  }
 
-    // skip detections less than specified score threshold
-    if (dets[i].score < 0.3) {
-      continue;
+  // utils for handling input/output of images/videos
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
+  // util for visualization
+  utils::Visualize v(FLAGS_output_size);
+
+  /// ! construct a detector instance
+  mmdeploy::Detector detector(mmdeploy::Model(ARGS_model), mmdeploy::Device{FLAGS_device, 0});
+
+  for (const cv::Mat& img : input) {
+    /// ! apply detector on the image
+    mmdeploy::Detector::Result dets = detector.Apply(img);
+
+    // visualize detection results
+    auto sess = v.get_session(img);
+    for (const mmdeploy_detection_t& det : dets) {
+      auto& bbox = det.bbox;
+      auto bbox_w = bbox.right - bbox.left;
+      auto bbox_h = bbox.bottom - bbox.top;
+      if (bbox_w > 1 && bbox_h > 1 && det.score > FLAGS_det_thr) {  // filter bboxes
+        sess.add_det(det.bbox, det.label_id, det.score, det.mask);
+      }
     }
 
-    // generate mask overlay if model exports masks
-    if (mask != nullptr) {
-      fprintf(stdout, "mask %d, height=%d, width=%d\n", i, mask->height, mask->width);
-
-      cv::Mat imgMask(mask->height, mask->width, CV_8UC1, &mask->data[0]);
-      auto x0 = std::max(std::floor(box.left) - 1, 0.f);
-      auto y0 = std::max(std::floor(box.top) - 1, 0.f);
-      cv::Rect roi((int)x0, (int)y0, mask->width, mask->height);
-
-      // split the RGB channels, overlay mask to a specific color channel
-      cv::Mat ch[3];
-      split(img, ch);
-      int col = 0;  // int col = i % 3;
-      cv::bitwise_or(imgMask, ch[col](roi), ch[col](roi));
-      merge(ch, 3, img);
+    // write visualization to output
+    if (!output.write(sess.get())) {
+      // user request exit
+      break;
     }
-
-    cv::rectangle(img, cv::Point{(int)box.left, (int)box.top},
-                  cv::Point{(int)box.right, (int)box.bottom}, cv::Scalar{0, 255, 0});
   }
 
-  cv::imwrite("output_detection.png", img);
-
   return 0;
 }
diff --git a/demo/csrc/cpp/pose_tracker.cpp b/demo/csrc/cpp/pose_tracker.cpp
index 077b3d39ea..5f4ea14b5c 100644
--- a/demo/csrc/cpp/pose_tracker.cpp
+++ b/demo/csrc/cpp/pose_tracker.cpp
@@ -2,179 +2,48 @@
 
 #include "mmdeploy/pose_tracker.hpp"
 
-#include <iostream>
-
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/imgcodecs/imgcodecs.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-#include "opencv2/videoio/videoio.hpp"
-
-struct Args {
-  std::string device;
-  std::string det_model;
-  std::string pose_model;
-  std::string video;
-  std::string output_dir;
-};
-
-Args ParseArgs(int argc, char* argv[]);
-
-using std::vector;
-using namespace mmdeploy;
-
-bool Visualize(cv::Mat frame, const PoseTracker::Result& result, int size,
-               const std::string& output_dir, int frame_id, bool with_bbox);
+#include "pose_tracker_utils.hpp"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
 
 int main(int argc, char* argv[]) {
-  auto args = ParseArgs(argc, argv);
-  if (args.device.empty()) {
-    return 0;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
 
   // create pose tracker pipeline
-  PoseTracker tracker(Model(args.det_model), Model(args.pose_model), Context{Device{args.device}});
-
-  // set parameters
-  PoseTracker::Params params;
-  params->det_min_bbox_size = 100;
-  params->det_interval = 1;
-  params->pose_max_num_bboxes = 6;
+  mmdeploy::PoseTracker tracker(mmdeploy::Model(ARGS_det_model), mmdeploy::Model(ARGS_pose_model),
+                                mmdeploy::Device{FLAGS_device});
 
-  // optionally use OKS for keypoints similarity comparison
-  std::array<float, 17> sigmas{0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072,
-                               0.062, 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089};
-  params->keypoint_sigmas = sigmas.data();
-  params->keypoint_sigmas_size = sigmas.size();
+  mmdeploy::PoseTracker::Params params;
+  // initialize tracker params with program arguments
+  InitTrackerParams(params);
 
   // create a tracker state for each video
-  PoseTracker::State state = tracker.CreateState(params);
+  mmdeploy::PoseTracker::State state = tracker.CreateState(params);
 
-  cv::VideoCapture video;
-  if (args.video.size() == 1 && std::isdigit(args.video[0])) {
-    video.open(std::stoi(args.video));  // open by camera index
-  } else {
-    video.open(args.video);  // open video file
-  }
-  if (!video.isOpened()) {
-    std::cerr << "failed to open video: " << args.video << "\n";
-  }
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  cv::Mat frame;
-  int frame_id = 0;
-  while (true) {
-    video >> frame;
-    if (!frame.data) {
-      break;
-    }
+  utils::Visualize v(FLAGS_output_size);
+  v.set_skeleton(utils::Skeleton::get(FLAGS_skeleton));
+
+  for (const cv::Mat& frame : input) {
     // apply the pipeline with the tracker state and video frame
-    auto result = tracker.Apply(state, frame);
+    mmdeploy::PoseTracker::Result result = tracker.Apply(state, frame);
+
     // visualize the results
-    if (!Visualize(frame, result, 1280, args.output_dir, frame_id++, false)) {
-      break;
+    auto sess = v.get_session(frame);
+    for (const mmdeploy_pose_tracker_target_t& target : result) {
+      sess.add_pose(target.keypoints, target.scores, target.keypoint_count, FLAGS_pose_kpt_thr);
     }
-  }
 
-  return 0;
-}
-
-struct Skeleton {
-  vector<std::pair<int, int>> skeleton;
-  vector<cv::Scalar> palette;
-  vector<int> link_color;
-  vector<int> point_color;
-};
-
-const Skeleton& gCocoSkeleton() {
-  static const Skeleton inst{
-      {
-          {15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12},
-          {5, 6},   {5, 7},   {6, 8},   {7, 9},   {8, 10},  {1, 2},  {0, 1},
-          {0, 2},   {1, 3},   {2, 4},   {3, 5},   {4, 6},
-      },
-      {
-          {255, 128, 0},   {255, 153, 51},  {255, 178, 102}, {230, 230, 0},   {255, 153, 255},
-          {153, 204, 255}, {255, 102, 255}, {255, 51, 255},  {102, 178, 255}, {51, 153, 255},
-          {255, 153, 153}, {255, 102, 102}, {255, 51, 51},   {153, 255, 153}, {102, 255, 102},
-          {51, 255, 51},   {0, 255, 0},     {0, 0, 255},     {255, 0, 0},     {255, 255, 255},
-      },
-      {0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16},
-      {16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0},
-  };
-  return inst;
-}
-
-bool Visualize(cv::Mat frame, const PoseTracker::Result& result, int size,
-               const std::string& output_dir, int frame_id, bool with_bbox) {
-  auto& [skeleton, palette, link_color, point_color] = gCocoSkeleton();
-  auto scale = (float)size / (float)std::max(frame.cols, frame.rows);
-  if (scale != 1) {
-    cv::resize(frame, frame, {}, scale, scale);
-  } else {
-    frame = frame.clone();
-  }
-  auto draw_bbox = [&](std::array<float, 4> bbox, const cv::Scalar& color) {
-    std::for_each(bbox.begin(), bbox.end(), [&](auto& x) { x *= scale; });
-    cv::rectangle(frame, cv::Point2f(bbox[0], bbox[1]), cv::Point2f(bbox[2], bbox[3]), color);
-  };
-  for (const auto& r : result) {
-    vector<float> kpts(&r.keypoints[0].x, &r.keypoints[0].x + r.keypoint_count * 2);
-    vector<float> scores(r.scores, r.scores + r.keypoint_count);
-    std::for_each(kpts.begin(), kpts.end(), [&](auto& x) { x *= scale; });
-    constexpr auto score_thr = .5f;
-    vector<int> used(kpts.size());
-    for (size_t i = 0; i < skeleton.size(); ++i) {
-      auto [u, v] = skeleton[i];
-      if (scores[u] > score_thr && scores[v] > score_thr) {
-        used[u] = used[v] = 1;
-        cv::Point2f p_u(kpts[u * 2], kpts[u * 2 + 1]);
-        cv::Point2f p_v(kpts[v * 2], kpts[v * 2 + 1]);
-        cv::line(frame, p_u, p_v, palette[link_color[i]], 1, cv::LINE_AA);
-      }
-    }
-    for (size_t i = 0; i < kpts.size(); i += 2) {
-      if (used[i / 2]) {
-        cv::Point2f p(kpts[i], kpts[i + 1]);
-        cv::circle(frame, p, 1, palette[point_color[i / 2]], 2, cv::LINE_AA);
-      }
-    }
-    if (with_bbox) {
-      draw_bbox((std::array<float, 4>&)r.bbox, cv::Scalar(0, 255, 0));
+    // write to output stream
+    if (!output.write(sess.get())) {
+      // user requested exit by pressing 'q'
+      break;
     }
   }
-  if (output_dir.empty()) {
-    cv::imshow("pose_tracker", frame);
-    return cv::waitKey(1) != 'q';
-  }
-  auto join = [](const std::string& a, const std::string& b) {
-#if _MSC_VER
-    return a + "\\" + b;
-#else
-    return a + "/" + b;
-#endif
-  };
-  cv::imwrite(join(output_dir, std::to_string(frame_id) + ".jpg"), frame,
-              {cv::IMWRITE_JPEG_QUALITY, 90});
-  return true;
-}
 
-Args ParseArgs(int argc, char* argv[]) {
-  if (argc < 5 || argc > 6) {
-    std::cout << R"(Usage: pose_tracker device_name det_model pose_model video [output]
-  device_name  device name for execution, e.g. "cpu", "cuda"
-  det_model    object detection model path
-  pose_model   pose estimation model path
-  video        video path or camera index
-  output       output directory, will cv::imshow if omitted
-)";
-    return {};
-  }
-  Args args;
-  args.device = argv[1];
-  args.det_model = argv[2];
-  args.pose_model = argv[3];
-  args.video = argv[4];
-  if (argc == 6) {
-    args.output_dir = argv[5];
-  }
-  return args;
+  return 0;
 }
diff --git a/demo/csrc/cpp/pose_tracker_utils.hpp b/demo/csrc/cpp/pose_tracker_utils.hpp
new file mode 100644
index 0000000000..b2163ab2a8
--- /dev/null
+++ b/demo/csrc/cpp/pose_tracker_utils.hpp
@@ -0,0 +1,61 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_POSE_TRACKER_UTILS_HPP
+#define MMDEPLOY_POSE_TRACKER_UTILS_HPP
+
+#include <iostream>
+#include <map>
+
+#include "mmdeploy/pose_tracker.hpp"
+#include "utils/argparse.h"
+
+DEFINE_ARG_string(det_model, "object detection model path");
+DEFINE_ARG_string(pose_model, "pose estimation model path");
+DEFINE_ARG_string(input, "input video path or camera index");
+
+DEFINE_string(device, "cpu", "Device name, e.g. \"cpu\", \"cuda\"");
+DEFINE_string(output, "", "Output video path or format string; use `cv::imshow` if empty");
+DEFINE_int32(output_size, 0, "Long-edge of output frames");
+DEFINE_int32(delay, 1, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+
+DEFINE_string(skeleton, "coco", "Path to skeleton data");
+
+DEFINE_int32(det_interval, 1, "Detection interval");
+DEFINE_int32(det_label, 0, "Detection label use for pose estimation");
+DEFINE_double(det_thr, 0.5, "Detection score threshold");
+DEFINE_double(det_min_bbox_size, -1, "Detection minimum bbox size");
+DEFINE_double(det_nms_thr, .7,
+              "NMS IOU threshold for merging detected bboxes and bboxes from tracked targets");
+
+DEFINE_int32(pose_max_num_bboxes, -1, "Max number of bboxes used for pose estimation per frame");
+DEFINE_double(pose_kpt_thr, .5, "Threshold for visible key-points");
+DEFINE_int32(pose_min_keypoints, -1,
+             "Min number of key-points for valid poses, -1 indicates ceil(n_kpts/2)");
+DEFINE_double(pose_bbox_scale, 1.25, "Scale for expanding key-points to bbox");
+DEFINE_double(
+    pose_min_bbox_size, -1,
+    "Min pose bbox size, tracks with bbox size smaller than the threshold will be dropped");
+DEFINE_double(pose_nms_thr, 0.5,
+              "NMS OKS/IOU threshold for suppressing overlapped poses, useful when multiple pose "
+              "estimations collapse to the same target");
+
+DEFINE_double(track_iou_thr, 0.4, "IOU threshold for associating missing tracks");
+DEFINE_int32(track_max_missing, 10,
+             "Max number of missing frames before a missing tracks is removed");
+
+void InitTrackerParams(mmdeploy::PoseTracker::Params& params) {
+  params->det_interval = FLAGS_det_interval;
+  params->det_label = FLAGS_det_label;
+  params->det_thr = FLAGS_det_thr;
+  params->det_min_bbox_size = FLAGS_det_min_bbox_size;
+  params->pose_max_num_bboxes = FLAGS_pose_max_num_bboxes;
+  params->pose_kpt_thr = FLAGS_pose_kpt_thr;
+  params->pose_min_keypoints = FLAGS_pose_min_keypoints;
+  params->pose_bbox_scale = FLAGS_pose_bbox_scale;
+  params->pose_min_bbox_size = FLAGS_pose_min_bbox_size;
+  params->pose_nms_thr = FLAGS_pose_nms_thr;
+  params->track_iou_thr = FLAGS_track_iou_thr;
+  params->track_max_missing = FLAGS_track_max_missing;
+}
+
+#endif  // MMDEPLOY_POSE_TRACKER_UTILS_HPP
diff --git a/demo/csrc/cpp/restorer.cxx b/demo/csrc/cpp/restorer.cxx
index 7c3eefd82c..62612834ea 100644
--- a/demo/csrc/cpp/restorer.cxx
+++ b/demo/csrc/cpp/restorer.cxx
@@ -2,33 +2,48 @@
 
 #include "mmdeploy/restorer.hpp"
 
-#include <opencv2/imgcodecs/imgcodecs.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
-#include <string>
+#include "opencv2/imgproc/imgproc.hpp"
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
+
+DEFINE_ARG_string(model, "Super-resolution model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
+
+DEFINE_string(output, "upsampled_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 0, "Long-edge of output frames");
+DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
 
 int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  image_restorer device_name model_path image_path\n");
-    return 1;
-  }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto image_path = argv[3];
-  cv::Mat img = cv::imread(image_path);
-  if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
 
-  using namespace mmdeploy;
+  // utils for handling input/output of images/videos
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  Restorer restorer{Model{model_path}, Device{device_name}};
+  // util for visualization
+  utils::Visualize v(FLAGS_output_size);
 
-  auto result = restorer.Apply(img);
+  /// ! construct a restorer instance
+  mmdeploy::Restorer restorer{mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}};
 
-  cv::Mat sr_img(result->height, result->width, CV_8UC3, result->data);
-  cv::cvtColor(sr_img, sr_img, cv::COLOR_RGB2BGR);
-  cv::imwrite("output_restorer.bmp", sr_img);
+  for (const cv::Mat& img : input) {
+    /// ! apply restorer to the image
+    mmdeploy::Restorer::Result result = restorer.Apply(img);
+
+    // convert to BGR
+    cv::Mat upsampled(result->height, result->width, CV_8UC3, result->data);
+    cv::cvtColor(upsampled, upsampled, cv::COLOR_RGB2BGR);
+
+    // optionally rescale to output size & write to output
+    if (!output.write(v.get_session(upsampled).get())) {
+      // user request exit
+      break;
+    }
+  }
 
   return 0;
 }
diff --git a/demo/csrc/cpp/rotated_detector.cxx b/demo/csrc/cpp/rotated_detector.cxx
index d590273d32..9fffe48214 100644
--- a/demo/csrc/cpp/rotated_detector.cxx
+++ b/demo/csrc/cpp/rotated_detector.cxx
@@ -1,51 +1,43 @@
 
 #include "mmdeploy/rotated_detector.hpp"
 
-#include <opencv2/imgcodecs.hpp>
-#include <opencv2/imgproc.hpp>
-#include <string>
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
+
+DEFINE_ARG_string(model, "Detection model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
+
+DEFINE_string(output, "detector_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_int32(delay, 0, "timeout for user input when showing images");
+
+DEFINE_double(det_thr, 0.5, "Detection score threshold");
 
 int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  oriented_object_detection device_name model_path image_path\n");
-    return 1;
-  }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto image_path = argv[3];
-  cv::Mat img = cv::imread(image_path);
-  if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
 
-  mmdeploy::Model model(model_path);
-  mmdeploy::RotatedDetector detector(model, mmdeploy::Device{device_name, 0});
+  mmdeploy::Model model(ARGS_model);
+  mmdeploy::RotatedDetector detector(model, mmdeploy::Device{FLAGS_device, 0});
+
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  auto dets = detector.Apply(img);
+  utils::Visualize v(FLAGS_output_size);
 
-  for (const auto& det : dets) {
-    if (det.score < 0.1) {
-      continue;
+  for (const cv::Mat& img : input) {
+    mmdeploy::RotatedDetector::Result dets = detector.Apply(img);
+    auto sess = v.get_session(img);
+    for (const mmdeploy_rotated_detection_t& det : dets) {
+      if (det.score > FLAGS_det_thr) {
+        sess.add_rotated_det(det.rbbox, det.label_id, det.score);
+      }
     }
-    auto& rbbox = det.rbbox;
-    float xc = rbbox[0];
-    float yc = rbbox[1];
-    float w = rbbox[2];
-    float h = rbbox[3];
-    float ag = rbbox[4];
-    float wx = w / 2 * std::cos(ag);
-    float wy = w / 2 * std::sin(ag);
-    float hx = -h / 2 * std::sin(ag);
-    float hy = h / 2 * std::cos(ag);
-    cv::Point p1 = {int(xc - wx - hx), int(yc - wy - hy)};
-    cv::Point p2 = {int(xc + wx - hx), int(yc + wy - hy)};
-    cv::Point p3 = {int(xc + wx + hx), int(yc + wy + hy)};
-    cv::Point p4 = {int(xc - wx + hx), int(yc - wy + hy)};
-    cv::drawContours(img, std::vector<std::vector<cv::Point>>{{p1, p2, p3, p4}}, -1, {0, 255, 0},
-                     2);
+    output.write(sess.get());
   }
-  cv::imwrite("output_rotated_detection.png", img);
 
   return 0;
 }
diff --git a/demo/csrc/cpp/segmentor.cxx b/demo/csrc/cpp/segmentor.cxx
index 0c1dde49d6..2f56093152 100644
--- a/demo/csrc/cpp/segmentor.cxx
+++ b/demo/csrc/cpp/segmentor.cxx
@@ -2,57 +2,58 @@
 
 #include "mmdeploy/segmentor.hpp"
 
-#include <fstream>
-#include <opencv2/imgcodecs/imgcodecs.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
 #include <random>
 #include <string>
 #include <vector>
 
-using namespace std;
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
 
-vector<cv::Vec3b> gen_palette(int num_classes) {
-  std::mt19937 gen;
-  std::uniform_int_distribution<ushort> uniform_dist(0, 255);
+DEFINE_ARG_string(model, "Object detection model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
 
-  vector<cv::Vec3b> palette;
-  palette.reserve(num_classes);
-  for (auto i = 0; i < num_classes; ++i) {
-    palette.emplace_back(uniform_dist(gen), uniform_dist(gen), uniform_dist(gen));
-  }
-  return palette;
-}
+DEFINE_string(output, "segmentation_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+
+std::vector<cv::Vec3b> gen_palette(int num_classes = 256);
 
 int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  image_segmentation device_name model_path image_path\n");
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto image_path = argv[3];
-  cv::Mat img = cv::imread(image_path);
-  if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
-    return 1;
-  }
-
-  using namespace mmdeploy;
 
-  Segmentor segmentor{Model{model_path}, Device{device_name}};
+  mmdeploy::Segmentor segmentor{mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}};
 
-  auto result = segmentor.Apply(img);
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  auto palette = gen_palette(result->classes + 1);
+  utils::Visualize v(FLAGS_output_size);
+  v.set_palette(gen_palette());
 
-  cv::Mat color_mask = cv::Mat::zeros(result->height, result->width, CV_8UC3);
-  int pos = 0;
-  for (auto iter = color_mask.begin<cv::Vec3b>(); iter != color_mask.end<cv::Vec3b>(); ++iter) {
-    *iter = palette[result->mask[pos++]];
+  for (const cv::Mat& img : input) {
+    mmdeploy::Segmentor::Result result = segmentor.Apply(img);
+    auto sess = v.get_session(img);
+    mmdeploy_segmentation_t& seg = *result;
+    sess.add_mask(seg.height, seg.width, seg.classes, seg.mask, nullptr);
+    if (!output.write(sess.get())) {
+      break;
+    }
   }
 
-  img = img * 0.5 + color_mask * 0.5;
-  cv::imwrite("output_segmentation.png", img);
-
   return 0;
 }
+
+std::vector<cv::Vec3b> gen_palette(int num_classes) {
+  std::mt19937 gen{};  // NOLINT
+  std::uniform_int_distribution<ushort> uniform_dist(0, 255);
+
+  std::vector<cv::Vec3b> palette;
+  palette.reserve(num_classes);
+  for (auto i = 0; i < num_classes; ++i) {
+    palette.emplace_back(uniform_dist(gen), uniform_dist(gen), uniform_dist(gen));
+  }
+  return palette;
+}
diff --git a/demo/csrc/cpp/text_ocr.cxx b/demo/csrc/cpp/text_ocr.cxx
index 853d681071..b7c8358c58 100644
--- a/demo/csrc/cpp/text_ocr.cxx
+++ b/demo/csrc/cpp/text_ocr.cxx
@@ -1,46 +1,51 @@
 
-#include <opencv2/imgcodecs/imgcodecs.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
 #include <string>
 
 #include "mmdeploy/text_detector.hpp"
 #include "mmdeploy/text_recognizer.hpp"
+#include "utils/argparse.h"
+#include "utils/mediaio.h"
+#include "utils/visualize.h"
+
+DEFINE_ARG_string(det_model, "Text detection model path");
+DEFINE_ARG_string(reg_model, "Text recognition model path");
+DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
+
+DEFINE_string(output, "segmentation_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+
+using mmdeploy::TextDetector;
+using mmdeploy::TextRecognizer;
 
 int main(int argc, char* argv[]) {
-  if (argc != 5) {
-    fprintf(stderr, "usage:\n  ocr device_name det_model_path reg_model_path image_path\n");
-    return 1;
-  }
-  const auto device_name = argv[1];
-  auto det_model_path = argv[2];
-  auto reg_model_path = argv[3];
-  auto image_path = argv[4];
-  cv::Mat img = cv::imread(image_path);
-  if (!img.data) {
-    fprintf(stderr, "failed to load image: %s\n", image_path);
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
 
-  using namespace mmdeploy;
+  mmdeploy::Device device(FLAGS_device);
+  TextDetector detector{mmdeploy::Model(ARGS_det_model), device};
+  TextRecognizer recognizer{mmdeploy::Model(ARGS_reg_model), device};
 
-  TextDetector detector{Model(det_model_path), Device(device_name)};
-  TextRecognizer recognizer{Model(reg_model_path), Device(device_name)};
+  utils::mediaio::Input input(ARGS_input);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
 
-  auto bboxes = detector.Apply(img);
-  auto texts = recognizer.Apply(img, {bboxes.begin(), bboxes.size()});
+  utils::Visualize v(FLAGS_output_size);
 
-  for (int i = 0; i < bboxes.size(); ++i) {
-    fprintf(stdout, "box[%d]: %s\n", i, texts[i].text);
-    std::vector<cv::Point> poly_points;
-    for (const auto& pt : bboxes[i].bbox) {
-      fprintf(stdout, "x: %.2f, y: %.2f, ", pt.x, pt.y);
-      poly_points.emplace_back((int)pt.x, (int)pt.y);
+  for (const cv::Mat& img : input) {
+    TextDetector::Result bboxes = detector.Apply(img);
+    TextRecognizer::Result texts = recognizer.Apply(img, {bboxes.begin(), bboxes.size()});
+    auto sess = v.get_session(img);
+    for (size_t i = 0; i < bboxes.size(); ++i) {
+      mmdeploy_text_detection_t& bbox = bboxes[i];
+      mmdeploy_text_recognition_t& text = texts[i];
+      sess.add_text_det(bbox.bbox, bbox.score, text.text, text.length);
+    }
+    if (!output.write(sess.get())) {
+      break;
     }
-    fprintf(stdout, "\n");
-    cv::polylines(img, poly_points, true, cv::Scalar{0, 255, 0});
   }
 
-  cv::imwrite("output_ocr.png", img);
-
   return 0;
 }
diff --git a/demo/csrc/cpp/utils/argparse.h b/demo/csrc/cpp/utils/argparse.h
new file mode 100644
index 0000000000..be9e9dff3b
--- /dev/null
+++ b/demo/csrc/cpp/utils/argparse.h
@@ -0,0 +1,255 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_ARGPARSE_H
+#define MMDEPLOY_ARGPARSE_H
+
+#include <algorithm>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#define DEFINE_int32(name, init, msg) _MMDEPLOY_DEFINE_FLAG(int32_t, name, init, msg)
+#define DEFINE_double(name, init, msg) _MMDEPLOY_DEFINE_FLAG(double, name, init, msg)
+#define DEFINE_string(name, init, msg) _MMDEPLOY_DEFINE_FLAG(std::string, name, init, msg)
+
+#define DEFINE_ARG_int32(name, msg) _MMDEPLOY_DEFINE_ARG(int32_t, name, msg)
+#define DEFINE_ARG_double(name, msg) _MMDEPLOY_DEFINE_ARG(double, name, msg)
+#define DEFINE_ARG_string(name, msg) _MMDEPLOY_DEFINE_ARG(std::string, name, msg)
+
+namespace utils {
+
+class ArgParse {
+ public:
+  template <typename T>
+  static T Register(const std::string& type, const std::string& name, T init,
+                    const std::string& msg, void* ptr) {
+    instance()._Register(type, name, msg, true, ptr);
+    return init;
+  }
+
+  template <typename T>
+  static T Register(const std::string& type, const std::string& name, const std::string& msg,
+                    void* ptr) {
+    instance()._Register(type, name, msg, false, ptr);
+    return {};
+  }
+
+  static bool ParseArguments(int argc, char* argv[]) {
+    if (!instance()._Parse(argc, argv)) {
+      ShowUsageWithFlags(argv[0]);
+      return false;
+    }
+    return true;
+  }
+
+  static void ShowUsageWithFlags(const char* argv0) { instance()._ShowUsageWithFlags(argv0); }
+
+ private:
+  static ArgParse& instance() {
+    static ArgParse inst;
+    return inst;
+  }
+
+  struct Info {
+    std::string name;
+    std::string type;
+    std::string msg;
+    bool is_flag;
+    void* ptr;
+  };
+
+  void _Register(std::string type, const std::string& name, const std::string& msg, bool is_flag,
+                 void* ptr) {
+    if (type == "std::string") {
+      type = "string";
+    } else if (type == "int32_t") {
+      type = "int32";
+    }
+    infos_.push_back({name, type, msg, is_flag, ptr});
+  }
+
+  bool _Parse(int argc, char* argv[]) {
+    int arg_idx{-1};
+    std::vector<const char*> args(infos_.size());
+    for (int i = 1; i < argc; ++i) {
+      if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+        return false;
+      }
+      if (argv[i][0] == '-' && argv[i][1] == '-') {
+        bool found{};
+        for (int j = 0; j < infos_.size(); ++j) {
+          auto& flag = infos_[j];
+          auto name = "--" + flag.name;
+          if (strncmp(argv[i], name.c_str(), name.size()) == 0) {
+            if (argv[i][name.size()] == '=') {
+              args[j] = argv[i] + name.size() + 1;
+            } else if (i + 1 < argc) {
+              args[j] = argv[++i];
+            }
+            found = true;
+            break;
+          }
+        }
+        if (!found) {
+          std::cout << "error: unknown option: " << argv[i] << std::endl;
+        }
+      } else {
+        for (arg_idx++; arg_idx < infos_.size(); ++arg_idx) {
+          if (!infos_[arg_idx].is_flag) {
+            args[arg_idx] = argv[i];
+            break;
+          }
+        }
+        if (arg_idx == infos_.size()) {
+          std::cout << "error: unknown argument: " << argv[i] << std::endl;
+          return false;
+        }
+      }
+    }
+    std::vector<std::string> missing;
+    for (arg_idx++; arg_idx < infos_.size(); ++arg_idx) {
+      if (!infos_[arg_idx].is_flag) {
+        missing.push_back(infos_[arg_idx].name);
+      }
+    }
+    if (!missing.empty()) {
+      std::cout << "error: the following arguments are required:";
+      for (int i = 0; i < missing.size(); ++i) {
+        std::cout << " " << missing[i];
+        if (i != missing.size() - 1) {
+          std::cout << ",";
+        }
+      }
+      std::cout << "\n";
+      return false;
+    }
+
+    for (int i = 0; i < infos_.size(); ++i) {
+      if (args[i]) {
+        try {
+          parse_str(infos_[i], args[i]);
+        } catch (...) {
+          std::cout << "error: failed to parse " << infos_[i].name << ": " << args[i] << std::endl;
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  static void parse_str(Info& info, const std::string& str) {
+    if (info.type == "int32") {
+      *static_cast<int32_t*>(info.ptr) = std::stoi(str);
+    } else if (info.type == "double") {
+      *static_cast<double*>(info.ptr) = std::stod(str);
+    } else if (info.type == "string") {
+      *static_cast<std::string*>(info.ptr) = str;
+    } else {
+      // pass
+    }
+  }
+
+  static std::string get_default_str(const Info& info) {
+    if (info.type == "int32") {
+      return std::to_string(*static_cast<int32_t*>(info.ptr));
+    } else if (info.type == "double") {
+      std::ostringstream os;
+      os << std::setprecision(3) << *static_cast<double*>(info.ptr);
+      return os.str();
+    } else if (info.type == "string") {
+      return "\"" + *(static_cast<std::string*>(info.ptr)) + "\"";
+    } else {
+      return "<unknown type>";
+    }
+  }
+
+  void _ShowUsageWithFlags(const char* argv0) const {
+    ShowUsage(argv0);
+    static constexpr const auto kLineLength = 80;
+    std::cout << std::endl;
+    int max_name_length = 0;
+    for (const auto& info : infos_) {
+      max_name_length = std::max(max_name_length, (int)info.name.length());
+    }
+    max_name_length += 4;
+    auto name_col_size = max_name_length + 1;
+    auto msg_col_size = kLineLength - name_col_size;
+    std::cout << "required arguments:\n";
+    ShowFlags(name_col_size, msg_col_size, false);
+    std::cout << std::endl;
+    std::cout << "optional arguments:\n";
+    ShowFlags(name_col_size, msg_col_size, true);
+  }
+
+  void ShowFlags(int name_col_size, int msg_col_size, bool is_flag) const {
+    for (const auto& info : infos_) {
+      if (info.is_flag != is_flag) {
+        continue;
+      }
+      std::string name = "  ";
+      if (info.is_flag) {
+        name.append("--");
+      }
+      name.append(info.name);
+      while (name.length() < name_col_size) {
+        name.append(" ");
+      }
+      std::cout << name;
+      std::string msg = info.msg;
+      while (msg.length() > msg_col_size) {  // insert line-breaks when msg is too long
+        auto pos = msg.rend() - std::find(std::make_reverse_iterator(msg.begin() + msg_col_size),
+                                          msg.rend(), ' ');
+        std::cout << msg.substr(0, pos - 1) << std::endl;
+        std::cout << std::string(name_col_size, ' ');
+        msg = msg.substr(pos);
+      }
+      std::cout << msg;
+      std::string type;
+      type.append("[").append(info.type);
+      if (info.is_flag) {
+        type.append(" = ").append(get_default_str(info));
+      }
+      type.append("]");
+      if (msg.length() + type.length() + 1 > msg_col_size) {
+        std::cout << std::endl << std::string(name_col_size, ' ') << type;
+      } else {
+        std::cout << " " << type;
+      }
+      std::cout << std::endl;
+    }
+  }
+
+  void ShowUsage(const char* argv0) const {
+    for (auto p = argv0; *p; ++p) {
+      if (*p == '/' || *p == '\'') {
+        argv0 = p + 1;
+      }
+    }
+    std::cout << "Usage: " << argv0 << " [options]";
+    for (const auto& info : infos_) {
+      if (!info.is_flag) {
+        std::cout << " " << info.name;
+      }
+    }
+    std::cout << std::endl;
+  }
+
+ private:
+  std::vector<Info> infos_;
+};
+
+inline bool ParseArguments(int argc, char* argv[]) { return ArgParse::ParseArguments(argc, argv); }
+
+}  // namespace utils
+
+#define _MMDEPLOY_DEFINE_FLAG(type, name, init, msg) \
+  type FLAGS_##name = ::utils::ArgParse::Register(#type, #name, type(init), msg, &FLAGS_##name)
+
+#define _MMDEPLOY_DEFINE_ARG(type, name, msg) \
+  type ARGS_##name = ::utils::ArgParse::Register<type>(#type, #name, msg, &ARGS_##name)
+
+#endif  // MMDEPLOY_ARGPARSE_H
diff --git a/demo/csrc/cpp/utils/mediaio.h b/demo/csrc/cpp/utils/mediaio.h
new file mode 100644
index 0000000000..e9e47e2f0a
--- /dev/null
+++ b/demo/csrc/cpp/utils/mediaio.h
@@ -0,0 +1,308 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_MEDIAIO_H
+#define MMDEPLOY_MEDIAIO_H
+
+#include <fstream>
+#include <set>
+
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgcodecs/imgcodecs.hpp"
+#include "opencv2/videoio/videoio.hpp"
+
+namespace utils {
+namespace mediaio {
+
+enum class MediaType { kUnknown, kImage, kVideo, kImageList, kWebcam, kFmtStr, kShow };
+
+namespace detail {
+
+static std::string get_extension(const std::string& path) {
+  std::string ext;
+  for (auto i = (int)path.size() - 1; i >= 0; --i) {
+    if (path[i] == '.') {
+      ext.push_back(path[i]);
+      for (++i; i < path.size(); ++i) {
+        ext.push_back((char)std::tolower((unsigned char)path[i]));
+      }
+      return ext;
+    }
+  }
+  return {};
+}
+
+static bool is_video(const std::string& ext) {
+  static const std::set<std::string> es{".mp4", ".avi", ".mkv", ".webm", ".mov", ".mpg", ".wmv"};
+  return es.count(ext);
+}
+
+static bool is_list(const std::string& ext) {
+  static const std::set<std::string> es{".txt"};
+  return es.count(ext);
+}
+
+static bool is_image(const std::string& ext) {
+  static const std::set<std::string> es{".jpg", ".jpeg", ".png", ".tif", ".tiff",
+                                        ".bmp", ".ppm",  ".pgm", ".webp"};
+  return es.count(ext);
+}
+
+static bool is_fmtstr(const std::string& str) {
+  for (const auto& c : str) {
+    if (c == '%') {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace detail
+
+class Input;
+
+class InputIterator {
+ public:
+  using iterator_category = std::input_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using reference = cv::Mat&;
+  using value_type = reference;
+  using pointer = void;
+
+ public:
+  InputIterator() = default;
+  explicit InputIterator(Input& input) : input_(&input) { next(); }
+  InputIterator& operator++() {
+    next();
+    return *this;
+  }
+  reference operator*() { return frame_; }
+  friend bool operator==(const InputIterator& a, const InputIterator& b) {
+    return &a == &b || a.is_end() == b.is_end();
+  }
+  friend bool operator!=(const InputIterator& a, const InputIterator& b) { return !(a == b); }
+
+ private:
+  void next();
+  bool is_end() const noexcept { return frame_.data != nullptr; }
+
+ private:
+  cv::Mat frame_;
+  Input* input_{};
+};
+
+class Input {
+ public:
+  explicit Input(const std::string& path, MediaType type = MediaType::kUnknown)
+      : path_(path), type_(type) {
+    if (type_ == MediaType::kUnknown) {
+      auto ext = detail::get_extension(path);
+      if (detail::is_image(ext)) {
+        type_ = MediaType::kImage;
+      } else if (detail::is_video(ext)) {
+        type_ = MediaType::kVideo;
+      } else if (path.size() == 1 && std::isdigit((unsigned char)path[0])) {
+        type_ = MediaType::kWebcam;
+      } else if (detail::is_list(ext) || try_image_list(path)) {
+        type_ = MediaType::kImageList;
+      } else if (try_image(path)) {
+        type_ = MediaType::kImage;
+      } else if (try_video(path)) {
+        type_ = MediaType::kVideo;
+      } else {
+        std::cout << "unknown file type: " << path << "\n";
+      }
+    }
+    if (type_ != MediaType::kUnknown) {
+      if (type_ == MediaType::kVideo) {
+        cap_.open(path_);
+        if (!cap_.isOpened()) {
+          std::cerr << "failed to open video file: " << path_ << "\n";
+        }
+      } else if (type_ == MediaType::kWebcam) {
+        cap_.open(std::stoi(path_));
+        if (!cap_.isOpened()) {
+          std::cerr << "failed to open camera index: " << path_ << "\n";
+        }
+        type_ = MediaType::kVideo;
+      } else if (type_ == MediaType::kImage) {
+        items_ = {path_};
+        type_ = MediaType::kImageList;
+      } else if (type_ == MediaType::kImageList) {
+        if (items_.empty()) {
+          items_ = load_image_list(path);
+        }
+      }
+    }
+  }
+  InputIterator begin() { return InputIterator(*this); }
+  InputIterator end() { return {}; }  // NOLINT
+
+  cv::Mat read() {
+    cv::Mat img;
+    if (type_ == MediaType::kVideo) {
+      cap_ >> img;
+    } else if (type_ == MediaType::kImageList) {
+      while (!img.data && index_ < items_.size()) {
+        auto path = items_[index_++];
+        img = cv::imread(path);
+        if (!img.data) {
+          std::cerr << "failed to load image: " << path << "\n";
+        }
+      }
+    }
+    return img;
+  }
+
+ private:
+  static bool try_image(const std::string& path) { return cv::imread(path).data; }
+
+  static bool try_video(const std::string& path) { return cv::VideoCapture(path).isOpened(); }
+
+  static std::vector<std::string> load_image_list(const std::string& path, size_t max_bytes = 0) {
+    std::ifstream ifs(path);
+    ifs.seekg(0, std::ifstream::end);
+    auto size = ifs.tellg();
+    ifs.seekg(0, std::ifstream::beg);
+    if (max_bytes && size > max_bytes) {
+      return {};
+    }
+    auto strip = [](std::string& s) {
+      while (!s.empty() && std::isspace((unsigned char)s.back())) {
+        s.pop_back();
+      }
+    };
+    std::vector<std::string> ret;
+    std::string line;
+    while (std::getline(ifs, line)) {
+      strip(line);
+      if (!line.empty()) {
+        ret.push_back(std::move(line));
+      }
+    }
+    return ret;
+  }
+
+  bool try_image_list(const std::string& path) {
+    auto items = load_image_list(path, 1 << 20);
+    size_t count = 0;
+    for (const auto& item : items) {
+      if (detail::is_image(detail::get_extension(item)) && ++count > items.size() / 10) {
+        items_ = std::move(items);
+        return true;
+      }
+    }
+    return false;
+  }
+
+ private:
+  MediaType type_{MediaType::kUnknown};
+  std::string path_;
+  std::vector<std::string> items_;
+  cv::VideoCapture cap_;
+  size_t index_{};
+};
+
+inline void InputIterator::next() {
+  assert(input_);
+  frame_ = input_->read();
+}
+
+class Output;
+
+class OutputIterator {
+ public:
+  using iterator_category = std::output_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using reference = void;
+  using value_type = void;
+  using pointer = void;
+
+ public:
+  explicit OutputIterator(Output& output) : output_(&output) {}
+
+  OutputIterator& operator=(const cv::Mat& frame);
+
+  OutputIterator& operator*() { return *this; }
+  OutputIterator& operator++() { return *this; }
+  OutputIterator& operator++(int) { return *this; }  // NOLINT
+
+ private:
+  Output* output_{};
+};
+
+class Output {
+ public:
+  explicit Output(const std::string& path, int delay, MediaType type = MediaType::kUnknown)
+      : path_(path), type_(type), delay_(delay) {
+    if (type_ == MediaType::kUnknown) {
+      auto ext = detail::get_extension(path);
+      if (path_.empty()) {
+        type_ = MediaType::kShow;
+      } else if (detail::is_image(ext)) {
+        if (detail::is_fmtstr(path)) {
+          type_ = MediaType::kFmtStr;
+        } else {
+          type_ = MediaType::kImage;
+        }
+      } else if (detail::is_video(ext)) {
+        type_ = MediaType::kVideo;
+      } else {
+        std::cout << "unknown file type: " << path << "\n";
+      }
+    }
+  }
+
+  bool write(const cv::Mat& frame) {
+    bool exit = false;
+    switch (type_) {
+      case MediaType::kImage:
+        cv::imwrite(path_, frame);
+        break;
+      case MediaType::kFmtStr: {
+        char buf[256];
+        snprintf(buf, sizeof(buf), path_.c_str(), frame_id_);
+        cv::imwrite(buf, frame);
+        break;
+      }
+      case MediaType::kVideo:
+        write_video(frame);
+        break;
+      case MediaType::kShow:
+        cv::imshow("", frame);
+        exit = cv::waitKey(delay_) == 'q';
+        break;
+      default:
+        std::cout << "unsupported output media type\n";
+        assert(0);
+    }
+    ++frame_id_;
+    return !exit;
+  }
+
+  OutputIterator inserter() { return OutputIterator{*this}; }
+
+ private:
+  void write_video(const cv::Mat& frame) {
+    if (!video_.isOpened()) {
+      video_.open(path_, cv::VideoWriter::fourcc('M', 'P', '4', 'V'), 30, frame.size());
+    }
+    video_ << frame;
+  }
+
+ private:
+  std::string path_;
+  MediaType type_{MediaType::kUnknown};
+  int delay_{1};
+  size_t frame_id_{0};
+  cv::VideoWriter video_;
+};
+
+OutputIterator& OutputIterator::operator=(const cv::Mat& frame) {
+  assert(output_);
+  output_->write(frame);
+  return *this;
+}
+
+}  // namespace mediaio
+}  // namespace utils
+#endif  // MMDEPLOY_MEDIAIO_H
diff --git a/demo/csrc/cpp/utils/skeleton.h b/demo/csrc/cpp/utils/skeleton.h
new file mode 100644
index 0000000000..650b9089c8
--- /dev/null
+++ b/demo/csrc/cpp/utils/skeleton.h
@@ -0,0 +1,88 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_SKELETON_H
+#define MMDEPLOY_SKELETON_H
+
+#include <fstream>
+#include <opencv2/core/core.hpp>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace utils {
+
+struct Skeleton {
+  std::vector<std::pair<int, int>> links;
+  std::vector<cv::Scalar> palette;
+  std::vector<int> link_colors;
+  std::vector<int> point_colors;
+  static Skeleton get(const std::string& path);
+};
+
+const Skeleton& gCocoSkeleton() {
+  static const Skeleton inst{
+      {
+          {15, 13}, {13, 11}, {16, 14}, {14, 12}, {11, 12}, {5, 11}, {6, 12},
+          {5, 6},   {5, 7},   {6, 8},   {7, 9},   {8, 10},  {1, 2},  {0, 1},
+          {0, 2},   {1, 3},   {2, 4},   {3, 5},   {4, 6},
+      },
+      {
+          {255, 128, 0},   {255, 153, 51},  {255, 178, 102}, {230, 230, 0},   {255, 153, 255},
+          {153, 204, 255}, {255, 102, 255}, {255, 51, 255},  {102, 178, 255}, {51, 153, 255},
+          {255, 153, 153}, {255, 102, 102}, {255, 51, 51},   {153, 255, 153}, {102, 255, 102},
+          {51, 255, 51},   {0, 255, 0},     {0, 0, 255},     {255, 0, 0},     {255, 255, 255},
+      },
+      {0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16},
+      {16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0},
+  };
+  return inst;
+}
+
+// n_links
+// u0, v0, u1, v1, ..., un-1, vn-1
+// n_palette
+// b0, g0, r0, ..., bn-1, gn-1, rn-1
+// n_link_color
+// i0, i1, ..., in-1
+// n_point_color
+// j0, j1, ..., jn-1
+inline Skeleton Skeleton::get(const std::string& path) {
+  if (path == "coco") {
+    return gCocoSkeleton();
+  }
+  std::ifstream ifs(path);
+  if (!ifs.is_open()) {
+    assert(0 && "Failed to skeleton file");
+  }
+  Skeleton skel;
+  int n = 0;
+  ifs >> n;
+  for (int i = 0; i < n; ++i) {
+    int u{}, v{};
+    ifs >> u >> v;
+    skel.links.emplace_back(u, v);
+  }
+  ifs >> n;
+  for (int i = 0; i < n; ++i) {
+    int b{}, g{}, r{};
+    ifs >> b >> g >> r;
+    skel.palette.emplace_back(b, g, r);
+  }
+  ifs >> n;
+  for (int i = 0; i < n; ++i) {
+    int x{};
+    ifs >> x;
+    skel.link_colors.push_back(x);
+  }
+  ifs >> n;
+  for (int i = 0; i < n; ++i) {
+    int x{};
+    ifs >> x;
+    skel.point_colors.push_back(x);
+  }
+  return skel;
+}
+
+}  // namespace utils
+
+#endif  // MMDEPLOY_SKELETON_H
diff --git a/demo/csrc/cpp/utils/visualize.h b/demo/csrc/cpp/utils/visualize.h
new file mode 100644
index 0000000000..ab51ad5696
--- /dev/null
+++ b/demo/csrc/cpp/utils/visualize.h
@@ -0,0 +1,165 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_VISUALIZE_H
+#define MMDEPLOY_VISUALIZE_H
+
+#include <iomanip>
+#include <vector>
+
+#include "opencv2/highgui/highgui.hpp"
+#include "opencv2/imgproc/imgproc.hpp"
+#include "skeleton.h"
+
+namespace utils {
+
+class Visualize {
+ public:
+  class Session {
+   public:
+    explicit Session(Visualize& v, const cv::Mat& frame) : v_(v) {
+      if (v_.size_) {
+        scale_ = (float)v_.size_ / (float)std::max(frame.cols, frame.rows);
+      }
+      if (scale_ != 1) {
+        cv::resize(frame, img_, {}, scale_, scale_);
+      } else {
+        img_ = frame.clone();
+      }
+    }
+
+    void add_label(int label_id, float score) {
+      static constexpr const int font_face = cv::FONT_HERSHEY_SIMPLEX;
+      static constexpr const int thickness = 1;
+      static constexpr const double font_scale = 0.5;
+      int baseline = 0;
+      std::stringstream ss;
+      ss << std::to_string(label_id) << ": " << std::setw(6) << std::fixed << score * 100;
+      auto text_size = cv::getTextSize(ss.str(), font_face, font_scale, thickness, &baseline);
+      offset_ += text_size.height;
+      cv::Point origin(0, offset_);
+      cv::rectangle(img_, origin + cv::Point(0, thickness),
+                    origin + cv::Point(text_size.width, -text_size.height), cv::Scalar{},
+                    cv::FILLED);
+      cv::putText(img_, ss.str(), origin, font_face, font_scale, cv::Scalar::all(255), thickness,
+                  cv::LINE_8);
+      offset_ += thickness;
+    }
+
+    // TODO: show label & score
+    template <typename Mask>
+    void add_det(const mmdeploy_rect_t& rect, int label_id, float score, const Mask* mask) {
+      add_bbox(rect, label_id, score);
+      if (mask) {
+        cv::Mat mask_img(mask->height, mask->width, CV_8UC1, const_cast<char*>(mask->data));
+        auto x0 = std::max(std::floor(rect.left) - 1, 0.f);
+        auto y0 = std::max(std::floor(rect.top) - 1, 0.f);
+        cv::Rect roi((int)x0, (int)y0, mask->width, mask->height);
+
+        // split the RGB channels, overlay mask to a specific color channel
+        cv::Mat ch[3]{};
+        cv::split(img_, ch);
+        int col = 0;  // int col = i % 3;
+        cv::bitwise_or(mask_img, ch[col](roi), ch[col](roi));
+        merge(ch, 3, img_);
+      }
+    }
+
+    // TODO: show label & score
+    void add_bbox(const mmdeploy_rect_t& rect, int label_id, float score) {
+      cv::rectangle(img_, cv::Point2f(rect.left, rect.top), cv::Point2f(rect.right, rect.bottom),
+                    cv::Scalar(0, 255, 0));
+    }
+
+    // TODO: show text
+    void add_text_det(mmdeploy_point_t bbox[4], float score, const char* text, size_t text_size) {
+      std::vector<cv::Point2f> poly_points;
+      for (int i = 0; i < 4; ++i) {
+        poly_points.emplace_back(bbox[i].x * scale_, bbox[i].y * scale_);
+      }
+      cv::polylines(img_, poly_points, true, cv::Scalar{0, 255, 0});
+    }
+
+    void add_rotated_det(const float bbox[5], int label_id, float score) {
+      float xc = bbox[0];
+      float yc = bbox[1];
+      float w = bbox[2];
+      float h = bbox[3];
+      float ag = bbox[4];
+      float wx = w / 2 * std::cos(ag);
+      float wy = w / 2 * std::sin(ag);
+      float hx = -h / 2 * std::sin(ag);
+      float hy = h / 2 * std::cos(ag);
+      cv::Point2f p1{xc - wx - hx, yc - wy - hy};
+      cv::Point2f p2{xc + wx - hx, yc + wy - hy};
+      cv::Point2f p3{xc + wx + hx, yc + wy + hy};
+      cv::Point2f p4{xc - wx + hx, yc - wy + hy};
+      cv::drawContours(
+          img_,
+          std::vector<std::vector<cv::Point>>{{p1 * scale_, p2 * scale_, p3 * scale_, p4 * scale_}},
+          -1, {0, 255, 0}, 2);
+    }
+
+    // TODO: handle score output
+    void add_mask(int height, int width, int n_classes, const int* mask, const float* score) {
+      cv::Mat color_mask = cv::Mat::zeros(height, width, CV_8UC3);
+      int pos = 0;
+      for (auto iter = color_mask.begin<cv::Vec3b>(); iter != color_mask.end<cv::Vec3b>(); ++iter) {
+        *iter = v_.palette_[mask[pos++] % v_.palette_.size()];
+      }
+      if (color_mask.size() != img_.size()) {
+        cv::resize(color_mask, color_mask, img_.size(), 0., 0.);
+      }
+      img_ = img_ * 0.5 + color_mask * 0.5;
+    }
+
+    void add_pose(const mmdeploy_point_t* pts, const float* scores, int32_t pts_size, double thr) {
+      auto& skel = v_.skel_;
+      std::vector<int> used(pts_size);
+      std::vector<int> is_end_point(pts_size);
+      for (size_t i = 0; i < skel.links.size(); ++i) {
+        auto u = skel.links[i].first;
+        auto v = skel.links[i].second;
+        is_end_point[u] = is_end_point[v] = 1;
+        if (scores[u] > thr && scores[v] > thr) {
+          used[u] = used[v] = 1;
+          cv::Point2f p0(pts[u].x, pts[u].y);
+          cv::Point2f p1(pts[v].x, pts[v].y);
+          cv::line(img_, p0 * scale_, p1 * scale_, skel.palette[skel.link_colors[i]], 1,
+                   cv::LINE_AA);
+        }
+      }
+      for (size_t i = 0; i < pts_size; ++i) {
+        if (!is_end_point[i] && scores[i] > thr || used[i]) {
+          cv::Point2f p(pts[i].x, pts[i].y);
+          cv::circle(img_, p * scale_, 1, skel.palette[skel.point_colors[i]], 2, cv::LINE_AA);
+        }
+      }
+    }
+
+    cv::Mat get() { return img_; }
+
+   private:
+    Visualize& v_;
+    float scale_{1};
+    int offset_{};
+    cv::Mat img_;
+  };
+
+  explicit Visualize(int size) : size_(size) {}
+
+  Session get_session(const cv::Mat& frame) { return Session(*this, frame); }
+
+  void set_skeleton(const Skeleton& skel) { skel_ = skel; }
+
+  void set_palette(const std::vector<cv::Vec3b>& palette) { palette_ = palette; }
+
+ private:
+  friend Session;
+  Skeleton skel_;
+  std::vector<cv::Vec3b> palette_;
+  int size_{};
+};
+
+}  // namespace utils
+
+#endif  // MMDEPLOY_VISUALIZE_H

From 01e03d4b7f7e02f73bf332cd980814d866560b9b Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Mon, 6 Feb 2023 00:30:06 +0800
Subject: [PATCH 02/11] show text in image

---
 demo/csrc/cpp/classifier.cxx    | 11 ++++++
 demo/csrc/cpp/det_pose.cpp      |  2 --
 demo/csrc/cpp/detector.cxx      |  2 +-
 demo/csrc/cpp/text_ocr.cxx      |  4 +--
 demo/csrc/cpp/utils/mediaio.h   | 60 +++++++++++++++++++++++++++++++
 demo/csrc/cpp/utils/visualize.h | 63 ++++++++++++++++++++++-----------
 6 files changed, 117 insertions(+), 25 deletions(-)

diff --git a/demo/csrc/cpp/classifier.cxx b/demo/csrc/cpp/classifier.cxx
index a8976fa506..e3a095363c 100644
--- a/demo/csrc/cpp/classifier.cxx
+++ b/demo/csrc/cpp/classifier.cxx
@@ -5,10 +5,12 @@
 
 #include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "utils/argparse.h"
+#include "utils/visualize.h"
 
 DEFINE_ARG_string(model, "Model path");
 DEFINE_ARG_string(image, "Input image path");
 DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
+DEFINE_string(output, "classifier_output.jpg", "output path");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
@@ -23,8 +25,17 @@ int main(int argc, char* argv[]) {
   mmdeploy::Model model(ARGS_model);
   mmdeploy::Classifier classifier(model, mmdeploy::Device{FLAGS_device, 0});
 
+  utils::Visualize v;
+
   auto res = classifier.Apply(img);
 
+  auto sess = v.get_session(img);
+  for (const mmdeploy_classification_t& label : res) {
+    sess.add_label(label.label_id, label.score);
+  }
+
+  cv::imwrite(FLAGS_output, sess.get());
+
   for (const auto& cls : res) {
     fprintf(stderr, "label: %d, score: %.4f\n", cls.label_id, cls.score);
   }
diff --git a/demo/csrc/cpp/det_pose.cpp b/demo/csrc/cpp/det_pose.cpp
index 983a6c598a..2ef35b29e3 100644
--- a/demo/csrc/cpp/det_pose.cpp
+++ b/demo/csrc/cpp/det_pose.cpp
@@ -58,9 +58,7 @@ int main(int argc, char* argv[]) {
     // visualize
     auto sess = v.get_session(img);
     for (size_t i = 0; i < bboxes.size(); ++i) {
-      // draw bounding boxes
       sess.add_bbox(bboxes[i], -1, -1);
-      // draw pose key-points
       sess.add_pose(poses[i].point, poses[i].score, poses[i].length, FLAGS_pose_thr);
     }
 
diff --git a/demo/csrc/cpp/detector.cxx b/demo/csrc/cpp/detector.cxx
index f4f8407a83..2e9da4da47 100644
--- a/demo/csrc/cpp/detector.cxx
+++ b/demo/csrc/cpp/detector.cxx
@@ -11,7 +11,7 @@ DEFINE_ARG_string(input, "Path to input image, video, camera index or image list
 DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
 
 DEFINE_string(output, "detection_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_int32(output_size, 0, "Long-edge of output frames");
 DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
 
 DEFINE_double(det_thr, 0.5, "Detection score threshold");
diff --git a/demo/csrc/cpp/text_ocr.cxx b/demo/csrc/cpp/text_ocr.cxx
index b7c8358c58..ee24f3d535 100644
--- a/demo/csrc/cpp/text_ocr.cxx
+++ b/demo/csrc/cpp/text_ocr.cxx
@@ -12,8 +12,8 @@ DEFINE_ARG_string(reg_model, "Text recognition model path");
 DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
 DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
 
-DEFINE_string(output, "segmentation_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 1024, "Long-edge of output frames");
+DEFINE_string(output, "text_ocr_%04d.jpg", "Output image, video path, format string or SHOW");
+DEFINE_int32(output_size, 0, "Long-edge of output frames");
 DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
 
 using mmdeploy::TextDetector;
diff --git a/demo/csrc/cpp/utils/mediaio.h b/demo/csrc/cpp/utils/mediaio.h
index e9e47e2f0a..9e723e520c 100644
--- a/demo/csrc/cpp/utils/mediaio.h
+++ b/demo/csrc/cpp/utils/mediaio.h
@@ -90,6 +90,53 @@ class InputIterator {
   Input* input_{};
 };
 
+class BatchInputIterator {
+ public:
+  using iterator_category = std::input_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using reference = std::vector<cv::Mat>&;
+  using value_type = reference;
+  using pointer = void;
+
+ public:
+  BatchInputIterator() = default;
+  BatchInputIterator(InputIterator iter, InputIterator end, size_t batch_size)
+      : iter_(std::move(iter)), end_(std::move(end)), batch_size_(batch_size) {
+    next();
+  }
+
+  BatchInputIterator& operator++() {
+    next();
+    return *this;
+  }
+
+  reference operator*() { return data_; }
+
+  friend bool operator==(const BatchInputIterator& a, const BatchInputIterator& b) {
+    return &a == &b || a.is_end() == b.is_end();
+  }
+
+  friend bool operator!=(const BatchInputIterator& a, const BatchInputIterator& b) {
+    return !(a == b);
+  }
+
+ private:
+  void next() {
+    data_.clear();
+    for (size_t i = 0; i < batch_size_ && ++iter_ != end_; ++i) {
+      data_.push_back(*iter_);
+    }
+  }
+
+  bool is_end() const { return data_.empty(); }
+
+ private:
+  InputIterator iter_;
+  InputIterator end_;
+  size_t batch_size_{1};
+  std::vector<cv::Mat> data_;
+};
+
 class Input {
  public:
   explicit Input(const std::string& path, MediaType type = MediaType::kUnknown)
@@ -153,6 +200,19 @@ class Input {
     return img;
   }
 
+  class Batch {
+   public:
+    Batch(Input& input, size_t batch_size) : input_(&input), batch_size_(batch_size) {}
+    BatchInputIterator begin() { return {input_->begin(), input_->end(), batch_size_}; }
+    BatchInputIterator end() { return {}; }  // NOLINT
+
+   private:
+    Input* input_{};
+    size_t batch_size_{1};
+  };
+
+  Batch batch(size_t batch_size) { return {*this, batch_size}; }
+
  private:
   static bool try_image(const std::string& path) { return cv::imread(path).data; }
 
diff --git a/demo/csrc/cpp/utils/visualize.h b/demo/csrc/cpp/utils/visualize.h
index ab51ad5696..775d576081 100644
--- a/demo/csrc/cpp/utils/visualize.h
+++ b/demo/csrc/cpp/utils/visualize.h
@@ -28,24 +28,41 @@ class Visualize {
     }
 
     void add_label(int label_id, float score) {
+      offset_ +=
+          add_text(to_text(label_id, score), {1, (float)offset_}, .5f * (img_.rows + img_.cols)) +
+          2;
+    }
+
+    int add_text(const std::string& text, const cv::Point2f& origin, float size) {
       static constexpr const int font_face = cv::FONT_HERSHEY_SIMPLEX;
       static constexpr const int thickness = 1;
-      static constexpr const double font_scale = 0.5;
-      int baseline = 0;
+      static constexpr const auto max_font_scale = .5f;
+      static constexpr const auto min_font_scale = .25f;
+      float font_scale = 1;
+      if (size < 20) {
+        font_scale = min_font_scale;
+      } else if (size > 200) {
+        font_scale = max_font_scale;
+      } else {
+        font_scale = min_font_scale + (size - 20) / (200 - 20) * (max_font_scale - min_font_scale);
+      }
+      int baseline{};
+      auto text_size = cv::getTextSize(text, font_face, font_scale, thickness, &baseline);
+      cv::Rect rect(origin + cv::Point2f(0, text_size.height + 2 * thickness),
+                    origin + cv::Point2f(text_size.width, 0));
+      rect &= cv::Rect({}, img_.size());
+      img_(rect) *= .35f;
+      cv::putText(img_, text, origin + cv::Point2f(0, text_size.height), font_face, font_scale,
+                  cv::Scalar::all(255), thickness, cv::LINE_AA);
+      return text_size.height;
+    }
+
+    static std::string to_text(int label_id, float score) {
       std::stringstream ss;
-      ss << std::to_string(label_id) << ": " << std::setw(6) << std::fixed << score * 100;
-      auto text_size = cv::getTextSize(ss.str(), font_face, font_scale, thickness, &baseline);
-      offset_ += text_size.height;
-      cv::Point origin(0, offset_);
-      cv::rectangle(img_, origin + cv::Point(0, thickness),
-                    origin + cv::Point(text_size.width, -text_size.height), cv::Scalar{},
-                    cv::FILLED);
-      cv::putText(img_, ss.str(), origin, font_face, font_scale, cv::Scalar::all(255), thickness,
-                  cv::LINE_8);
-      offset_ += thickness;
+      ss << label_id << ": " << std::fixed << std::setprecision(1) << score * 100;
+      return ss.str();
     }
 
-    // TODO: show label & score
     template <typename Mask>
     void add_det(const mmdeploy_rect_t& rect, int label_id, float score, const Mask* mask) {
       add_bbox(rect, label_id, score);
@@ -64,19 +81,25 @@ class Visualize {
       }
     }
 
-    // TODO: show label & score
     void add_bbox(const mmdeploy_rect_t& rect, int label_id, float score) {
+      if (label_id >= 0 && score > 0) {
+        auto area = std::max(0.f, (rect.right - rect.left) * (rect.bottom - rect.top));
+        add_text(to_text(label_id, score), {rect.left, rect.top}, std::sqrt(area * scale_));
+      }
       cv::rectangle(img_, cv::Point2f(rect.left, rect.top), cv::Point2f(rect.right, rect.bottom),
                     cv::Scalar(0, 255, 0));
     }
 
-    // TODO: show text
     void add_text_det(mmdeploy_point_t bbox[4], float score, const char* text, size_t text_size) {
-      std::vector<cv::Point2f> poly_points;
+      std::vector<cv::Point> poly_points;
+      cv::Point2f center{};
       for (int i = 0; i < 4; ++i) {
         poly_points.emplace_back(bbox[i].x * scale_, bbox[i].y * scale_);
+        center += cv::Point2f(poly_points.back());
       }
-      cv::polylines(img_, poly_points, true, cv::Scalar{0, 255, 0});
+      cv::polylines(img_, poly_points, true, cv::Scalar{0, 255, 0}, 1, cv::LINE_AA);
+      auto area = cv::contourArea(poly_points);
+      add_text(std::string(text, text + text_size), center / 4, std::sqrt(area));
     }
 
     void add_rotated_det(const float bbox[5], int label_id, float score) {
@@ -109,7 +132,7 @@ class Visualize {
       if (color_mask.size() != img_.size()) {
         cv::resize(color_mask, color_mask, img_.size(), 0., 0.);
       }
-      img_ = img_ * 0.5 + color_mask * 0.5;
+      cv::addWeighted(img_, .5, color_mask, .5, 0., img_);
     }
 
     void add_pose(const mmdeploy_point_t* pts, const float* scores, int32_t pts_size, double thr) {
@@ -141,11 +164,11 @@ class Visualize {
    private:
     Visualize& v_;
     float scale_{1};
-    int offset_{};
+    int offset_{1};
     cv::Mat img_;
   };
 
-  explicit Visualize(int size) : size_(size) {}
+  explicit Visualize(int size = 0) : size_(size) {}
 
   Session get_session(const cv::Mat& frame) { return Session(*this, frame); }
 

From 9f3481320f74092fe5a69eb4f246fa6e08da1d72 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Mon, 6 Feb 2023 20:05:34 +0800
Subject: [PATCH 03/11] optimize demos

---
 .../mmseg/segmentation_onnxruntime_dynamic.py |   2 +
 .../codebase/mmpose/pose_tracker/utils.h      |   2 +-
 demo/csrc/cpp/classifier.cxx                  |  35 ++--
 demo/csrc/cpp/det_pose.cpp                    |  79 ++++-----
 demo/csrc/cpp/detector.cxx                    |  67 ++++----
 demo/csrc/cpp/pose_tracker.cpp                |  27 +++-
 ...cker_utils.hpp => pose_tracker_params.inc} |  24 +--
 demo/csrc/cpp/restorer.cxx                    |  43 ++---
 demo/csrc/cpp/rotated_detector.cxx            |  47 +++---
 demo/csrc/cpp/segmentor.cxx                   |  56 +++----
 demo/csrc/cpp/text_ocr.cxx                    |  52 +++---
 demo/csrc/cpp/utils/argparse.h                |  32 ++--
 demo/csrc/cpp/utils/mediaio.h                 |  22 +--
 demo/csrc/cpp/utils/palette.h                 |  93 +++++++++++
 demo/csrc/cpp/utils/skeleton.h                |   2 +-
 demo/csrc/cpp/utils/visualize.h               | 150 +++++++++++++-----
 demo/csrc/cpp/video_cls.cxx                   |  20 +--
 17 files changed, 455 insertions(+), 298 deletions(-)
 rename demo/csrc/cpp/{pose_tracker_utils.hpp => pose_tracker_params.inc} (71%)
 create mode 100644 demo/csrc/cpp/utils/palette.h

diff --git a/configs/mmseg/segmentation_onnxruntime_dynamic.py b/configs/mmseg/segmentation_onnxruntime_dynamic.py
index 5553ee373c..10979ff3fe 100644
--- a/configs/mmseg/segmentation_onnxruntime_dynamic.py
+++ b/configs/mmseg/segmentation_onnxruntime_dynamic.py
@@ -1 +1,3 @@
 _base_ = ['./segmentation_dynamic.py', '../_base_/backends/onnxruntime.py']
+
+codebase_config = dict(type='mmseg', task='Segmentation', with_argmax=False)
\ No newline at end of file
diff --git a/csrc/mmdeploy/codebase/mmpose/pose_tracker/utils.h b/csrc/mmdeploy/codebase/mmpose/pose_tracker/utils.h
index 3643c99387..676e87157d 100644
--- a/csrc/mmdeploy/codebase/mmpose/pose_tracker/utils.h
+++ b/csrc/mmdeploy/codebase/mmpose/pose_tracker/utils.h
@@ -22,7 +22,7 @@ using Points = vector<cv::Point2f>;
 using Score = float;
 using Scores = vector<float>;
 
-#define POSE_TRACKER_DEBUG(...) MMDEPLOY_INFO(__VA_ARGS__)
+#define POSE_TRACKER_DEBUG(...) MMDEPLOY_DEBUG(__VA_ARGS__)
 
 // opencv3 can't construct cv::Mat from std::array
 template <size_t N>
diff --git a/demo/csrc/cpp/classifier.cxx b/demo/csrc/cpp/classifier.cxx
index e3a095363c..a4a12eea16 100644
--- a/demo/csrc/cpp/classifier.cxx
+++ b/demo/csrc/cpp/classifier.cxx
@@ -1,43 +1,44 @@
 
 #include "mmdeploy/classifier.hpp"
 
-#include <string>
-
 #include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "utils/argparse.h"
 #include "utils/visualize.h"
 
 DEFINE_ARG_string(model, "Model path");
 DEFINE_ARG_string(image, "Input image path");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
-DEFINE_string(output, "classifier_output.jpg", "output path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "classifier_output.jpg", "Output image path");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
+
   cv::Mat img = cv::imread(ARGS_image);
-  if (!img.data) {
+  if (img.empty()) {
     fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
-    return 1;
+    return -1;
   }
 
-  mmdeploy::Model model(ARGS_model);
-  mmdeploy::Classifier classifier(model, mmdeploy::Device{FLAGS_device, 0});
-
-  utils::Visualize v;
+  // construct a classifier instance
+  mmdeploy::Classifier classifier(mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device});
 
-  auto res = classifier.Apply(img);
+  // apply the classifier; the result is an array-like class holding references to
+  // `mmdeploy_classification_t`, will be released automatically on destruction
+  mmdeploy::Classifier::Result result = classifier.Apply(img);
 
+  // visualize results
+  utils::Visualize v;
   auto sess = v.get_session(img);
-  for (const mmdeploy_classification_t& label : res) {
-    sess.add_label(label.label_id, label.score);
+  int count = 0;
+  for (const mmdeploy_classification_t& cls : result) {
+    sess.add_label(cls.label_id, cls.score, count++);
   }
 
-  cv::imwrite(FLAGS_output, sess.get());
-
-  for (const auto& cls : res) {
-    fprintf(stderr, "label: %d, score: %.4f\n", cls.label_id, cls.score);
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
   }
+
   return 0;
 }
diff --git a/demo/csrc/cpp/det_pose.cpp b/demo/csrc/cpp/det_pose.cpp
index 2ef35b29e3..b18fbe003b 100644
--- a/demo/csrc/cpp/det_pose.cpp
+++ b/demo/csrc/cpp/det_pose.cpp
@@ -4,22 +4,20 @@
 
 #include "mmdeploy/detector.hpp"
 #include "mmdeploy/pose_detector.hpp"
+#include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "utils/argparse.h"
-#include "utils/mediaio.h"
 #include "utils/visualize.h"
 
 DEFINE_ARG_string(det_model, "Object detection model path");
 DEFINE_ARG_string(pose_model, "Pose estimation model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
+DEFINE_ARG_string(image, "Input image path");
 
-DEFINE_string(device, "cpu", "Device name, e.g. \"cpu\", \"cuda\"");
-DEFINE_string(output, "det_pose_%04d.jpg",
-              "Output image, video path or format string; use `cv::imshow` if empty");
-DEFINE_int32(output_size, 1024, "ong-edge of output frames");
-DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "det_pose_output.jpg", "Output image path");
+DEFINE_string(skeleton, "coco", R"(Path to skeleton data or name of predefined skeletons: "coco")");
 
 DEFINE_int32(det_label, 0, "Detection label use for pose estimation");
-DEFINE_double(det_thr, 0.5, "Detection score threshold");
+DEFINE_double(det_thr, .5, "Detection score threshold");
 DEFINE_double(det_min_bbox_size, -1, "Detection minimum bbox size");
 
 DEFINE_double(pose_thr, 0, "Pose key-point threshold");
@@ -29,42 +27,49 @@ int main(int argc, char* argv[]) {
     return -1;
   }
 
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
-
-  utils::Visualize v(FLAGS_output_size);
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
 
   mmdeploy::Device device{FLAGS_device};
-  mmdeploy::Detector detector(mmdeploy::Model(ARGS_det_model), device);   // create object detector
-  mmdeploy::PoseDetector pose(mmdeploy::Model(ARGS_pose_model), device);  // create pose detector
-
-  for (const cv::Mat& img : input) {
-    // apply detector
-    mmdeploy::Detector::Result dets = detector.Apply(img);
-
-    // filter detections and extract bboxes for pose model
-    std::vector<mmdeploy_rect_t> bboxes;
-    for (const mmdeploy_detection_t& det : dets) {
-      if (det.label_id == FLAGS_det_label && det.score > FLAGS_det_thr) {
-        bboxes.push_back(det.bbox);
-      }
+  // create object detector
+  mmdeploy::Detector detector(mmdeploy::Model(ARGS_det_model), device);
+  // create pose detector
+  mmdeploy::PoseDetector pose(mmdeploy::Model(ARGS_pose_model), device);
+
+  // apply the detector, the result is an array-like class holding references to
+  // `mmdeploy_detection_t`, will be released automatically on destruction
+  mmdeploy::Detector::Result dets = detector.Apply(img);
+
+  // filter detections and extract bboxes for pose model
+  std::vector<mmdeploy_rect_t> bboxes;
+  for (const mmdeploy_detection_t& det : dets) {
+    if (det.label_id == FLAGS_det_label && det.score > FLAGS_det_thr) {
+      bboxes.push_back(det.bbox);
     }
+  }
 
-    // apply pose detector
-    mmdeploy::PoseDetector::Result poses = pose.Apply(img, bboxes);
+  // apply pose detector, if no bboxes are provided, full image will be used; the result is an
+  // array-like class holding references to `mmdeploy_pose_detection_t`, will be released
+  // automatically on destruction
+  mmdeploy::PoseDetector::Result poses = pose.Apply(img, bboxes);
 
-    assert(bboxes.size() == poses.size());
+  assert(bboxes.size() == poses.size());
 
-    // visualize
-    auto sess = v.get_session(img);
-    for (size_t i = 0; i < bboxes.size(); ++i) {
-      sess.add_bbox(bboxes[i], -1, -1);
-      sess.add_pose(poses[i].point, poses[i].score, poses[i].length, FLAGS_pose_thr);
-    }
+  // visualize results
+  utils::Visualize v;
+  v.set_skeleton(utils::Skeleton::get(FLAGS_skeleton));
+  auto sess = v.get_session(img);
+  for (size_t i = 0; i < bboxes.size(); ++i) {
+    sess.add_bbox(bboxes[i], -1, -1);
+    sess.add_pose(poses[i].point, poses[i].score, poses[i].length, FLAGS_pose_thr);
+  }
 
-    if (!output.write(sess.get())) {
-      break;
-    }
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
   }
+
   return 0;
 }
diff --git a/demo/csrc/cpp/detector.cxx b/demo/csrc/cpp/detector.cxx
index 2e9da4da47..e5b9f58183 100644
--- a/demo/csrc/cpp/detector.cxx
+++ b/demo/csrc/cpp/detector.cxx
@@ -1,56 +1,47 @@
 #include "mmdeploy/detector.hpp"
 
-#include <string>
-
+#include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "utils/argparse.h"
-#include "utils/mediaio.h"
 #include "utils/visualize.h"
 
-DEFINE_ARG_string(model, "Object detection model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
-
-DEFINE_string(output, "detection_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 0, "Long-edge of output frames");
-DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+DEFINE_ARG_string(model, "Model path");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "detector_output.jpg", "Output image path");
 
-DEFINE_double(det_thr, 0.5, "Detection score threshold");
+DEFINE_double(det_thr, .5, "Detection score threshold");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
 
-  // utils for handling input/output of images/videos
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
-  // util for visualization
-  utils::Visualize v(FLAGS_output_size);
-
-  /// ! construct a detector instance
-  mmdeploy::Detector detector(mmdeploy::Model(ARGS_model), mmdeploy::Device{FLAGS_device, 0});
-
-  for (const cv::Mat& img : input) {
-    /// ! apply detector on the image
-    mmdeploy::Detector::Result dets = detector.Apply(img);
-
-    // visualize detection results
-    auto sess = v.get_session(img);
-    for (const mmdeploy_detection_t& det : dets) {
-      auto& bbox = det.bbox;
-      auto bbox_w = bbox.right - bbox.left;
-      auto bbox_h = bbox.bottom - bbox.top;
-      if (bbox_w > 1 && bbox_h > 1 && det.score > FLAGS_det_thr) {  // filter bboxes
-        sess.add_det(det.bbox, det.label_id, det.score, det.mask);
-      }
-    }
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
 
-    // write visualization to output
-    if (!output.write(sess.get())) {
-      // user request exit
-      break;
+  // construct a detector instance
+  mmdeploy::Detector detector(mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device});
+
+  // apply the detector, the result is an array-like class holding references to
+  // `mmdeploy_detection_t`, will be released automatically on destruction
+  mmdeploy::Detector::Result dets = detector.Apply(img);
+
+  // visualize
+  utils::Visualize v;
+  auto sess = v.get_session(img);
+  int count = 0;
+  for (const mmdeploy_detection_t& det : dets) {
+    if (det.score > FLAGS_det_thr) {  // filter bboxes
+      sess.add_det(det.bbox, det.label_id, det.score, det.mask, count++);
     }
   }
 
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
+  }
+
   return 0;
 }
diff --git a/demo/csrc/cpp/pose_tracker.cpp b/demo/csrc/cpp/pose_tracker.cpp
index 5f4ea14b5c..aa50d0c4f4 100644
--- a/demo/csrc/cpp/pose_tracker.cpp
+++ b/demo/csrc/cpp/pose_tracker.cpp
@@ -2,10 +2,26 @@
 
 #include "mmdeploy/pose_tracker.hpp"
 
-#include "pose_tracker_utils.hpp"
+#include "utils/argparse.h"
 #include "utils/mediaio.h"
 #include "utils/visualize.h"
 
+DEFINE_ARG_string(det_model, "Object detection model path");
+DEFINE_ARG_string(pose_model, "Pose estimation model path");
+DEFINE_ARG_string(input, "Input video path or camera index");
+
+DEFINE_string(device, "cpu", "Device name, e.g. \"cpu\", \"cuda\"");
+DEFINE_string(output, "", "Output video path or format string");
+
+DEFINE_int32(output_size, 0, "Long-edge of output frames");
+DEFINE_int32(show, 1, "Delay passed to `cv::waitKey` when using `cv::imshow`; -1: disable");
+
+DEFINE_string(skeleton, "coco", R"(Path to skeleton data or name of predefined skeletons: "coco")");
+DEFINE_string(background, "default",
+              R"(Output background, "default": original image, "black": black background)");
+
+#include "pose_tracker_params.inc"
+
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
@@ -23,16 +39,19 @@ int main(int argc, char* argv[]) {
   mmdeploy::PoseTracker::State state = tracker.CreateState(params);
 
   utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
+  utils::mediaio::Output output(FLAGS_output, FLAGS_show);
 
   utils::Visualize v(FLAGS_output_size);
+  v.set_background(FLAGS_background);
   v.set_skeleton(utils::Skeleton::get(FLAGS_skeleton));
 
   for (const cv::Mat& frame : input) {
-    // apply the pipeline with the tracker state and video frame
+    // apply the pipeline with the tracker state and video frame; the result is an array-like class
+    // holding references to `mmdeploy_pose_tracker_target_t`, will be released automatically on
+    // destruction
     mmdeploy::PoseTracker::Result result = tracker.Apply(state, frame);
 
-    // visualize the results
+    // visualize results
     auto sess = v.get_session(frame);
     for (const mmdeploy_pose_tracker_target_t& target : result) {
       sess.add_pose(target.keypoints, target.scores, target.keypoint_count, FLAGS_pose_kpt_thr);
diff --git a/demo/csrc/cpp/pose_tracker_utils.hpp b/demo/csrc/cpp/pose_tracker_params.inc
similarity index 71%
rename from demo/csrc/cpp/pose_tracker_utils.hpp
rename to demo/csrc/cpp/pose_tracker_params.inc
index b2163ab2a8..e21389eb81 100644
--- a/demo/csrc/cpp/pose_tracker_utils.hpp
+++ b/demo/csrc/cpp/pose_tracker_params.inc
@@ -1,25 +1,5 @@
 // Copyright (c) OpenMMLab. All rights reserved.
 
-#ifndef MMDEPLOY_POSE_TRACKER_UTILS_HPP
-#define MMDEPLOY_POSE_TRACKER_UTILS_HPP
-
-#include <iostream>
-#include <map>
-
-#include "mmdeploy/pose_tracker.hpp"
-#include "utils/argparse.h"
-
-DEFINE_ARG_string(det_model, "object detection model path");
-DEFINE_ARG_string(pose_model, "pose estimation model path");
-DEFINE_ARG_string(input, "input video path or camera index");
-
-DEFINE_string(device, "cpu", "Device name, e.g. \"cpu\", \"cuda\"");
-DEFINE_string(output, "", "Output video path or format string; use `cv::imshow` if empty");
-DEFINE_int32(output_size, 0, "Long-edge of output frames");
-DEFINE_int32(delay, 1, "Delay passed to `cv::waitKey` when using `cv::imshow`");
-
-DEFINE_string(skeleton, "coco", "Path to skeleton data");
-
 DEFINE_int32(det_interval, 1, "Detection interval");
 DEFINE_int32(det_label, 0, "Detection label use for pose estimation");
 DEFINE_double(det_thr, 0.5, "Detection score threshold");
@@ -56,6 +36,4 @@ void InitTrackerParams(mmdeploy::PoseTracker::Params& params) {
   params->pose_nms_thr = FLAGS_pose_nms_thr;
   params->track_iou_thr = FLAGS_track_iou_thr;
   params->track_max_missing = FLAGS_track_max_missing;
-}
-
-#endif  // MMDEPLOY_POSE_TRACKER_UTILS_HPP
+}
\ No newline at end of file
diff --git a/demo/csrc/cpp/restorer.cxx b/demo/csrc/cpp/restorer.cxx
index 62612834ea..378294fed4 100644
--- a/demo/csrc/cpp/restorer.cxx
+++ b/demo/csrc/cpp/restorer.cxx
@@ -2,47 +2,38 @@
 
 #include "mmdeploy/restorer.hpp"
 
+#include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
 #include "utils/argparse.h"
-#include "utils/mediaio.h"
-#include "utils/visualize.h"
 
 DEFINE_ARG_string(model, "Super-resolution model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
-
-DEFINE_string(output, "upsampled_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 0, "Long-edge of output frames");
-DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "restorer_output.jpg", "Output image path");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
 
-  // utils for handling input/output of images/videos
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
-
-  // util for visualization
-  utils::Visualize v(FLAGS_output_size);
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
 
-  /// ! construct a restorer instance
+  // construct a restorer instance
   mmdeploy::Restorer restorer{mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}};
 
-  for (const cv::Mat& img : input) {
-    /// ! apply restorer to the image
-    mmdeploy::Restorer::Result result = restorer.Apply(img);
+  // apply restorer to the image
+  mmdeploy::Restorer::Result result = restorer.Apply(img);
 
-    // convert to BGR
-    cv::Mat upsampled(result->height, result->width, CV_8UC3, result->data);
-    cv::cvtColor(upsampled, upsampled, cv::COLOR_RGB2BGR);
+  // convert to BGR
+  cv::Mat upsampled(result->height, result->width, CV_8UC3, result->data);
+  cv::cvtColor(upsampled, upsampled, cv::COLOR_RGB2BGR);
 
-    // optionally rescale to output size & write to output
-    if (!output.write(v.get_session(upsampled).get())) {
-      // user request exit
-      break;
-    }
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, upsampled);
   }
 
   return 0;
diff --git a/demo/csrc/cpp/rotated_detector.cxx b/demo/csrc/cpp/rotated_detector.cxx
index 9fffe48214..3ddb5d4c1c 100644
--- a/demo/csrc/cpp/rotated_detector.cxx
+++ b/demo/csrc/cpp/rotated_detector.cxx
@@ -2,41 +2,44 @@
 #include "mmdeploy/rotated_detector.hpp"
 
 #include "utils/argparse.h"
-#include "utils/mediaio.h"
 #include "utils/visualize.h"
 
-DEFINE_ARG_string(model, "Detection model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
+DEFINE_ARG_string(model, "Model path");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "rotated_detector_output.jpg", "Output image path");
 
-DEFINE_string(output, "detector_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 1024, "Long-edge of output frames");
-DEFINE_int32(delay, 0, "timeout for user input when showing images");
-
-DEFINE_double(det_thr, 0.5, "Detection score threshold");
+DEFINE_double(det_thr, 0.1, "Detection score threshold");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
 
-  mmdeploy::Model model(ARGS_model);
-  mmdeploy::RotatedDetector detector(model, mmdeploy::Device{FLAGS_device, 0});
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
 
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
+  // construct a detector instance
+  mmdeploy::RotatedDetector detector(mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device});
 
-  utils::Visualize v(FLAGS_output_size);
+  // apply the detector, the result is an array-like class holding references to
+  // `mmdeploy_rotated_detection_t`, will be released automatically on destruction
+  mmdeploy::RotatedDetector::Result dets = detector.Apply(img);
 
-  for (const cv::Mat& img : input) {
-    mmdeploy::RotatedDetector::Result dets = detector.Apply(img);
-    auto sess = v.get_session(img);
-    for (const mmdeploy_rotated_detection_t& det : dets) {
-      if (det.score > FLAGS_det_thr) {
-        sess.add_rotated_det(det.rbbox, det.label_id, det.score);
-      }
+  // visualize results
+  utils::Visualize v;
+  auto sess = v.get_session(img);
+  for (const mmdeploy_rotated_detection_t& det : dets) {
+    if (det.score > FLAGS_det_thr) {
+      sess.add_rotated_det(det.rbbox, det.label_id, det.score);
     }
-    output.write(sess.get());
+  }
+
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
   }
 
   return 0;
diff --git a/demo/csrc/cpp/segmentor.cxx b/demo/csrc/cpp/segmentor.cxx
index 2f56093152..55ba41db67 100644
--- a/demo/csrc/cpp/segmentor.cxx
+++ b/demo/csrc/cpp/segmentor.cxx
@@ -2,7 +2,6 @@
 
 #include "mmdeploy/segmentor.hpp"
 
-#include <random>
 #include <string>
 #include <vector>
 
@@ -10,50 +9,39 @@
 #include "utils/mediaio.h"
 #include "utils/visualize.h"
 
-DEFINE_ARG_string(model, "Object detection model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
-
-DEFINE_string(output, "segmentation_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 1024, "Long-edge of output frames");
-DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
-
-std::vector<cv::Vec3b> gen_palette(int num_classes = 256);
+DEFINE_ARG_string(model, "Model path");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "segmentor_output.jpg", "Output image path");
+DEFINE_string(palette, "cityscapes",
+              R"(Path to palette data or name of predefined palettes: "cityscapes")");
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
     return -1;
   }
 
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
+
   mmdeploy::Segmentor segmentor{mmdeploy::Model{ARGS_model}, mmdeploy::Device{FLAGS_device}};
 
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
+  // apply the detector, the result is an array-like class holding a reference to
+  // `mmdeploy_segmentation_t`, will be released automatically on destruction
+  mmdeploy::Segmentor::Result seg = segmentor.Apply(img);
 
-  utils::Visualize v(FLAGS_output_size);
-  v.set_palette(gen_palette());
+  // visualize
+  utils::Visualize v;
+  v.set_palette(utils::Palette::get(FLAGS_palette));
+  auto sess = v.get_session(img);
+  sess.add_mask(seg->height, seg->width, seg->classes, seg->mask, seg->score);
 
-  for (const cv::Mat& img : input) {
-    mmdeploy::Segmentor::Result result = segmentor.Apply(img);
-    auto sess = v.get_session(img);
-    mmdeploy_segmentation_t& seg = *result;
-    sess.add_mask(seg.height, seg.width, seg.classes, seg.mask, nullptr);
-    if (!output.write(sess.get())) {
-      break;
-    }
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
   }
 
   return 0;
 }
-
-std::vector<cv::Vec3b> gen_palette(int num_classes) {
-  std::mt19937 gen{};  // NOLINT
-  std::uniform_int_distribution<ushort> uniform_dist(0, 255);
-
-  std::vector<cv::Vec3b> palette;
-  palette.reserve(num_classes);
-  for (auto i = 0; i < num_classes; ++i) {
-    palette.emplace_back(uniform_dist(gen), uniform_dist(gen), uniform_dist(gen));
-  }
-  return palette;
-}
diff --git a/demo/csrc/cpp/text_ocr.cxx b/demo/csrc/cpp/text_ocr.cxx
index ee24f3d535..6c8fdb055b 100644
--- a/demo/csrc/cpp/text_ocr.cxx
+++ b/demo/csrc/cpp/text_ocr.cxx
@@ -9,12 +9,9 @@
 
 DEFINE_ARG_string(det_model, "Text detection model path");
 DEFINE_ARG_string(reg_model, "Text recognition model path");
-DEFINE_ARG_string(input, "Path to input image, video, camera index or image list (.txt)");
-DEFINE_string(device, "cpu", "Device name, e.g. cpu, cuda");
-
-DEFINE_string(output, "text_ocr_%04d.jpg", "Output image, video path, format string or SHOW");
-DEFINE_int32(output_size, 0, "Long-edge of output frames");
-DEFINE_int32(delay, 0, "Delay passed to `cv::waitKey` when using `cv::imshow`");
+DEFINE_ARG_string(image, "Input image path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+DEFINE_string(output, "text_ocr_output.jpg", "Output image path");
 
 using mmdeploy::TextDetector;
 using mmdeploy::TextRecognizer;
@@ -24,27 +21,36 @@ int main(int argc, char* argv[]) {
     return -1;
   }
 
+  cv::Mat img = cv::imread(ARGS_image);
+  if (img.empty()) {
+    fprintf(stderr, "failed to load image: %s\n", ARGS_image.c_str());
+    return -1;
+  }
+
   mmdeploy::Device device(FLAGS_device);
   TextDetector detector{mmdeploy::Model(ARGS_det_model), device};
   TextRecognizer recognizer{mmdeploy::Model(ARGS_reg_model), device};
 
-  utils::mediaio::Input input(ARGS_input);
-  utils::mediaio::Output output(FLAGS_output, FLAGS_delay);
-
-  utils::Visualize v(FLAGS_output_size);
-
-  for (const cv::Mat& img : input) {
-    TextDetector::Result bboxes = detector.Apply(img);
-    TextRecognizer::Result texts = recognizer.Apply(img, {bboxes.begin(), bboxes.size()});
-    auto sess = v.get_session(img);
-    for (size_t i = 0; i < bboxes.size(); ++i) {
-      mmdeploy_text_detection_t& bbox = bboxes[i];
-      mmdeploy_text_recognition_t& text = texts[i];
-      sess.add_text_det(bbox.bbox, bbox.score, text.text, text.length);
-    }
-    if (!output.write(sess.get())) {
-      break;
-    }
+  // apply the detector, the result is an array-like class holding references to
+  // `mmdeploy_text_detection_t`, will be released automatically on destruction
+  TextDetector::Result bboxes = detector.Apply(img);
+
+  // apply recognizer, if no bboxes are provided, full image will be used; the result is an
+  // array-like class holding references to `mmdeploy_text_recognition_t`, will be released
+  // automatically on destruction
+  TextRecognizer::Result texts = recognizer.Apply(img, {bboxes.begin(), bboxes.size()});
+
+  // visualize results
+  utils::Visualize v;
+  auto sess = v.get_session(img);
+  for (size_t i = 0; i < bboxes.size(); ++i) {
+    mmdeploy_text_detection_t& bbox = bboxes[i];
+    mmdeploy_text_recognition_t& text = texts[i];
+    sess.add_text_det(bbox.bbox, bbox.score, text.text, text.length, i);
+  }
+
+  if (!FLAGS_output.empty()) {
+    cv::imwrite(FLAGS_output, sess.get());
   }
 
   return 0;
diff --git a/demo/csrc/cpp/utils/argparse.h b/demo/csrc/cpp/utils/argparse.h
index be9e9dff3b..37cee6656e 100644
--- a/demo/csrc/cpp/utils/argparse.h
+++ b/demo/csrc/cpp/utils/argparse.h
@@ -73,22 +73,36 @@ class ArgParse {
 
   bool _Parse(int argc, char* argv[]) {
     int arg_idx{-1};
-    std::vector<const char*> args(infos_.size());
+    std::vector<std::string> args(infos_.size());
     for (int i = 1; i < argc; ++i) {
       if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
         return false;
       }
       if (argv[i][0] == '-' && argv[i][1] == '-') {
+        // parse flag key-value pair (--x=y or --x y)
+        int eq{-1};
+        for (int k = 2; argv[i][k]; ++k) {
+          if (argv[i][k] == '=') {
+            eq = k;
+            break;
+          }
+        }
+        std::string key;
+        std::string val;
+        if (eq >= 0) {
+          key = std::string(argv[i] + 2, argv[i] + eq);
+          val = std::string(argv[i] + eq + 1);
+        } else {
+          key = std::string(argv[i] + 2);
+          if (i < argc - 1) {
+            val = argv[++i];
+          }
+        }
         bool found{};
         for (int j = 0; j < infos_.size(); ++j) {
           auto& flag = infos_[j];
-          auto name = "--" + flag.name;
-          if (strncmp(argv[i], name.c_str(), name.size()) == 0) {
-            if (argv[i][name.size()] == '=') {
-              args[j] = argv[i] + name.size() + 1;
-            } else if (i + 1 < argc) {
-              args[j] = argv[++i];
-            }
+          if (key == flag.name) {
+            args[j] = val;
             found = true;
             break;
           }
@@ -128,7 +142,7 @@ class ArgParse {
     }
 
     for (int i = 0; i < infos_.size(); ++i) {
-      if (args[i]) {
+      if (!args[i].empty()) {
         try {
           parse_str(infos_[i], args[i]);
         } catch (...) {
diff --git a/demo/csrc/cpp/utils/mediaio.h b/demo/csrc/cpp/utils/mediaio.h
index 9e723e520c..3b61ea06ee 100644
--- a/demo/csrc/cpp/utils/mediaio.h
+++ b/demo/csrc/cpp/utils/mediaio.h
@@ -13,7 +13,7 @@
 namespace utils {
 namespace mediaio {
 
-enum class MediaType { kUnknown, kImage, kVideo, kImageList, kWebcam, kFmtStr, kShow };
+enum class MediaType { kUnknown, kImage, kVideo, kImageList, kWebcam, kFmtStr, kDisable };
 
 namespace detail {
 
@@ -292,12 +292,12 @@ class OutputIterator {
 
 class Output {
  public:
-  explicit Output(const std::string& path, int delay, MediaType type = MediaType::kUnknown)
-      : path_(path), type_(type), delay_(delay) {
+  explicit Output(const std::string& path, int show, MediaType type = MediaType::kUnknown)
+      : path_(path), type_(type), show_(show) {
     if (type_ == MediaType::kUnknown) {
       auto ext = detail::get_extension(path);
       if (path_.empty()) {
-        type_ = MediaType::kShow;
+        type_ = MediaType::kDisable;
       } else if (detail::is_image(ext)) {
         if (detail::is_fmtstr(path)) {
           type_ = MediaType::kFmtStr;
@@ -315,6 +315,8 @@ class Output {
   bool write(const cv::Mat& frame) {
     bool exit = false;
     switch (type_) {
+      case MediaType::kDisable:
+        break;
       case MediaType::kImage:
         cv::imwrite(path_, frame);
         break;
@@ -327,14 +329,14 @@ class Output {
       case MediaType::kVideo:
         write_video(frame);
         break;
-      case MediaType::kShow:
-        cv::imshow("", frame);
-        exit = cv::waitKey(delay_) == 'q';
-        break;
       default:
         std::cout << "unsupported output media type\n";
         assert(0);
     }
+    if (show_ >= 0) {
+      cv::imshow("", frame);
+      exit = cv::waitKey(show_) == 'q';
+    }
     ++frame_id_;
     return !exit;
   }
@@ -344,7 +346,7 @@ class Output {
  private:
   void write_video(const cv::Mat& frame) {
     if (!video_.isOpened()) {
-      video_.open(path_, cv::VideoWriter::fourcc('M', 'P', '4', 'V'), 30, frame.size());
+      video_.open(path_, cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30, frame.size());
     }
     video_ << frame;
   }
@@ -352,7 +354,7 @@ class Output {
  private:
   std::string path_;
   MediaType type_{MediaType::kUnknown};
-  int delay_{1};
+  int show_{1};
   size_t frame_id_{0};
   cv::VideoWriter video_;
 };
diff --git a/demo/csrc/cpp/utils/palette.h b/demo/csrc/cpp/utils/palette.h
new file mode 100644
index 0000000000..007bd640d8
--- /dev/null
+++ b/demo/csrc/cpp/utils/palette.h
@@ -0,0 +1,93 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+
+#ifndef MMDEPLOY_PALETTE_H
+#define MMDEPLOY_PALETTE_H
+
+#include <fstream>
+#include <opencv2/core/core.hpp>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace utils {
+
+struct Palette {
+  std::vector<cv::Vec3b> data;
+  static Palette get(const std::string& path);
+  static Palette get(int n);
+};
+
+inline Palette Palette::get(const std::string& path) {
+  if (path == "coco") {
+    Palette p{{
+        {220, 20, 60},   {119, 11, 32},   {0, 0, 142},     {0, 0, 230},     {106, 0, 228},
+        {0, 60, 100},    {0, 80, 100},    {0, 0, 70},      {0, 0, 192},     {250, 170, 30},
+        {100, 170, 30},  {220, 220, 0},   {175, 116, 175}, {250, 0, 30},    {165, 42, 42},
+        {255, 77, 255},  {0, 226, 252},   {182, 182, 255}, {0, 82, 0},      {120, 166, 157},
+        {110, 76, 0},    {174, 57, 255},  {199, 100, 0},   {72, 0, 118},    {255, 179, 240},
+        {0, 125, 92},    {209, 0, 151},   {188, 208, 182}, {0, 220, 176},   {255, 99, 164},
+        {92, 0, 73},     {133, 129, 255}, {78, 180, 255},  {0, 228, 0},     {174, 255, 243},
+        {45, 89, 255},   {134, 134, 103}, {145, 148, 174}, {255, 208, 186}, {197, 226, 255},
+        {171, 134, 1},   {109, 63, 54},   {207, 138, 255}, {151, 0, 95},    {9, 80, 61},
+        {84, 105, 51},   {74, 65, 105},   {166, 196, 102}, {208, 195, 210}, {255, 109, 65},
+        {0, 143, 149},   {179, 0, 194},   {209, 99, 106},  {5, 121, 0},     {227, 255, 205},
+        {147, 186, 208}, {153, 69, 1},    {3, 95, 161},    {163, 255, 0},   {119, 0, 170},
+        {0, 182, 199},   {0, 165, 120},   {183, 130, 88},  {95, 32, 0},     {130, 114, 135},
+        {110, 129, 133}, {166, 74, 118},  {219, 142, 185}, {79, 210, 114},  {178, 90, 62},
+        {65, 70, 15},    {127, 167, 115}, {59, 105, 106},  {142, 108, 45},  {196, 172, 0},
+        {95, 54, 80},    {128, 76, 255},  {201, 57, 1},    {246, 0, 122},   {191, 162, 208},
+    }};
+    for (auto& x : p.data) {
+      std::swap(x[0], x[2]);
+    }
+    return p;
+  } else if (path == "cityscapes") {
+    Palette p{{
+        {128, 64, 128},  {244, 35, 232}, {70, 70, 70},  {102, 102, 156}, {190, 153, 153},
+        {153, 153, 153}, {250, 170, 30}, {220, 220, 0}, {107, 142, 35},  {152, 251, 152},
+        {70, 130, 180},  {220, 20, 60},  {255, 0, 0},   {0, 0, 142},     {0, 0, 70},
+        {0, 60, 100},    {0, 80, 100},   {0, 0, 230},   {119, 11, 32},
+    }};
+    for (auto& x : p.data) {
+      std::swap(x[0], x[2]);
+    }
+    return p;
+  }
+  std::ifstream ifs(path);
+  if (ifs.is_open()) {
+    assert(0 && "Failed to open palette file.");
+  }
+  Palette p;
+  int n = 0;
+  ifs >> n;
+  for (int i = 0; i < n; ++i) {
+    cv::Vec3b x{};
+    ifs >> x[0] >> x[1] >> x[2];
+    p.data.push_back(x);
+  }
+  return p;
+}
+
+inline Palette Palette::get(int n) {
+  std::vector<cv::Point3f> samples(n * 100);
+  std::vector<int> indices(samples.size());
+  std::iota(indices.begin(), indices.end(), 0);
+  std::mt19937 gen;  // NOLINT
+  std::uniform_int_distribution<ushort> uniform_dist(0, 255);
+  for (auto& x : samples) {
+    x = {(float)uniform_dist(gen), (float)uniform_dist(gen), (float)uniform_dist(gen)};
+  }
+  std::vector<cv::Point3f> centers;
+  cv::kmeans(samples, n, indices, cv::TermCriteria(cv::TermCriteria::Type::COUNT, 10, 0), 1,
+             cv::KMEANS_PP_CENTERS, centers);
+  Palette p;
+  for (const auto& c : centers) {
+    p.data.emplace_back((uchar)c.x, (uchar)c.y, (uchar)c.z);
+  }
+  return p;
+}
+
+}  // namespace utils
+
+#endif  // MMDEPLOY_PALETTE_H
diff --git a/demo/csrc/cpp/utils/skeleton.h b/demo/csrc/cpp/utils/skeleton.h
index 650b9089c8..f0b49dfe60 100644
--- a/demo/csrc/cpp/utils/skeleton.h
+++ b/demo/csrc/cpp/utils/skeleton.h
@@ -52,7 +52,7 @@ inline Skeleton Skeleton::get(const std::string& path) {
   }
   std::ifstream ifs(path);
   if (!ifs.is_open()) {
-    assert(0 && "Failed to skeleton file");
+    assert(0 && "Failed to open skeleton file");
   }
   Skeleton skel;
   int n = 0;
diff --git a/demo/csrc/cpp/utils/visualize.h b/demo/csrc/cpp/utils/visualize.h
index 775d576081..5d0323a0ee 100644
--- a/demo/csrc/cpp/utils/visualize.h
+++ b/demo/csrc/cpp/utils/visualize.h
@@ -3,11 +3,14 @@
 #ifndef MMDEPLOY_VISUALIZE_H
 #define MMDEPLOY_VISUALIZE_H
 
+#include <algorithm>
 #include <iomanip>
+#include <numeric>
 #include <vector>
 
 #include "opencv2/highgui/highgui.hpp"
 #include "opencv2/imgproc/imgproc.hpp"
+#include "palette.h"
 #include "skeleton.h"
 
 namespace utils {
@@ -20,17 +23,27 @@ class Visualize {
       if (v_.size_) {
         scale_ = (float)v_.size_ / (float)std::max(frame.cols, frame.rows);
       }
-      if (scale_ != 1) {
-        cv::resize(frame, img_, {}, scale_, scale_);
+      cv::Mat img;
+      if (v.background_ == "black") {
+        img = cv::Mat::zeros(frame.size(), CV_8UC3);
       } else {
-        img_ = frame.clone();
+        img = frame;
+        if (img.channels() == 1) {
+          cv::cvtColor(img, img, cv::COLOR_GRAY2BGR);
+        }
       }
+      if (scale_ != 1) {
+        cv::resize(img, img, {}, scale_, scale_);
+      } else if (img.data == frame.data) {
+        img = img.clone();
+      }
+      img_ = std::move(img);
     }
 
-    void add_label(int label_id, float score) {
-      offset_ +=
-          add_text(to_text(label_id, score), {1, (float)offset_}, .5f * (img_.rows + img_.cols)) +
-          2;
+    void add_label(int label_id, float score, int index) {
+      printf("label: %d, label_id: %d, score: %.4f\n", index, label_id, score);
+      auto size = .5f * static_cast<float>(img_.rows + img_.cols);
+      offset_ += add_text(to_text(label_id, score), {1, (float)offset_}, size) + 2;
     }
 
     int add_text(const std::string& text, const cv::Point2f& origin, float size) {
@@ -38,7 +51,7 @@ class Visualize {
       static constexpr const int thickness = 1;
       static constexpr const auto max_font_scale = .5f;
       static constexpr const auto min_font_scale = .25f;
-      float font_scale = 1;
+      float font_scale{};
       if (size < 20) {
         font_scale = min_font_scale;
       } else if (size > 200) {
@@ -64,33 +77,54 @@ class Visualize {
     }
 
     template <typename Mask>
-    void add_det(const mmdeploy_rect_t& rect, int label_id, float score, const Mask* mask) {
-      add_bbox(rect, label_id, score);
+    void add_det(const mmdeploy_rect_t& rect, int label_id, float score, const Mask* mask,
+                 int index) {
+      printf("bbox %d, left=%.2f, top=%.2f, right=%.2f, bottom=%.2f, label=%d, score=%.4f\n", index,
+             rect.left, rect.top, rect.right, rect.bottom, label_id, score);
       if (mask) {
-        cv::Mat mask_img(mask->height, mask->width, CV_8UC1, const_cast<char*>(mask->data));
-        auto x0 = std::max(std::floor(rect.left) - 1, 0.f);
-        auto y0 = std::max(std::floor(rect.top) - 1, 0.f);
-        cv::Rect roi((int)x0, (int)y0, mask->width, mask->height);
-
-        // split the RGB channels, overlay mask to a specific color channel
-        cv::Mat ch[3]{};
-        cv::split(img_, ch);
-        int col = 0;  // int col = i % 3;
-        cv::bitwise_or(mask_img, ch[col](roi), ch[col](roi));
-        merge(ch, 3, img_);
+        fprintf(stdout, "mask %d, height=%d, width=%d\n", index, mask->height, mask->width);
+        auto x0 = (int)std::max(std::floor(rect.left) - 1, 0.f);
+        auto y0 = (int)std::max(std::floor(rect.top) - 1, 0.f);
+        add_instance_mask({x0, y0}, rand(), mask->data, mask->height, mask->width);
       }
+      add_bbox(rect, label_id, score);
     }
 
-    void add_bbox(const mmdeploy_rect_t& rect, int label_id, float score) {
+    void add_instance_mask(const cv::Point& origin, int color_id, const char* mask_data, int mask_h,
+                           int mask_w, float alpha = .5f) {
+      auto color = v_.palette_.data[color_id % v_.palette_.data.size()];
+      auto x_end = std::min(origin.x + mask_w, img_.cols);
+      auto y_end = std::min(origin.y + mask_h, img_.rows);
+      auto img_data = img_.ptr<cv::Vec3b>();
+      for (int i = origin.y; i < y_end; ++i) {
+        for (int j = origin.x; j < x_end; ++j) {
+          if (mask_data[(i - origin.y) * mask_w + (j - origin.x)]) {
+            img_data[i * img_.cols + j] = img_data[i * img_.cols + j] * (1 - alpha) + color * alpha;
+          }
+        }
+      }
+    }
+
+    void add_bbox(mmdeploy_rect_t rect, int label_id, float score) {
+      rect.left *= scale_;
+      rect.right *= scale_;
+      rect.top *= scale_;
+      rect.bottom *= scale_;
       if (label_id >= 0 && score > 0) {
         auto area = std::max(0.f, (rect.right - rect.left) * (rect.bottom - rect.top));
-        add_text(to_text(label_id, score), {rect.left, rect.top}, std::sqrt(area * scale_));
+        add_text(to_text(label_id, score), {rect.left, rect.top}, std::sqrt(area));
       }
       cv::rectangle(img_, cv::Point2f(rect.left, rect.top), cv::Point2f(rect.right, rect.bottom),
                     cv::Scalar(0, 255, 0));
     }
 
-    void add_text_det(mmdeploy_point_t bbox[4], float score, const char* text, size_t text_size) {
+    void add_text_det(mmdeploy_point_t bbox[4], float score, const char* text, size_t text_size,
+                      int index) {
+      printf("bbox[%d]: (%.2f, %.2f), (%.2f, %.2f), (%.2f, %.2f), (%.2f, %.2f)\n", index,  //
+             bbox[0].x, bbox[0].y,                                                         //
+             bbox[1].x, bbox[1].y,                                                         //
+             bbox[2].x, bbox[2].y,                                                         //
+             bbox[3].x, bbox[3].y);
       std::vector<cv::Point> poly_points;
       cv::Point2f center{};
       for (int i = 0; i < 4; ++i) {
@@ -98,15 +132,18 @@ class Visualize {
         center += cv::Point2f(poly_points.back());
       }
       cv::polylines(img_, poly_points, true, cv::Scalar{0, 255, 0}, 1, cv::LINE_AA);
-      auto area = cv::contourArea(poly_points);
-      add_text(std::string(text, text + text_size), center / 4, std::sqrt(area));
+      if (text) {
+        auto area = cv::contourArea(poly_points);
+        fprintf(stdout, "text[%d]: %s\n", index, text);
+        add_text(std::string(text, text + text_size), center / 4, std::sqrt(area));
+      }
     }
 
     void add_rotated_det(const float bbox[5], int label_id, float score) {
-      float xc = bbox[0];
-      float yc = bbox[1];
-      float w = bbox[2];
-      float h = bbox[3];
+      float xc = bbox[0] * scale_;
+      float yc = bbox[1] * scale_;
+      float w = bbox[2] * scale_;
+      float h = bbox[3] * scale_;
       float ag = bbox[4];
       float wx = w / 2 * std::cos(ag);
       float wy = w / 2 * std::sin(ag);
@@ -116,27 +153,51 @@ class Visualize {
       cv::Point2f p2{xc + wx - hx, yc + wy - hy};
       cv::Point2f p3{xc + wx + hx, yc + wy + hy};
       cv::Point2f p4{xc - wx + hx, yc - wy + hy};
+      cv::Point2f c = .25f * (p1 + p2 + p3 + p4);
       cv::drawContours(
           img_,
           std::vector<std::vector<cv::Point>>{{p1 * scale_, p2 * scale_, p3 * scale_, p4 * scale_}},
           -1, {0, 255, 0}, 2);
+      add_text(to_text(label_id, score), c, std::sqrt(w * h));
     }
 
-    // TODO: handle score output
     void add_mask(int height, int width, int n_classes, const int* mask, const float* score) {
       cv::Mat color_mask = cv::Mat::zeros(height, width, CV_8UC3);
-      int pos = 0;
-      for (auto iter = color_mask.begin<cv::Vec3b>(); iter != color_mask.end<cv::Vec3b>(); ++iter) {
-        *iter = v_.palette_[mask[pos++] % v_.palette_.size()];
+      auto n_pix = color_mask.total();
+
+      // compute top 1 idx if score (CHW) is available
+      cv::Mat_<int> top;
+      if (!mask && score) {
+        top = cv::Mat_<int>::zeros(height, width);
+        for (auto c = 1; c < n_classes; ++c) {
+          top.forEach([&](int& x, const int* idx) {
+            auto offset = idx[0] * width + idx[1];
+            if (score[c * n_pix + offset] > score[x * n_pix + offset]) {
+              x = c;
+            }
+          });
+        }
+        mask = top.ptr<int>();
       }
-      if (color_mask.size() != img_.size()) {
-        cv::resize(color_mask, color_mask, img_.size(), 0., 0.);
+
+      if (mask) {
+        // palette look-up
+        color_mask.forEach<cv::Vec3b>([&](cv::Vec3b& x, const int* idx) {
+          auto& palette = v_.palette_.data;
+          x = palette[mask[idx[0] * width + idx[1]] % palette.size()];
+        });
+
+        if (color_mask.size() != img_.size()) {
+          cv::resize(color_mask, color_mask, img_.size());
+        }
+
+        // blend mask and background image
+        cv::addWeighted(img_, .5, color_mask, .5, 0., img_);
       }
-      cv::addWeighted(img_, .5, color_mask, .5, 0., img_);
     }
 
     void add_pose(const mmdeploy_point_t* pts, const float* scores, int32_t pts_size, double thr) {
-      auto& skel = v_.skel_;
+      auto& skel = v_.skeleton_;
       std::vector<int> used(pts_size);
       std::vector<int> is_end_point(pts_size);
       for (size_t i = 0; i < skel.links.size(); ++i) {
@@ -168,18 +229,21 @@ class Visualize {
     cv::Mat img_;
   };
 
-  explicit Visualize(int size = 0) : size_(size) {}
+  explicit Visualize(int size = 0) : size_(size) { palette_ = Palette::get(32); }
 
   Session get_session(const cv::Mat& frame) { return Session(*this, frame); }
 
-  void set_skeleton(const Skeleton& skel) { skel_ = skel; }
+  void set_skeleton(const Skeleton& skeleton) { skeleton_ = skeleton; }
+
+  void set_palette(const Palette& palette) { palette_ = palette; }
 
-  void set_palette(const std::vector<cv::Vec3b>& palette) { palette_ = palette; }
+  void set_background(const std::string& background) { background_ = background; }
 
  private:
   friend Session;
-  Skeleton skel_;
-  std::vector<cv::Vec3b> palette_;
+  Skeleton skeleton_;
+  Palette palette_;
+  std::string background_;
   int size_{};
 };
 
diff --git a/demo/csrc/cpp/video_cls.cxx b/demo/csrc/cpp/video_cls.cxx
index 95c9ef239e..ce08a50759 100644
--- a/demo/csrc/cpp/video_cls.cxx
+++ b/demo/csrc/cpp/video_cls.cxx
@@ -3,8 +3,8 @@
 #include <string>
 
 #include "mmdeploy/video_recognizer.hpp"
-#include "opencv2/imgcodecs/imgcodecs.hpp"
 #include "opencv2/videoio.hpp"
+#include "utils/argparse.h"
 
 void SampleFrames(const char* video_path, std::map<int, cv::Mat>& buffer,
                   std::vector<mmdeploy::Mat>& clips, int clip_len, int frame_interval = 1,
@@ -57,14 +57,14 @@ void SampleFrames(const char* video_path, std::map<int, cv::Mat>& buffer,
   }
 }
 
+DEFINE_ARG_string(model, "Model path");
+DEFINE_ARG_string(video, "Input video path");
+DEFINE_string(device, "cpu", R"(Device name, e.g. "cpu", "cuda")");
+
 int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    fprintf(stderr, "usage:\n  video_cls device_name model_path video_path\n");
-    return 1;
+  if (!utils::ParseArguments(argc, argv)) {
+    return -1;
   }
-  auto device_name = argv[1];
-  auto model_path = argv[2];
-  auto video_path = argv[3];
 
   int clip_len = 1;
   int frame_interval = 1;
@@ -73,10 +73,10 @@ int main(int argc, char* argv[]) {
   std::map<int, cv::Mat> buffer;
   std::vector<mmdeploy::Mat> clips;
   mmdeploy::VideoSampleInfo clip_info = {clip_len, num_clips};
-  SampleFrames(video_path, buffer, clips, clip_len, frame_interval, num_clips);
+  SampleFrames(ARGS_video.c_str(), buffer, clips, clip_len, frame_interval, num_clips);
 
-  mmdeploy::Model model(model_path);
-  mmdeploy::VideoRecognizer recognizer(model, mmdeploy::Device{device_name, 0});
+  mmdeploy::Model model(ARGS_model);
+  mmdeploy::VideoRecognizer recognizer(model, mmdeploy::Device{FLAGS_device});
 
   auto res = recognizer.Apply(clips, clip_info);
 

From 0ab7a5116e8d636679655a07cfabfacf249a24c2 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Mon, 6 Feb 2023 20:07:13 +0800
Subject: [PATCH 04/11] fix minor

---
 configs/mmseg/segmentation_onnxruntime_dynamic.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/configs/mmseg/segmentation_onnxruntime_dynamic.py b/configs/mmseg/segmentation_onnxruntime_dynamic.py
index 10979ff3fe..5553ee373c 100644
--- a/configs/mmseg/segmentation_onnxruntime_dynamic.py
+++ b/configs/mmseg/segmentation_onnxruntime_dynamic.py
@@ -1,3 +1 @@
 _base_ = ['./segmentation_dynamic.py', '../_base_/backends/onnxruntime.py']
-
-codebase_config = dict(type='mmseg', task='Segmentation', with_argmax=False)
\ No newline at end of file

From 9fbe5ea0c721f69d754f10e191f12c45a3ab9579 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Mon, 6 Feb 2023 20:10:27 +0800
Subject: [PATCH 05/11] fix minor

---
 demo/csrc/cpp/pose_tracker_params.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/csrc/cpp/pose_tracker_params.inc b/demo/csrc/cpp/pose_tracker_params.inc
index e21389eb81..2cda301869 100644
--- a/demo/csrc/cpp/pose_tracker_params.inc
+++ b/demo/csrc/cpp/pose_tracker_params.inc
@@ -36,4 +36,4 @@ void InitTrackerParams(mmdeploy::PoseTracker::Params& params) {
   params->pose_nms_thr = FLAGS_pose_nms_thr;
   params->track_iou_thr = FLAGS_track_iou_thr;
   params->track_max_missing = FLAGS_track_max_missing;
-}
\ No newline at end of file
+}

From 6a0ddf385e5fe2c7b28aec971d7fccecce9c8424 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 12:25:58 +0800
Subject: [PATCH 06/11] fix minor

---
 demo/csrc/cpp/utils/argparse.h | 3 ++-
 demo/csrc/cpp/utils/mediaio.h  | 2 +-
 demo/csrc/cpp/utils/palette.h  | 5 +++--
 demo/csrc/cpp/utils/skeleton.h | 3 ++-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/demo/csrc/cpp/utils/argparse.h b/demo/csrc/cpp/utils/argparse.h
index 37cee6656e..3204b0f6ca 100644
--- a/demo/csrc/cpp/utils/argparse.h
+++ b/demo/csrc/cpp/utils/argparse.h
@@ -108,7 +108,8 @@ class ArgParse {
           }
         }
         if (!found) {
-          std::cout << "error: unknown option: " << argv[i] << std::endl;
+          std::cout << "error: unknown option: " << key << std::endl;
+          return false;
         }
       } else {
         for (arg_idx++; arg_idx < infos_.size(); ++arg_idx) {
diff --git a/demo/csrc/cpp/utils/mediaio.h b/demo/csrc/cpp/utils/mediaio.h
index 3b61ea06ee..6175f659fc 100644
--- a/demo/csrc/cpp/utils/mediaio.h
+++ b/demo/csrc/cpp/utils/mediaio.h
@@ -123,7 +123,7 @@ class BatchInputIterator {
  private:
   void next() {
     data_.clear();
-    for (size_t i = 0; i < batch_size_ && ++iter_ != end_; ++i) {
+    for (size_t i = 0; i < batch_size_ && iter_ != end_; ++i, ++iter_) {
       data_.push_back(*iter_);
     }
   }
diff --git a/demo/csrc/cpp/utils/palette.h b/demo/csrc/cpp/utils/palette.h
index 007bd640d8..03d76a4360 100644
--- a/demo/csrc/cpp/utils/palette.h
+++ b/demo/csrc/cpp/utils/palette.h
@@ -55,8 +55,9 @@ inline Palette Palette::get(const std::string& path) {
     return p;
   }
   std::ifstream ifs(path);
-  if (ifs.is_open()) {
-    assert(0 && "Failed to open palette file.");
+  if (!ifs.is_open()) {
+    std::cout << "error: failed to open palette data file: " << path << "\n";
+    std::abort();
   }
   Palette p;
   int n = 0;
diff --git a/demo/csrc/cpp/utils/skeleton.h b/demo/csrc/cpp/utils/skeleton.h
index f0b49dfe60..59b0df415a 100644
--- a/demo/csrc/cpp/utils/skeleton.h
+++ b/demo/csrc/cpp/utils/skeleton.h
@@ -52,7 +52,8 @@ inline Skeleton Skeleton::get(const std::string& path) {
   }
   std::ifstream ifs(path);
   if (!ifs.is_open()) {
-    assert(0 && "Failed to open skeleton file");
+    std::cout << "error: failed to open skeleton data file: " << path << "\n";
+    std::abort();
   }
   Skeleton skel;
   int n = 0;

From fb01fcebdd9b70dadd4334ed98ef10a59820dbe2 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 13:11:12 +0800
Subject: [PATCH 07/11] install utils & fix demo file extensions

---
 csrc/mmdeploy/apis/cxx/CMakeLists.txt                | 1 +
 demo/csrc/cpp/{det_pose.cpp => det_pose.cxx}         | 0
 demo/csrc/cpp/{pose_tracker.cpp => pose_tracker.cxx} | 0
 3 files changed, 1 insertion(+)
 rename demo/csrc/cpp/{det_pose.cpp => det_pose.cxx} (100%)
 rename demo/csrc/cpp/{pose_tracker.cpp => pose_tracker.cxx} (100%)

diff --git a/csrc/mmdeploy/apis/cxx/CMakeLists.txt b/csrc/mmdeploy/apis/cxx/CMakeLists.txt
index 19d56344a6..6a5160d9c6 100644
--- a/csrc/mmdeploy/apis/cxx/CMakeLists.txt
+++ b/csrc/mmdeploy/apis/cxx/CMakeLists.txt
@@ -24,4 +24,5 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/common.hpp
 install(DIRECTORY ${CMAKE_SOURCE_DIR}/demo/csrc/ DESTINATION example/cpp
         FILES_MATCHING
         PATTERN "*.cxx"
+        PATTERN "*.h"
         )
diff --git a/demo/csrc/cpp/det_pose.cpp b/demo/csrc/cpp/det_pose.cxx
similarity index 100%
rename from demo/csrc/cpp/det_pose.cpp
rename to demo/csrc/cpp/det_pose.cxx
diff --git a/demo/csrc/cpp/pose_tracker.cpp b/demo/csrc/cpp/pose_tracker.cxx
similarity index 100%
rename from demo/csrc/cpp/pose_tracker.cpp
rename to demo/csrc/cpp/pose_tracker.cxx

From aec4b263bc54645cb5a5e93c6311355674b52572 Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 14:07:34 +0800
Subject: [PATCH 08/11] rename

---
 demo/csrc/cpp/pose_tracker.cxx                                  | 2 +-
 .../csrc/cpp/{pose_tracker_params.inc => pose_tracker_params.h} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename demo/csrc/cpp/{pose_tracker_params.inc => pose_tracker_params.h} (100%)

diff --git a/demo/csrc/cpp/pose_tracker.cxx b/demo/csrc/cpp/pose_tracker.cxx
index aa50d0c4f4..c57cc3e4a4 100644
--- a/demo/csrc/cpp/pose_tracker.cxx
+++ b/demo/csrc/cpp/pose_tracker.cxx
@@ -20,7 +20,7 @@ DEFINE_string(skeleton, "coco", R"(Path to skeleton data or name of predefined s
 DEFINE_string(background, "default",
               R"(Output background, "default": original image, "black": black background)");
 
-#include "pose_tracker_params.inc"
+#include "pose_tracker_params.h"
 
 int main(int argc, char* argv[]) {
   if (!utils::ParseArguments(argc, argv)) {
diff --git a/demo/csrc/cpp/pose_tracker_params.inc b/demo/csrc/cpp/pose_tracker_params.h
similarity index 100%
rename from demo/csrc/cpp/pose_tracker_params.inc
rename to demo/csrc/cpp/pose_tracker_params.h

From d6d2b2685077320bad6716cb82bd743d66c2a9ee Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 16:56:18 +0800
Subject: [PATCH 09/11] parse empty flags

---
 demo/csrc/cpp/utils/argparse.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/demo/csrc/cpp/utils/argparse.h b/demo/csrc/cpp/utils/argparse.h
index 3204b0f6ca..5c94c8afad 100644
--- a/demo/csrc/cpp/utils/argparse.h
+++ b/demo/csrc/cpp/utils/argparse.h
@@ -74,6 +74,7 @@ class ArgParse {
   bool _Parse(int argc, char* argv[]) {
     int arg_idx{-1};
     std::vector<std::string> args(infos_.size());
+    std::vector<int> used(infos_.size());
     for (int i = 1; i < argc; ++i) {
       if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
         return false;
@@ -103,7 +104,7 @@ class ArgParse {
           auto& flag = infos_[j];
           if (key == flag.name) {
             args[j] = val;
-            found = true;
+            found = used[j] = 1;
             break;
           }
         }
@@ -115,6 +116,7 @@ class ArgParse {
         for (arg_idx++; arg_idx < infos_.size(); ++arg_idx) {
           if (!infos_[arg_idx].is_flag) {
             args[arg_idx] = argv[i];
+            used[arg_idx] = 1;
             break;
           }
         }
@@ -143,7 +145,7 @@ class ArgParse {
     }
 
     for (int i = 0; i < infos_.size(); ++i) {
-      if (!args[i].empty()) {
+      if (used[i]) {
         try {
           parse_str(infos_[i], args[i]);
         } catch (...) {

From 379bb16e53d7e987c75c961598aa47de043d0e0e Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 16:59:44 +0800
Subject: [PATCH 10/11] antialias

---
 demo/csrc/cpp/utils/visualize.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demo/csrc/cpp/utils/visualize.h b/demo/csrc/cpp/utils/visualize.h
index 5d0323a0ee..e9d8493f58 100644
--- a/demo/csrc/cpp/utils/visualize.h
+++ b/demo/csrc/cpp/utils/visualize.h
@@ -157,7 +157,7 @@ class Visualize {
       cv::drawContours(
           img_,
           std::vector<std::vector<cv::Point>>{{p1 * scale_, p2 * scale_, p3 * scale_, p4 * scale_}},
-          -1, {0, 255, 0}, 2);
+          -1, {0, 255, 0}, 2, cv::LINE_AA);
       add_text(to_text(label_id, score), c, std::sqrt(w * h));
     }
 

From 4e375ae13bd4eb24d5508b7d845482fcbef9233d Mon Sep 17 00:00:00 2001
From: zhangli <lzhang329@gmail.com>
Date: Tue, 7 Feb 2023 18:27:50 +0800
Subject: [PATCH 11/11] handle video complications

---
 demo/csrc/cpp/utils/mediaio.h | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/demo/csrc/cpp/utils/mediaio.h b/demo/csrc/cpp/utils/mediaio.h
index 6175f659fc..3426dfb3e0 100644
--- a/demo/csrc/cpp/utils/mediaio.h
+++ b/demo/csrc/cpp/utils/mediaio.h
@@ -31,6 +31,21 @@ static std::string get_extension(const std::string& path) {
   return {};
 }
 
+int ext2fourcc(const std::string& ext) {
+  auto get_fourcc = [](const char* s) { return cv::VideoWriter::fourcc(s[0], s[1], s[2], s[3]); };
+  static std::map<std::string, int> ext2fourcc{
+      {".mp4", get_fourcc("mp4v")},
+      {".avi", get_fourcc("DIVX")},
+      {".mkv", get_fourcc("X264")},
+      {".wmv", get_fourcc("WMV3")},
+  };
+  auto it = ext2fourcc.find(ext);
+  if (it != ext2fourcc.end()) {
+    return it->second;
+  }
+  return get_fourcc("DIVX");
+}
+
 static bool is_video(const std::string& ext) {
   static const std::set<std::string> es{".mp4", ".avi", ".mkv", ".webm", ".mov", ".mpg", ".wmv"};
   return es.count(ext);
@@ -294,17 +309,17 @@ class Output {
  public:
   explicit Output(const std::string& path, int show, MediaType type = MediaType::kUnknown)
       : path_(path), type_(type), show_(show) {
+    ext_ = detail::get_extension(path);
     if (type_ == MediaType::kUnknown) {
-      auto ext = detail::get_extension(path);
       if (path_.empty()) {
         type_ = MediaType::kDisable;
-      } else if (detail::is_image(ext)) {
+      } else if (detail::is_image(ext_)) {
         if (detail::is_fmtstr(path)) {
           type_ = MediaType::kFmtStr;
         } else {
           type_ = MediaType::kImage;
         }
-      } else if (detail::is_video(ext)) {
+      } else if (detail::is_video(ext_)) {
         type_ = MediaType::kVideo;
       } else {
         std::cout << "unknown file type: " << path << "\n";
@@ -346,13 +361,16 @@ class Output {
  private:
   void write_video(const cv::Mat& frame) {
     if (!video_.isOpened()) {
-      video_.open(path_, cv::VideoWriter::fourcc('m', 'p', '4', 'v'), 30, frame.size());
+      open_video(frame.size());
     }
     video_ << frame;
   }
 
+  void open_video(const cv::Size& size) { video_.open(path_, detail::ext2fourcc(ext_), 30, size); }
+
  private:
   std::string path_;
+  std::string ext_;
   MediaType type_{MediaType::kUnknown};
   int show_{1};
   size_t frame_id_{0};