[Update] FaceFusion Speed up (#446)

* update CMakeLists.txt * face_recognizer_postprocess cuda code and wrapper implement * trt code update * face_swap_postprocess cuda code implement * update code * facefusion pipeline test code update * facefusion pipeline test code update * fix multi thread face68landmarks code * multi thread yolofacev8 code * update name * bgr2rgb cuda code implement * use cuda rgb2bgr method * update code * speed up paste_back func * update to cuda version paste_back * time test for preprocess postprocess and inference * update code * update code --------- Co-authored-by: DefTruth <[email protected]>
DefTruth · Dec 9, 2024 · 5f4938f · 5f4938f
1 parent a7cd9db
commit 5f4938f
Show file tree

Hide file tree

Showing 32 changed files with 1,002 additions and 82 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -28,7 +28,7 @@ enable_language(CUDA)
 set(LITE_AI_ROOT_DIR ${CMAKE_SOURCE_DIR})
 
 option(ENABLE_TEST "build test examples." OFF)
-option(ENABLE_DEBUG_STRING "enable DEBUG string or not" OFF)
+option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
 option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
 option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
 option(ENABLE_MNN "enable MNN engine" OFF)

diff --git a/examples/lite/cv/test_lite_facefusion_pipeline.cpp b/examples/lite/cv/test_lite_facefusion_pipeline.cpp
@@ -5,11 +5,11 @@
 static void test_default()
 {
 #ifdef ENABLE_ONNXRUNTIME
-    std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
-    std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
-    std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
-    std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
-    std::string face_restoration_onnx_path = "../../../examples/hub/onnx/cv/gfpgan_1.4.onnx";
+    std::string face_swap_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/inswapper_128.onnx";
+    std::string face_detect_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/yoloface_8n.onnx";
+    std::string face_landmarks_68 = "/home/lite.ai.toolkit/examples/hub/onnx/cv/2dfan4.onnx";
+    std::string face_recognizer_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/arcface_w600k_r50.onnx";
+    std::string face_restoration_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/gfpgan_1.4.onnx";
 
     auto pipeLine =  lite::cv::face::swap::facefusion::PipeLine(
             face_detect_onnx_path,
@@ -19,27 +19,62 @@ static void test_default()
             face_restoration_onnx_path
             );
 
-    std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
-    std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
-    std::string save_image_path = "../../../examples/logs/test_lite_facefusion_pipeline_result.jpg";
+    std::string source_image_path = "/home/lite.ai.toolkit/1.jpg";
+    std::string target_image_path = "/home/lite.ai.toolkit/2.jpg";
+    std::string save_image_path = "/home/lite.ai.toolkit/result111111.jpg";
 
 
     // 写一个测试时间的代码
     auto start = std::chrono::high_resolution_clock::now();
 
+    pipeLine.detect(source_image_path,target_image_path,save_image_path);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> diff = end-start;
+    std::cout << "Time: " << diff.count() << " s\n";
 
 
+#endif
+}
+
+
+
+
+static void test_tensorrt()
+{
+#ifdef ENABLE_TENSORRT
+    std::string face_swap_onnx_path = "../../../examples/hub/trt/inswapper_128_fp16.engine";
+    std::string face_detect_onnx_path = "../../../examples/hub/trt/yoloface_8n_fp16.engine";
+    std::string face_landmarks_68 = "../../../examples/hub/trt/2dfan4_fp16.engine";
+    std::string face_recognizer_onnx_path = "../../../examples/hub/trt/arcface_w600k_r50_fp16.engine";
+    std::string face_restoration_onnx_path = "../../../examples/hub/trt/gfpgan_1.4_fp32.engine";
+
+    auto pipeLine =  lite::trt::cv::face::swap::FaceFusionPipeLine (
+            face_detect_onnx_path,
+            face_landmarks_68,
+            face_recognizer_onnx_path,
+            face_swap_onnx_path,
+            face_restoration_onnx_path
+    );
+
+    std::string source_image_path = "../../../examples/logs/1.jpg";
+    std::string target_image_path = "../../../examples/logs/5.jpg";
+    std::string save_image_path = "../../../examples/logs/trt_pipeline_result_cuda_test_13_mt.jpg";
+
+
+    // 写一个测试时间的代码
+    auto start = std::chrono::high_resolution_clock::now();
+
     pipeLine.detect(source_image_path,target_image_path,save_image_path);
     auto end = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> diff = end-start;
-    std::cout << "Time: " << diff.count() << " s\n";
+    std::cout << "Time: " << diff.count()  * 1000<< " ms\n";
 
 
 #endif
 }
 
 int main()
 {
-
-    test_default();
+    test_tensorrt();
+//    test_default();
 }
diff --git a/lite/ort/cv/yolofacev8.cpp b/lite/ort/cv/yolofacev8.cpp
@@ -1,5 +1,5 @@
 //
-// Created by ai-test1 on 24-7-8.
+// Created by wangzijian on 24-7-8.
 //
 
 #include "yolofacev8.h"
@@ -9,6 +9,7 @@
 using ortcv::YoloFaceV8;
 
 float YoloFaceV8::get_iou(const lite::types::Boxf box1, const lite::types::Boxf box2) {
+    // 左上角是坐标轴原点，右下角是坐标轴最大值
     float x1 = std::max(box1.x1, box2.x1);
     float y1 = std::max(box1.y1, box2.y1);
     float x2 = std::min(box1.x2, box2.x2);

diff --git a/lite/trt/cv/trt_face_68landmarks_mt.cpp b/lite/trt/cv/trt_face_68landmarks_mt.cpp
@@ -99,6 +99,9 @@ trt_face_68landmarks_mt::trt_face_68landmarks_mt(std::string &model_path, size_t
         worker_threads.emplace_back(&trt_face_68landmarks_mt::worker_function, this, i);
     }
 
+    affine_matrixs.resize(num_threads);
+    img_with_landmarks_vec.resize(num_threads);
+
 }
 
 // 在cpp文件中修改相关实现
@@ -138,7 +141,7 @@ void trt_face_68landmarks_mt::worker_function(int thread_id) {
 }
 
 void
-trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
+trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img,int thread_id) {
     float xmin = bounding_box.x1;
     float ymin = bounding_box.y1;
     float xmax = bounding_box.x2;
@@ -159,7 +162,7 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const
 
     cv::Size crop_size(256, 256);
 
-    std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
+    std::tie(crop_img, affine_matrixs[thread_id]) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
 
     crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
 }
@@ -168,10 +171,10 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const
 void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thread_id) {
     if (task.input_mat.empty()) return;
 
-    img_with_landmarks = task.input_mat.clone();
+    img_with_landmarks_vec[thread_id] = task.input_mat.clone();
     cv::Mat crop_image;
 
-    preprocess(task.bbox, task.input_mat, crop_image);
+    preprocess(task.bbox, task.input_mat, crop_image, thread_id);
 
     std::vector<float> input_data;
 
@@ -198,13 +201,13 @@ void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thre
 
     // 带出结果
     // 指针指向带出来
-    *task.face_landmark_5of68 = postprocess(output.data());
+    *task.face_landmark_5of68 = postprocess(output.data(),thread_id);
 
     task.completion_promise.set_value();
 }
 
 
-std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs) {
+std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs,int thread_id) {
     std::vector<cv::Point2f> landmarks;
 
     for (int i = 0;i < 68; ++i)
@@ -215,15 +218,15 @@ std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs
     }
 
     cv::Mat inverse_affine_matrix;
-    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+    cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);
 
     cv::transform(landmarks, landmarks, inverse_affine_matrix);
 
     return face_utils::convert_face_landmark_68_to_5(landmarks);
 }
 
 
-void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
+void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id) {
     std::vector<cv::Point2f> landmarks;
 
     for (int i = 0;i < 68; ++i)
@@ -234,7 +237,7 @@ void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Po
     }
 
     cv::Mat inverse_affine_matrix;
-    cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
+    cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);
 
     cv::transform(landmarks, landmarks, inverse_affine_matrix);
 

diff --git a/lite/trt/cv/trt_face_68landmarks_mt.h b/lite/trt/cv/trt_face_68landmarks_mt.h
@@ -65,17 +65,18 @@ class trt_face_68landmarks_mt {
     // 实际的推理函数
     void process_single_task(InferenceTask& task, int thread_id);
 
-    void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
+    void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img,int thread_id);
 
-    void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
+    void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id);
 
-    std::vector<cv::Point2f> postprocess(float *trt_outputs);
+    std::vector<cv::Point2f> postprocess(float *trt_outputs,int thread_id);
 
 
 
 private:
-    cv::Mat affine_matrix;
-    cv::Mat img_with_landmarks;
+    std::vector<cv::Mat> affine_matrixs;
+    std::vector<cv::Mat> img_with_landmarks_vec;
+
 
 public:
     explicit trt_face_68landmarks_mt(std::string& model_path, size_t num_threads = 4);

diff --git a/lite/trt/cv/trt_face_recognizer.cpp b/lite/trt/cv/trt_face_recognizer.cpp
@@ -52,15 +52,21 @@ void TRTFaceFusionFaceRecognizer::detect(cv::Mat &input_mat, std::vector<cv::Poi
     std::vector<float> normal_embeding(output.begin(),output.end());
 
 
-    float norm = 0.0f;
-    for (const auto &val : normal_embeding) {
-        norm += val * val;
-    }
-    norm = std::sqrt(norm);
+    launch_face_recognizer_postprocess(
+            static_cast<float*>(buffers[1]),
+            output_node_dims[0][0] * output_node_dims[0][1],
+            output.data()
+            );
+//    float norm = 0.0f;
+//    for (const auto &val : normal_embeding) {
+//        norm += val * val;
+//    }
+//    norm = std::sqrt(norm);
+//
+//    for (auto &val : normal_embeding) {
+//        val /= norm;
+//    }
 
-    for (auto &val : normal_embeding) {
-        val /= norm;
-    }
 
     std::cout<<"done!"<<std::endl;
 

diff --git a/lite/trt/cv/trt_face_recognizer.h b/lite/trt/cv/trt_face_recognizer.h
@@ -8,6 +8,7 @@
 #include "lite/trt/core/trt_core.h"
 #include "lite/trt/core/trt_utils.h"
 #include "lite/trt/core/trt_types.h"
+#include "lite/trt/kernel/face_recognizer_postprocess_manager.h"
 
 namespace trtcv{
     class  LITE_EXPORTS TRTFaceFusionFaceRecognizer : BasicTRTHandler{

diff --git a/lite/trt/cv/trt_face_restoration.cpp b/lite/trt/cv/trt_face_restoration.cpp
@@ -11,42 +11,53 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<
 
     cv::Mat crop_image;
     cv::Mat affine_matrix;
-    std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512);
+    // 记录时间
+    auto start_warp = std::chrono::high_resolution_clock::now();
+    std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,
+                                                                                  face_utils::FFHQ_512);
 
     std::vector<float> crop_size = {512,512};
     cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
     std::vector<cv::Mat> crop_mask_list;
     crop_mask_list.emplace_back(box_mask);
 
-    cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
-    crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f);
-    crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f);
+    cv::Mat crop_image_rgb;
+    launch_bgr2rgb(crop_image,crop_image_rgb);
+    crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,1.f / 255.f);
+    crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,2.0f,-1.f);
 
     std::vector<float> input_vector;
-    trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims,trtcv::utils::transform::CHW);
+    trtcv::utils::transform::create_tensor(crop_image_rgb,input_vector,input_node_dims,trtcv::utils::transform::CHW);
 
-    // 拷贝
+    auto end_warp = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> fp_ms_warp = end_warp - start_warp;
+    std::cout << "FaceRestoration preprocess time: " << fp_ms_warp.count() << "ms" << std::endl;
 
+
+    // 记录时间
+    auto start = std::chrono::high_resolution_clock::now();
     // 先不用拷贝了 处理完成再拷贝出来 类似于整个后处理放在GPU上完成
     cudaMemcpyAsync(buffers[0],input_vector.data(),1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyHostToDevice,stream);
-
     // 同步
     cudaStreamSynchronize(stream);
-
     // 推理
     bool status = trt_context->enqueueV3(stream);
+
     if (!status) {
         std::cerr << "Failed to inference" << std::endl;
         return;
     }
-
-
     // 同步
     cudaStreamSynchronize(stream);
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> fp_ms = end - start;
+    std::cout << "FaceRestoration Inference time: " << fp_ms.count() << "ms" << std::endl;
     std::vector<unsigned char> transposed_data(1 * 3 * 512 * 512);
 
 //    std::vector<float> transposed_data(1 * 3 * 512 * 512);
 
+    // 记录时间
+    auto start_postprocess = std::chrono::high_resolution_clock::now();
     // 这里buffer1就是输出了
     launch_face_restoration_postprocess(
             static_cast<float*>(buffers[1]),
@@ -64,47 +75,31 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<
     std::vector<float> output_vector(1 * 3 * 512 * 512);
 //    cudaMemcpyAsync(output_vector.data(),buffers[1],1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyDeviceToHost,stream);
     cudaStreamSynchronize(stream);
-//
     // 后处理
     int channel = 3;
     int height = 512;
     int width = 512;
-//    std::vector<float> output(channel * height * width);
-//    output.assign(output_vector.begin(),output_vector.end());
-//
-//    std::transform(output.begin(),output.end(),output.begin(),
-//                   [](double x){return std::max(-1.0,std::max(-1.0,std::min(1.0,x)));});
-//
-//    std::transform(output.begin(),output.end(),output.begin(),
-//                   [](double x){return (x + 1.f) /2.f;});
-//
-//    // CHW2HWC
-//    for (int c = 0; c < channel; ++c){
-//        for (int h = 0 ; h < height; ++h){
-//            for (int w = 0; w < width ; ++w){
-//                int src_index = c * (height * width) + h * width + w;
-//                int dst_index = h * (width * channel) + w *  channel + c;
-//                transposed_data[dst_index] = output[src_index];
-//            }
-//        }
-//    }
-//
-//    std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(),
-//                   [](float x){return std::round(x * 255.f);});
-//
-//    std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
-//                   [](float x) { return static_cast<uint8_t>(x); });
 
 
     cv::Mat mat(height, width, CV_32FC3, transposed_data_float.data());
-//    cv::imwrite("/home/lite.ai.toolkit/mid_process.jpg",mat);
     cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
+    // 到这里为止基本不耗时
 
 
     auto crop_mask = crop_mask_list[0];
-    cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
-
+    // 这里的paste_back 40ms左右
+    cv::Mat paste_frame = launch_paste_back(ori_image,mat,crop_mask,affine_matrix);
+//    cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
     cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame);
+    auto end_postprocess = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> fp_ms_postprocess = end_postprocess - start_postprocess;
+    std::cout << "FaceRestoration postprocess time: " << fp_ms_postprocess.count() << "ms" << std::endl;
 
+    // 记录时间
+    auto start_save = std::chrono::high_resolution_clock::now();
     cv::imwrite(face_enchaner_path,dst_image);
+    auto end_save = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> fp_ms_save = end_save - start_save;
+    std::cout << "FaceRestoration save time: " << fp_ms_save.count() << "ms" << std::endl;
+
 }
diff --git a/lite/trt/cv/trt_face_restoration.h b/lite/trt/cv/trt_face_restoration.h
@@ -9,12 +9,15 @@
 #include "lite/trt/core/trt_config.h"
 #include "lite/ort/cv/face_utils.h"
 #include "lite/trt/kernel/face_restoration_postprocess_manager.h"
+#include "lite/trt/kernel/bgr2rgb_manager.h"
+#include "lite/trt/kernel/paste_back_manager.h"
 namespace trtcv{
     class LITE_EXPORTS TRTFaceFusionFaceRestoration : BasicTRTHandler{
     public:
         explicit TRTFaceFusionFaceRestoration(const std::string& _trt_model_path,unsigned int _num_threads = 1) :
                 BasicTRTHandler(_trt_model_path,_num_threads){};;
     public:
+        // 这个是直接保存的
         void detect(cv::Mat &face_swap_image,std::vector<cv::Point2f > &target_landmarks_5 ,const std::string &face_enchaner_path);
 
     };