Skip to content

Commit

Permalink
[Update] FaceFusion Speed up (#446)
Browse files Browse the repository at this point in the history
* update CMakeLists.txt

* face_recognizer_postprocess cuda code and wrapper implement

* trt code update

* face_swap_postprocess cuda code implement

* update code

* facefusion pipeline test code update

* facefusion pipeline test code update

* fix multi thread face68landmarks code

* multi thread yolofacev8 code

* update name

* bgr2rgb cuda code implement

* use cuda rgb2bgr method

* update code

* speed up paste_back func

* update to cuda version paste_back

* time test for preprocess postprocess and inference

* update code

* update code

---------

Co-authored-by: DefTruth <[email protected]>
  • Loading branch information
wangzijian1010 and DefTruth authored Dec 9, 2024
1 parent a7cd9db commit 5f4938f
Show file tree
Hide file tree
Showing 32 changed files with 1,002 additions and 82 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ enable_language(CUDA)
set(LITE_AI_ROOT_DIR ${CMAKE_SOURCE_DIR})

option(ENABLE_TEST "build test examples." OFF)
option(ENABLE_DEBUG_STRING "enable DEBUG string or not" OFF)
option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
option(ENABLE_MNN "enable MNN engine" OFF)
Expand Down
57 changes: 46 additions & 11 deletions examples/lite/cv/test_lite_facefusion_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
// Runs the ONNXRuntime face-fusion pipeline once on a fixed source/target image
// pair and prints how long the detect() call took, in seconds.
// The whole body is compiled only when ENABLE_ONNXRUNTIME is defined; otherwise
// the function does nothing.
static void test_default()
{
#ifdef ENABLE_ONNXRUNTIME
std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
std::string face_restoration_onnx_path = "../../../examples/hub/onnx/cv/gfpgan_1.4.onnx";
std::string face_swap_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/inswapper_128.onnx";
std::string face_detect_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/yoloface_8n.onnx";
std::string face_landmarks_68 = "/home/lite.ai.toolkit/examples/hub/onnx/cv/2dfan4.onnx";
std::string face_recognizer_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/arcface_w600k_r50.onnx";
std::string face_restoration_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/gfpgan_1.4.onnx";

auto pipeLine = lite::cv::face::swap::facefusion::PipeLine(
face_detect_onnx_path,
Expand All @@ -19,27 +19,62 @@ static void test_default()
face_restoration_onnx_path
);

std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
std::string save_image_path = "../../../examples/logs/test_lite_facefusion_pipeline_result.jpg";
std::string source_image_path = "/home/lite.ai.toolkit/1.jpg";
std::string target_image_path = "/home/lite.ai.toolkit/2.jpg";
std::string save_image_path = "/home/lite.ai.toolkit/result111111.jpg";


// Timing code: measure how long the pipeline call takes.
auto start = std::chrono::high_resolution_clock::now();

pipeLine.detect(source_image_path,target_image_path,save_image_path);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end-start;
std::cout << "Time: " << diff.count() << " s\n";


#endif
}




// Runs the TensorRT face-fusion pipeline once on a fixed source/target image
// pair and prints how long the detect() call took, first in seconds and then
// in milliseconds.
// The whole body is compiled only when ENABLE_TENSORRT is defined; otherwise
// the function does nothing.
static void test_tensorrt()
{
#ifdef ENABLE_TENSORRT
    // Paths to the serialized TensorRT engine files, one per pipeline stage.
    // Kept as non-const std::string so they can bind to std::string& parameters.
    std::string face_swap_engine_path = "../../../examples/hub/trt/inswapper_128_fp16.engine";
    std::string face_detect_engine_path = "../../../examples/hub/trt/yoloface_8n_fp16.engine";
    std::string face_landmarks_engine_path = "../../../examples/hub/trt/2dfan4_fp16.engine";
    std::string face_recognizer_engine_path = "../../../examples/hub/trt/arcface_w600k_r50_fp16.engine";
    std::string face_restoration_engine_path = "../../../examples/hub/trt/gfpgan_1.4_fp32.engine";

    // Argument order: detector, 68-landmarks, recognizer, swapper, restoration.
    auto pipeLine = lite::trt::cv::face::swap::FaceFusionPipeLine(
        face_detect_engine_path,
        face_landmarks_engine_path,
        face_recognizer_engine_path,
        face_swap_engine_path,
        face_restoration_engine_path
    );

    std::string source_image_path = "../../../examples/logs/1.jpg";
    std::string target_image_path = "../../../examples/logs/5.jpg";
    std::string save_image_path = "../../../examples/logs/trt_pipeline_result_cuda_test_13_mt.jpg";

    // Time the pipeline call. steady_clock is monotonic, so the measured
    // interval cannot be distorted by system clock adjustments.
    const auto start = std::chrono::steady_clock::now();
    pipeLine.detect(source_image_path, target_image_path, save_image_path);
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> diff = end - start;
    std::cout << "Time: " << diff.count() << " s\n";
    std::cout << "Time: " << diff.count() * 1000<< " ms\n";
#endif
}

// Entry point of the facefusion pipeline example program.
int main()
{

test_default();
test_tensorrt();
// test_default();
}
3 changes: 2 additions & 1 deletion lite/ort/cv/yolofacev8.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// Created by ai-test1 on 24-7-8.
// Created by wangzijian on 24-7-8.
//

#include "yolofacev8.h"
Expand All @@ -9,6 +9,7 @@
using ortcv::YoloFaceV8;

float YoloFaceV8::get_iou(const lite::types::Boxf box1, const lite::types::Boxf box2) {
// 左上角是坐标轴原点,右下角是坐标轴最大值
float x1 = std::max(box1.x1, box2.x1);
float y1 = std::max(box1.y1, box2.y1);
float x2 = std::min(box1.x2, box2.x2);
Expand Down
21 changes: 12 additions & 9 deletions lite/trt/cv/trt_face_68landmarks_mt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ trt_face_68landmarks_mt::trt_face_68landmarks_mt(std::string &model_path, size_t
worker_threads.emplace_back(&trt_face_68landmarks_mt::worker_function, this, i);
}

affine_matrixs.resize(num_threads);
img_with_landmarks_vec.resize(num_threads);

}

// 在cpp文件中修改相关实现
Expand Down Expand Up @@ -138,7 +141,7 @@ void trt_face_68landmarks_mt::worker_function(int thread_id) {
}

void
trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img,int thread_id) {
float xmin = bounding_box.x1;
float ymin = bounding_box.y1;
float xmax = bounding_box.x2;
Expand All @@ -159,7 +162,7 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const

cv::Size crop_size(256, 256);

std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
std::tie(crop_img, affine_matrixs[thread_id]) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);

crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
}
Expand All @@ -168,10 +171,10 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const
void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thread_id) {
if (task.input_mat.empty()) return;

img_with_landmarks = task.input_mat.clone();
img_with_landmarks_vec[thread_id] = task.input_mat.clone();
cv::Mat crop_image;

preprocess(task.bbox, task.input_mat, crop_image);
preprocess(task.bbox, task.input_mat, crop_image, thread_id);

std::vector<float> input_data;

Expand All @@ -198,13 +201,13 @@ void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thre

// 带出结果
// 指针指向带出来
*task.face_landmark_5of68 = postprocess(output.data());
*task.face_landmark_5of68 = postprocess(output.data(),thread_id);

task.completion_promise.set_value();
}


std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs) {
std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs,int thread_id) {
std::vector<cv::Point2f> landmarks;

for (int i = 0;i < 68; ++i)
Expand All @@ -215,15 +218,15 @@ std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs
}

cv::Mat inverse_affine_matrix;
cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);

cv::transform(landmarks, landmarks, inverse_affine_matrix);

return face_utils::convert_face_landmark_68_to_5(landmarks);
}


void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id) {
std::vector<cv::Point2f> landmarks;

for (int i = 0;i < 68; ++i)
Expand All @@ -234,7 +237,7 @@ void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Po
}

cv::Mat inverse_affine_matrix;
cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);

cv::transform(landmarks, landmarks, inverse_affine_matrix);

Expand Down
11 changes: 6 additions & 5 deletions lite/trt/cv/trt_face_68landmarks_mt.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,18 @@ class trt_face_68landmarks_mt {
// 实际的推理函数
void process_single_task(InferenceTask& task, int thread_id);

void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img,int thread_id);

void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id);

std::vector<cv::Point2f> postprocess(float *trt_outputs);
std::vector<cv::Point2f> postprocess(float *trt_outputs,int thread_id);



private:
cv::Mat affine_matrix;
cv::Mat img_with_landmarks;
std::vector<cv::Mat> affine_matrixs;
std::vector<cv::Mat> img_with_landmarks_vec;


public:
explicit trt_face_68landmarks_mt(std::string& model_path, size_t num_threads = 4);
Expand Down
22 changes: 14 additions & 8 deletions lite/trt/cv/trt_face_recognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,21 @@ void TRTFaceFusionFaceRecognizer::detect(cv::Mat &input_mat, std::vector<cv::Poi
std::vector<float> normal_embeding(output.begin(),output.end());


float norm = 0.0f;
for (const auto &val : normal_embeding) {
norm += val * val;
}
norm = std::sqrt(norm);
launch_face_recognizer_postprocess(
static_cast<float*>(buffers[1]),
output_node_dims[0][0] * output_node_dims[0][1],
output.data()
);
// float norm = 0.0f;
// for (const auto &val : normal_embeding) {
// norm += val * val;
// }
// norm = std::sqrt(norm);
//
// for (auto &val : normal_embeding) {
// val /= norm;
// }

for (auto &val : normal_embeding) {
val /= norm;
}

std::cout<<"done!"<<std::endl;

Expand Down
1 change: 1 addition & 0 deletions lite/trt/cv/trt_face_recognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "lite/trt/core/trt_core.h"
#include "lite/trt/core/trt_utils.h"
#include "lite/trt/core/trt_types.h"
#include "lite/trt/kernel/face_recognizer_postprocess_manager.h"

namespace trtcv{
class LITE_EXPORTS TRTFaceFusionFaceRecognizer : BasicTRTHandler{
Expand Down
73 changes: 34 additions & 39 deletions lite/trt/cv/trt_face_restoration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,42 +11,53 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<

cv::Mat crop_image;
cv::Mat affine_matrix;
std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512);
// 记录时间
auto start_warp = std::chrono::high_resolution_clock::now();
std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,
face_utils::FFHQ_512);

std::vector<float> crop_size = {512,512};
cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
std::vector<cv::Mat> crop_mask_list;
crop_mask_list.emplace_back(box_mask);

cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f);
crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f);
cv::Mat crop_image_rgb;
launch_bgr2rgb(crop_image,crop_image_rgb);
crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,1.f / 255.f);
crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,2.0f,-1.f);

std::vector<float> input_vector;
trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims,trtcv::utils::transform::CHW);
trtcv::utils::transform::create_tensor(crop_image_rgb,input_vector,input_node_dims,trtcv::utils::transform::CHW);

// 拷贝
auto end_warp = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_warp = end_warp - start_warp;
std::cout << "FaceRestoration preprocess time: " << fp_ms_warp.count() << "ms" << std::endl;


// 记录时间
auto start = std::chrono::high_resolution_clock::now();
// 先不用拷贝了 处理完成再拷贝出来 类似于整个后处理放在GPU上完成
cudaMemcpyAsync(buffers[0],input_vector.data(),1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyHostToDevice,stream);

// 同步
cudaStreamSynchronize(stream);

// 推理
bool status = trt_context->enqueueV3(stream);

if (!status) {
std::cerr << "Failed to inference" << std::endl;
return;
}


// 同步
cudaStreamSynchronize(stream);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms = end - start;
std::cout << "FaceRestoration Inference time: " << fp_ms.count() << "ms" << std::endl;
std::vector<unsigned char> transposed_data(1 * 3 * 512 * 512);

// std::vector<float> transposed_data(1 * 3 * 512 * 512);

// 记录时间
auto start_postprocess = std::chrono::high_resolution_clock::now();
// 这里buffer1就是输出了
launch_face_restoration_postprocess(
static_cast<float*>(buffers[1]),
Expand All @@ -64,47 +75,31 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<
std::vector<float> output_vector(1 * 3 * 512 * 512);
// cudaMemcpyAsync(output_vector.data(),buffers[1],1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyDeviceToHost,stream);
cudaStreamSynchronize(stream);
//
// 后处理
int channel = 3;
int height = 512;
int width = 512;
// std::vector<float> output(channel * height * width);
// output.assign(output_vector.begin(),output_vector.end());
//
// std::transform(output.begin(),output.end(),output.begin(),
// [](double x){return std::max(-1.0,std::max(-1.0,std::min(1.0,x)));});
//
// std::transform(output.begin(),output.end(),output.begin(),
// [](double x){return (x + 1.f) /2.f;});
//
// // CHW2HWC
// for (int c = 0; c < channel; ++c){
// for (int h = 0 ; h < height; ++h){
// for (int w = 0; w < width ; ++w){
// int src_index = c * (height * width) + h * width + w;
// int dst_index = h * (width * channel) + w * channel + c;
// transposed_data[dst_index] = output[src_index];
// }
// }
// }
//
// std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(),
// [](float x){return std::round(x * 255.f);});
//
// std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
// [](float x) { return static_cast<uint8_t>(x); });


cv::Mat mat(height, width, CV_32FC3, transposed_data_float.data());
// cv::imwrite("/home/lite.ai.toolkit/mid_process.jpg",mat);
cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
// 到这里为止基本不耗时


auto crop_mask = crop_mask_list[0];
cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);

// 这里的paste_back 40ms左右
cv::Mat paste_frame = launch_paste_back(ori_image,mat,crop_mask,affine_matrix);
// cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame);
auto end_postprocess = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_postprocess = end_postprocess - start_postprocess;
std::cout << "FaceRestoration postprocess time: " << fp_ms_postprocess.count() << "ms" << std::endl;

// 记录时间
auto start_save = std::chrono::high_resolution_clock::now();
cv::imwrite(face_enchaner_path,dst_image);
auto end_save = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_save = end_save - start_save;
std::cout << "FaceRestoration save time: " << fp_ms_save.count() << "ms" << std::endl;

}
3 changes: 3 additions & 0 deletions lite/trt/cv/trt_face_restoration.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
#include "lite/trt/core/trt_config.h"
#include "lite/ort/cv/face_utils.h"
#include "lite/trt/kernel/face_restoration_postprocess_manager.h"
#include "lite/trt/kernel/bgr2rgb_manager.h"
#include "lite/trt/kernel/paste_back_manager.h"
namespace trtcv{
// TensorRT handler for the FaceFusion face-restoration stage.
// Construction simply forwards the model path and thread count to BasicTRTHandler.
class LITE_EXPORTS TRTFaceFusionFaceRestoration : BasicTRTHandler{
public:
explicit TRTFaceFusionFaceRestoration(const std::string& _trt_model_path,unsigned int _num_threads = 1) :
BasicTRTHandler(_trt_model_path,_num_threads){};;
public:
// This variant saves the result directly: the final image is written to face_enchaner_path.
void detect(cv::Mat &face_swap_image,std::vector<cv::Point2f > &target_landmarks_5 ,const std::string &face_enchaner_path);

};
Expand Down
Loading

0 comments on commit 5f4938f

Please sign in to comment.