[Update] FaceFusion Speed up #446

Merged · 19 commits · Dec 9, 2024
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -27,10 +27,10 @@ set(CMAKE_CUDA_ARCHITECTURES 89) # For RTX 40xx series (Ada Lovelace)
enable_language(CUDA)
set(LITE_AI_ROOT_DIR ${CMAKE_SOURCE_DIR})

option(ENABLE_TEST "build test examples." ON)
option(ENABLE_TEST "build test examples." OFF)
option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
option(ENABLE_TENSORRT "enable TensorRT engine" ON)
option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
option(ENABLE_MNN "enable MNN engine" OFF)
option(ENABLE_NCNN "enable NCNN engine" OFF)
option(ENABLE_TNN "enable TNN engine" OFF)
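These `ENABLE_*` options are presumably forwarded as compile definitions, which is why the test file below guards each engine path with the preprocessor. A minimal sketch of that pattern (names assumed, not taken from the PR):

```cpp
#include <iostream>

// Sketch only: assumes the CMake ENABLE_* options above are propagated as
// compile definitions (e.g. via target_compile_definitions).
int main() {
#ifdef ENABLE_ONNXRUNTIME
    std::cout << "ONNX Runtime path compiled in\n";
#endif
#ifdef ENABLE_TENSORRT
    std::cout << "TensorRT path compiled in\n";
#endif
    return 0;
}
```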
57 changes: 46 additions & 11 deletions examples/lite/cv/test_lite_facefusion_pipeline.cpp
@@ -5,11 +5,11 @@
static void test_default()
{
#ifdef ENABLE_ONNXRUNTIME
std::string face_swap_onnx_path = "../../../examples/hub/onnx/cv/inswapper_128.onnx";
std::string face_detect_onnx_path = "../../../examples/hub/onnx/cv/yoloface_8n.onnx";
std::string face_landmarks_68 = "../../../examples/hub/onnx/cv/2dfan4.onnx";
std::string face_recognizer_onnx_path = "../../../examples/hub/onnx/cv/arcface_w600k_r50.onnx";
std::string face_restoration_onnx_path = "../../../examples/hub/onnx/cv/gfpgan_1.4.onnx";
std::string face_swap_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/inswapper_128.onnx";
std::string face_detect_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/yoloface_8n.onnx";
std::string face_landmarks_68 = "/home/lite.ai.toolkit/examples/hub/onnx/cv/2dfan4.onnx";
std::string face_recognizer_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/arcface_w600k_r50.onnx";
std::string face_restoration_onnx_path = "/home/lite.ai.toolkit/examples/hub/onnx/cv/gfpgan_1.4.onnx";

auto pipeLine = lite::cv::face::swap::facefusion::PipeLine(
face_detect_onnx_path,
@@ -19,27 +19,62 @@ static void test_default()
face_restoration_onnx_path
);

std::string source_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_source.jpg";
std::string target_image_path = "../../../examples/lite/resources/test_lite_facefusion_pipeline_target.jpg";
std::string save_image_path = "../../../examples/logs/test_lite_facefusion_pipeline_result.jpg";
std::string source_image_path = "/home/lite.ai.toolkit/1.jpg";
std::string target_image_path = "/home/lite.ai.toolkit/2.jpg";
std::string save_image_path = "/home/lite.ai.toolkit/result111111.jpg";


// Time the pipeline run
auto start = std::chrono::high_resolution_clock::now();

pipeLine.detect(source_image_path,target_image_path,save_image_path);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end-start;
std::cout << "Time: " << diff.count() << " s\n";


#endif
}




static void test_tensorrt()
{
#ifdef ENABLE_TENSORRT
std::string face_swap_onnx_path = "../../../examples/hub/trt/inswapper_128_fp16.engine";
std::string face_detect_onnx_path = "../../../examples/hub/trt/yoloface_8n_fp16.engine";
std::string face_landmarks_68 = "../../../examples/hub/trt/2dfan4_fp16.engine";
std::string face_recognizer_onnx_path = "../../../examples/hub/trt/arcface_w600k_r50_fp16.engine";
std::string face_restoration_onnx_path = "../../../examples/hub/trt/gfpgan_1.4_fp32.engine";

auto pipeLine = lite::trt::cv::face::swap::FaceFusionPipeLine (
face_detect_onnx_path,
face_landmarks_68,
face_recognizer_onnx_path,
face_swap_onnx_path,
face_restoration_onnx_path
);

std::string source_image_path = "../../../examples/logs/1.jpg";
std::string target_image_path = "../../../examples/logs/5.jpg";
std::string save_image_path = "../../../examples/logs/trt_pipeline_result_cuda_test_13_mt.jpg";


// Time the pipeline run
auto start = std::chrono::high_resolution_clock::now();

pipeLine.detect(source_image_path,target_image_path,save_image_path);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end-start;
std::cout << "Time: " << diff.count() << " s\n";
std::cout << "Time: " << diff.count() * 1000<< " ms\n";


#endif
}

int main()
{

test_default();
test_tensorrt();
// test_default();
}
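Both tests repeat the same `std::chrono` stopwatch pattern around `pipeLine.detect`. A small RAII timer, shown here as a sketch rather than part of the PR, would factor that out:

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Hypothetical helper: prints the elapsed time for a scope on destruction.
struct ScopedTimer {
    std::string label;
    std::chrono::high_resolution_clock::time_point start =
        std::chrono::high_resolution_clock::now();

    ~ScopedTimer() {
        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> ms = end - start;
        std::cout << label << ": " << ms.count() << " ms\n";
    }
};

// Usage inside either test:
// {
//     ScopedTimer t{"FaceFusion pipeline"};
//     pipeLine.detect(source_image_path, target_image_path, save_image_path);
// }
```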
3 changes: 2 additions & 1 deletion lite/ort/cv/yolofacev8.cpp
@@ -1,5 +1,5 @@
//
// Created by ai-test1 on 24-7-8.
// Created by wangzijian on 24-7-8.
//

#include "yolofacev8.h"
@@ -9,6 +9,7 @@
using ortcv::YoloFaceV8;

float YoloFaceV8::get_iou(const lite::types::Boxf box1, const lite::types::Boxf box2) {
// The origin is the top-left corner; coordinates increase toward the bottom-right
float x1 = std::max(box1.x1, box2.x1);
float y1 = std::max(box1.y1, box2.y1);
float x2 = std::min(box1.x2, box2.x2);
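The hunk above only shows `get_iou` clipping the intersection rectangle; for reference, the complete IoU computation follows this shape (a self-contained sketch with a simplified box type, using the top-left-origin convention the new comment describes):

```cpp
#include <algorithm>

struct Boxf { float x1, y1, x2, y2; };  // (x1,y1) top-left, (x2,y2) bottom-right

float iou(const Boxf& a, const Boxf& b) {
    // Corners of the intersection rectangle.
    float x1 = std::max(a.x1, b.x1);
    float y1 = std::max(a.y1, b.y1);
    float x2 = std::min(a.x2, b.x2);
    float y2 = std::min(a.y2, b.y2);

    // Clamp to zero when the boxes do not overlap at all.
    float inter  = std::max(0.f, x2 - x1) * std::max(0.f, y2 - y1);
    float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
    float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);
    return inter / (area_a + area_b - inter + 1e-6f);
}
```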
21 changes: 12 additions & 9 deletions lite/trt/cv/trt_face_68landmarks_mt.cpp
@@ -99,6 +99,9 @@ trt_face_68landmarks_mt::trt_face_68landmarks_mt(std::string &model_path, size_t
worker_threads.emplace_back(&trt_face_68landmarks_mt::worker_function, this, i);
}

affine_matrixs.resize(num_threads);
img_with_landmarks_vec.resize(num_threads);

}

// Update the corresponding implementation in the .cpp file
@@ -138,7 +141,7 @@ void trt_face_68landmarks_mt::worker_function(int thread_id) {
}

void
trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img) {
trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const cv::Mat &input_mat, cv::Mat &crop_img,int thread_id) {
float xmin = bounding_box.x1;
float ymin = bounding_box.y1;
float xmax = bounding_box.x2;
@@ -159,7 +162,7 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const

cv::Size crop_size(256, 256);

std::tie(crop_img, affine_matrix) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);
std::tie(crop_img, affine_matrixs[thread_id]) = face_utils::warp_face_by_translation(input_mat, translation, scale, crop_size);

crop_img.convertTo(crop_img,CV_32FC3,1 / 255.f);
}
@@ -168,10 +171,10 @@ trt_face_68landmarks_mt::preprocess(const lite::types::Boxf &bounding_box, const
void trt_face_68landmarks_mt::process_single_task( InferenceTask &task, int thread_id) {
if (task.input_mat.empty()) return;

img_with_landmarks = task.input_mat.clone();
img_with_landmarks_vec[thread_id] = task.input_mat.clone();
cv::Mat crop_image;

preprocess(task.bbox, task.input_mat, crop_image);
preprocess(task.bbox, task.input_mat, crop_image, thread_id);

std::vector<float> input_data;

@@ -198,13 +201,13 @@

// Pass the result back through the task's output pointer
*task.face_landmark_5of68 = postprocess(output.data());
*task.face_landmark_5of68 = postprocess(output.data(),thread_id);

task.completion_promise.set_value();
}


std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs) {
std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs,int thread_id) {
std::vector<cv::Point2f> landmarks;

for (int i = 0;i < 68; ++i)
@@ -215,15 +218,15 @@ std::vector<cv::Point2f> trt_face_68landmarks_mt::postprocess(float *trt_outputs
}

cv::Mat inverse_affine_matrix;
cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);

cv::transform(landmarks, landmarks, inverse_affine_matrix);

return face_utils::convert_face_landmark_68_to_5(landmarks);
}


void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68) {
void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id) {
std::vector<cv::Point2f> landmarks;

for (int i = 0;i < 68; ++i)
@@ -234,7 +237,7 @@ void trt_face_68landmarks_mt::postprocess(float *trt_outputs, std::vector<cv::Po
}

cv::Mat inverse_affine_matrix;
cv::invertAffineTransform(affine_matrix, inverse_affine_matrix);
cv::invertAffineTransform(affine_matrixs[thread_id], inverse_affine_matrix);

cv::transform(landmarks, landmarks, inverse_affine_matrix);

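The change in this file replaces the single shared `affine_matrix` with per-thread slots indexed by `thread_id`, so concurrent workers stop racing on one member. The pattern in isolation (a sketch, assuming thread ids are dense indices `0..num_threads-1`, as the constructor's `resize` suggests):

```cpp
#include <opencv2/core.hpp>
#include <vector>

// Sketch of the per-thread-state pattern: each worker owns one slot, so no
// locking is needed as long as every worker uses its own unique thread_id.
class PerThreadAffine {
public:
    explicit PerThreadAffine(size_t num_threads) : mats_(num_threads) {}

    void set(int thread_id, const cv::Mat& m) { mats_[thread_id] = m.clone(); }
    const cv::Mat& get(int thread_id) const  { return mats_[thread_id]; }

private:
    std::vector<cv::Mat> mats_;  // one affine matrix per worker thread
};
```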
11 changes: 6 additions & 5 deletions lite/trt/cv/trt_face_68landmarks_mt.h
@@ -65,17 +65,18 @@ class trt_face_68landmarks_mt {
// The actual inference function
void process_single_task(InferenceTask& task, int thread_id);

void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img);
void preprocess(const lite::types::Boxf &bouding_box,const cv::Mat &input_mat,cv::Mat &crop_img,int thread_id);

void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68);
void postprocess(float *trt_outputs, std::vector<cv::Point2f> &face_landmark_5of68,int thread_id);

std::vector<cv::Point2f> postprocess(float *trt_outputs);
std::vector<cv::Point2f> postprocess(float *trt_outputs,int thread_id);



private:
cv::Mat affine_matrix;
cv::Mat img_with_landmarks;
std::vector<cv::Mat> affine_matrixs;
std::vector<cv::Mat> img_with_landmarks_vec;


public:
explicit trt_face_68landmarks_mt(std::string& model_path, size_t num_threads = 4);
22 changes: 14 additions & 8 deletions lite/trt/cv/trt_face_recognizer.cpp
@@ -52,15 +52,21 @@ void TRTFaceFusionFaceRecognizer::detect(cv::Mat &input_mat, std::vector<cv::Poi
std::vector<float> normal_embeding(output.begin(),output.end());


float norm = 0.0f;
for (const auto &val : normal_embeding) {
norm += val * val;
}
norm = std::sqrt(norm);
launch_face_recognizer_postprocess(
static_cast<float*>(buffers[1]),
output_node_dims[0][0] * output_node_dims[0][1],
output.data()
);
// float norm = 0.0f;
// for (const auto &val : normal_embeding) {
// norm += val * val;
// }
// norm = std::sqrt(norm);
//
// for (auto &val : normal_embeding) {
// val /= norm;
// }

for (auto &val : normal_embeding) {
val /= norm;
}

std::cout<<"done!"<<std::endl;

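`launch_face_recognizer_postprocess` presumably performs on the GPU the same L2 normalization the deleted CPU loop did. A CPU reference implementation (a sketch, useful for validating the kernel's output against the old path):

```cpp
#include <cmath>
#include <vector>

// Reference for the normalization the CUDA kernel is assumed to perform:
// v <- v / ||v||_2, producing a unit-length embedding.
void l2_normalize(std::vector<float>& v) {
    float norm = 0.f;
    for (float x : v) norm += x * x;
    norm = std::sqrt(norm);
    if (norm > 0.f)
        for (float& x : v) x /= norm;
}
```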
1 change: 1 addition & 0 deletions lite/trt/cv/trt_face_recognizer.h
@@ -8,6 +8,7 @@
#include "lite/trt/core/trt_core.h"
#include "lite/trt/core/trt_utils.h"
#include "lite/trt/core/trt_types.h"
#include "lite/trt/kernel/face_recognizer_postprocess_manager.h"

namespace trtcv{
class LITE_EXPORTS TRTFaceFusionFaceRecognizer : BasicTRTHandler{
73 changes: 34 additions & 39 deletions lite/trt/cv/trt_face_restoration.cpp
@@ -11,42 +11,53 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<

cv::Mat crop_image;
cv::Mat affine_matrix;
std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,face_utils::FFHQ_512);
// Record preprocessing time
auto start_warp = std::chrono::high_resolution_clock::now();
std::tie(crop_image,affine_matrix) = face_utils::warp_face_by_face_landmark_5(face_swap_image,target_landmarks_5,
face_utils::FFHQ_512);

std::vector<float> crop_size = {512,512};
cv::Mat box_mask = face_utils::create_static_box_mask(crop_size);
std::vector<cv::Mat> crop_mask_list;
crop_mask_list.emplace_back(box_mask);

cv::cvtColor(crop_image,crop_image,cv::COLOR_BGR2RGB);
crop_image.convertTo(crop_image,CV_32FC3,1.f / 255.f);
crop_image.convertTo(crop_image,CV_32FC3,2.0f,-1.f);
cv::Mat crop_image_rgb;
launch_bgr2rgb(crop_image,crop_image_rgb);
crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,1.f / 255.f);
crop_image_rgb.convertTo(crop_image_rgb,CV_32FC3,2.0f,-1.f);

std::vector<float> input_vector;
trtcv::utils::transform::create_tensor(crop_image,input_vector,input_node_dims,trtcv::utils::transform::CHW);
trtcv::utils::transform::create_tensor(crop_image_rgb,input_vector,input_node_dims,trtcv::utils::transform::CHW);

// Copy
auto end_warp = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_warp = end_warp - start_warp;
std::cout << "FaceRestoration preprocess time: " << fp_ms_warp.count() << "ms" << std::endl;


// Record inference time
auto start = std::chrono::high_resolution_clock::now();
// Skip the intermediate copy: copy out only after processing, effectively running the whole postprocess on the GPU
cudaMemcpyAsync(buffers[0],input_vector.data(),1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyHostToDevice,stream);

// Synchronize
cudaStreamSynchronize(stream);

// Run inference
bool status = trt_context->enqueueV3(stream);

if (!status) {
std::cerr << "Failed to inference" << std::endl;
return;
}


// Synchronize
cudaStreamSynchronize(stream);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms = end - start;
std::cout << "FaceRestoration Inference time: " << fp_ms.count() << "ms" << std::endl;
std::vector<unsigned char> transposed_data(1 * 3 * 512 * 512);

// std::vector<float> transposed_data(1 * 3 * 512 * 512);

// Record postprocess time
auto start_postprocess = std::chrono::high_resolution_clock::now();
// buffers[1] holds the network output at this point
launch_face_restoration_postprocess(
static_cast<float*>(buffers[1]),
@@ -64,47 +75,31 @@ void TRTFaceFusionFaceRestoration::detect(cv::Mat &face_swap_image, std::vector<
std::vector<float> output_vector(1 * 3 * 512 * 512);
// cudaMemcpyAsync(output_vector.data(),buffers[1],1 * 3 * 512 * 512 * sizeof(float),cudaMemcpyDeviceToHost,stream);
cudaStreamSynchronize(stream);
// Postprocess
int channel = 3;
int height = 512;
int width = 512;
// std::vector<float> output(channel * height * width);
// output.assign(output_vector.begin(),output_vector.end());
//
// std::transform(output.begin(),output.end(),output.begin(),
// [](double x){return std::max(-1.0,std::max(-1.0,std::min(1.0,x)));});
//
// std::transform(output.begin(),output.end(),output.begin(),
// [](double x){return (x + 1.f) /2.f;});
//
// // CHW2HWC
// for (int c = 0; c < channel; ++c){
// for (int h = 0 ; h < height; ++h){
// for (int w = 0; w < width ; ++w){
// int src_index = c * (height * width) + h * width + w;
// int dst_index = h * (width * channel) + w * channel + c;
// transposed_data[dst_index] = output[src_index];
// }
// }
// }
//
// std::transform(transposed_data.begin(),transposed_data.end(),transposed_data.begin(),
// [](float x){return std::round(x * 255.f);});
//
// std::transform(transposed_data.begin(), transposed_data.end(), transposed_data.begin(),
// [](float x) { return static_cast<uint8_t>(x); });


cv::Mat mat(height, width, CV_32FC3, transposed_data_float.data());
// cv::imwrite("/home/lite.ai.toolkit/mid_process.jpg",mat);
cv::cvtColor(mat, mat, cv::COLOR_RGB2BGR);
// Essentially no time spent up to this point


auto crop_mask = crop_mask_list[0];
cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);

// paste_back takes roughly 40 ms here
cv::Mat paste_frame = launch_paste_back(ori_image,mat,crop_mask,affine_matrix);
// cv::Mat paste_frame = face_utils::paste_back(ori_image,mat,crop_mask,affine_matrix);
cv::Mat dst_image = face_utils::blend_frame(ori_image,paste_frame);
auto end_postprocess = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_postprocess = end_postprocess - start_postprocess;
std::cout << "FaceRestoration postprocess time: " << fp_ms_postprocess.count() << "ms" << std::endl;

// Record save time
auto start_save = std::chrono::high_resolution_clock::now();
cv::imwrite(face_enchaner_path,dst_image);
auto end_save = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> fp_ms_save = end_save - start_save;
std::cout << "FaceRestoration save time: " << fp_ms_save.count() << "ms" << std::endl;

}
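The two `convertTo` calls in the preprocess map pixels from [0,255] to [-1,1] (first x/255, then 2x-1), the input range GFPGAN-style restorers typically expect. The same mapping in a single pass (sketch):

```cpp
#include <opencv2/core.hpp>

// Equivalent one-pass normalization: x/255 followed by 2x-1 composes to
// x * (2/255) - 1, mapping [0,255] -> [-1,1].
void normalize_to_pm1(const cv::Mat& crop, cv::Mat& dst) {
    crop.convertTo(dst, CV_32FC3, 2.0 / 255.0, -1.0);
}
```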