// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2020 Intel Corporation. All Rights Reserved.
// NOTE: This file will be compiled only with INTEL_OPENVINO_DIR pointing to an OpenVINO install!
#include "post-processing-filters-list.h"
#include "post-processing-worker-filter.h"
#include <rs-vino/object-detection.h>
#include <rs-vino/age-gender-detection.h>
#include <rs-vino/detected-object.h>
#include <cv-helpers.hpp>
namespace openvino = InferenceEngine;
/* We need to extend the basic detected_object to include facial characteristics
*/
class detected_face : public openvino_helpers::detected_object
{
float _age;
float _male_score, _female_score; // cumulative - see update_gender()
public:
using ptr = std::shared_ptr< detected_face >;
explicit detected_face( size_t id,
cv::Rect const& location,
float male_prob,
float age,
cv::Rect const& depth_location = cv::Rect{},
float intensity = 1,
float depth = 0 )
: detected_object( id, std::string(), location, depth_location, intensity, depth )
, _age( age )
, _male_score( male_prob > 0.5f ? male_prob - 0.5f : 0.f )
, _female_score( male_prob > 0.5f ? 0.f : 0.5f - male_prob )
{
}
void update_age( float value )
{
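// Exponential moving average: keep 95% of the previous estimate so the reported age
// stays stable from frame to frame (a value of -1 means "no estimate yet")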
_age = (_age == -1) ? value : 0.95f * _age + 0.05f * value;
}
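// Accumulate gender evidence over time: each frame's probability adds to whichever score
// it favors, so a single noisy classification is unlikely to flip the reported gender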
void update_gender( float value )
{
if( value >= 0 )
{
if( value > 0.5 )
_male_score += value - 0.5f;
else
_female_score += 0.5f - value;
}
}
int get_age() const { return static_cast< int >( std::floor( _age + 0.5f )); }
bool is_male() const { return( _male_score > _female_score ); }
bool is_female() const { return !is_male(); }
};
/* Define a filter that will perform facial detection using OpenVINO
*/
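// The filter runs face detection on every color frame in a background worker; age/gender
// recognition is run only when a face is first seen or has changed significantly, and the
// results are published to the viewer as labeled bounding boxes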
class openvino_face_detection : public post_processing_worker_filter
{
InferenceEngine::Core _ie;
openvino_helpers::object_detection _face_detector;
openvino_helpers::age_gender_detection _age_detector;
openvino_helpers::detected_objects _faces;
size_t _id = 0;
std::shared_ptr< atomic_objects_in_frame > _objects;
public:
openvino_face_detection( std::string const & name )
: post_processing_worker_filter( name )
/*
This face detector is from the OpenCV Model Zoo:
https://github.com/opencv/open_model_zoo/blob/master/models/intel/face-detection-adas-0001/description/face-detection-adas-0001.md
*/
, _face_detector(
"face-detection-adas-0001.xml",
0.5, // Probability threshold
false ) // Not async
/*
The age/gender recognition network is likewise from the Open Model Zoo
(age-gender-recognition-retail-0013)
*/
, _age_detector(
"age-gender-recognition-retail-0013.xml",
false ) // Not async
{
}
~openvino_face_detection()
{
// Complete the background worker to ensure it releases the instance's resources in a controlled manner
release_background_worker();
}
public:
void start( rs2::subdevice_model & model ) override
{
post_processing_worker_filter::start( model );
_objects = model.detected_objects;
}
private:
void worker_start() override
{
LOG(INFO) << "Loading CPU extensions...";
std::string const device_name{ "CPU" };
#ifdef OPENVINO2019
_ie.AddExtension(std::make_shared< openvino::Extensions::Cpu::CpuExtensions >(), device_name);
#endif
_face_detector.load_into( _ie, device_name);
_age_detector.load_into( _ie, device_name);
}
/*
Returns the "intensity" of the face in the picture, and calculates the distance to it, ignoring
invalid depth pixels or those outside a range that would be appropriate for a face.
*/
static float calc_face_attrs(
const rs2::video_frame & cf,
const rs2::depth_frame & df,
cv::Rect const & depth_bbox,
float * p_mean_depth )
{
uint16_t const * const pdw = reinterpret_cast<const uint16_t*>( df.get_data() );
uint8_t const * const pcb = reinterpret_cast< const uint8_t * >( cf.get_data() );
float const depth_scale = df.get_units();
int const depth_width = df.get_width();
int const color_width = cf.get_width();
int const color_bpp = cf.get_bytes_per_pixel();
int const top = depth_bbox.y;
int const bot = top + depth_bbox.height;
int const left = depth_bbox.x;
int const right = left + depth_bbox.width;
// Find a center point that has a depth on it
int center_x = (left + right) / 2;
int center_index = (top + bot) / 2 * depth_width + center_x;
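// If the exact center has no depth reading, probe a few neighboring pixels (up to 10 steps
// left/right, and one row up/down per step) until we find one that does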
for( int d = 1; !pdw[center_index] && d < 10; ++d )
{
if( pdw[center_index + d] ) center_index += d;
if( pdw[center_index - d] ) center_index -= d;
if( pdw[center_index + depth_width] ) center_index += depth_width;
if( pdw[center_index - depth_width] ) center_index -= depth_width;
}
if( !pdw[center_index] )
{
if( p_mean_depth )
*p_mean_depth = 0;
return 1;
}
float const d_center = pdw[center_index] * depth_scale;
// Set a "near" and "far" threshold -- anything closer or farther, respectively,
// would be deemed not a part of the face and therefore background:
float const d_far_threshold = d_center + 0.2f;
float const d_near_threshold = std::max( d_center - 0.5f, 0.001f );
// Average human head diameter ~= 7.5" or ~19cm
// Assume that the center point is on the front of the face, so the near threshold
// should be fairly close to it, while the far threshold extends farther behind it...
float total_luminance = 0;
float total_depth = 0;
unsigned pixel_count = 0;
#pragma omp parallel for schedule(dynamic) reduction(+:total_luminance,total_depth,pixel_count) // Use OpenMP to parallelize the loop; the reduction clause avoids data races on the accumulators
for( int y = top; y < bot; ++y )
{
auto depth_pixel_index = y * depth_width + left;
for( int x = left; x < right; ++x, ++depth_pixel_index )
{
// Get the depth value of the current pixel
auto d = depth_scale * pdw[depth_pixel_index];
// Only count pixels whose depth falls within the near/far thresholds (invalid pixels have depth 0 and are skipped)
if( d >= d_near_threshold && d <= d_far_threshold )
{
// Calculate the offset into the color frame's buffer for the current pixel
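// (as written, this indexing assumes the depth and color frames share the same resolution,
// so the same pixel index maps into both buffers)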
auto const coffset = depth_pixel_index * color_bpp;
auto const pc = &pcb[coffset];
// Using RGB...
auto r = pc[0], g = pc[1], b = pc[2];
total_luminance += 0.2989f * r + 0.5870f * g + 0.1140f * b; // CCIR 601 -- see https://en.wikipedia.org/wiki/Luma_(video)
++pixel_count;
// And get a mean depth, too
total_depth += d;
}
}
}
if( p_mean_depth )
*p_mean_depth = pixel_count ? total_depth / pixel_count : 0;
return pixel_count ? total_luminance / pixel_count : 1;
}
void worker_body( rs2::frameset fs ) override
{
// A color video frame is the minimum we need for detection
auto cf = fs.get_color_frame();
if( !cf )
{
LOG(ERROR) << get_context( fs ) << "no color frame";
return;
}
if( cf.get_profile().format() != RS2_FORMAT_RGB8 )
{
LOG(ERROR) << get_context(fs) << "color format must be RGB8; it's " << cf.get_profile().format();
return;
}
// A depth frame is optional: if not enabled, we won't get it, and we simply won't provide depth info...
auto df = fs.get_depth_frame();
if( df )
{
if( df.get_profile().format() != RS2_FORMAT_Z16 )
{
LOG(ERROR) << get_context(fs) << "depth format must be Z16; it's " << df.get_profile().format();
return;
}
}
try
{
rs2_intrinsics color_intrin, depth_intrin;
rs2_extrinsics color_extrin, depth_extrin;
get_trinsics( cf, df, color_intrin, depth_intrin, color_extrin, depth_extrin );
objects_in_frame objects;
cv::Mat image( color_intrin.height, color_intrin.width, CV_8UC3, const_cast<void *>(cf.get_data()), cv::Mat::AUTO_STEP );
_face_detector.enqueue( image );
_face_detector.submit_request();
auto results = _face_detector.fetch_results();
openvino_helpers::detected_objects prev_faces { std::move( _faces ) };
_faces.clear();
for( auto && result : results )
{
cv::Rect rect = result.location & cv::Rect( 0, 0, image.cols, image.rows );
detected_face::ptr face = std::dynamic_pointer_cast< detected_face >(
openvino_helpers::find_object( rect, prev_faces ));
try
{
// Use a mean of the face intensity to help identify faces -- if the intensity changes too much,
// it's not the same face...
float depth = 0, intensity = 1;
cv::Rect depth_rect;
if( df )
{
rs2::rect depth_bbox = project_rect_to_depth(
rs2::rect { float( rect.x ), float( rect.y ), float( rect.width ), float( rect.height ) },
df,
color_intrin, depth_intrin, color_extrin, depth_extrin
);
// It is possible to get back an invalid rect!
if( depth_bbox == depth_bbox.intersection( rs2::rect { 0.f, 0.f, float( depth_intrin.width ), float( depth_intrin.height) } ) )
{
depth_rect = cv::Rect( int( depth_bbox.x ), int( depth_bbox.y ), int( depth_bbox.w ), int( depth_bbox.h ) );
intensity = calc_face_attrs( cf, df, depth_rect, &depth );
}
else
{
LOG(DEBUG) << get_context(fs) << "depth_bbox is no good!";
}
}
else
{
intensity = openvino_helpers::calc_intensity( image( rect ) );
}
float intensity_change = face ? std::abs( intensity - face->get_intensity() ) / face->get_intensity() : 1;
float depth_change = ( face && face->get_depth() ) ? std::abs( depth - face->get_depth() ) / face->get_depth() : 0;
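// A large jump in intensity or depth means this is probably not the face we were tracking,
// so treat it as a new face and (re)run the age/gender network on it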
if( intensity_change > 0.07f || depth_change > 0.2f )
{
// Figure out the age for this new face
float age = 0, maleProb = 0.5;
// Enlarge the bounding box around the detected face for more robust operation of face analytics networks
cv::Mat face_image = image(
openvino_helpers::adjust_face_bbox( rect, 1.4f )
& cv::Rect( 0, 0, image.cols, image.rows ) );
_age_detector.enqueue( face_image );
_age_detector.submit_request();
_age_detector.wait();
auto age_gender = _age_detector[0];
age = age_gender.age;
maleProb = age_gender.maleProb;
// Note: we may want to update the gender/age for each frame, as it may change...
face = std::make_shared< detected_face >( _id++, rect, maleProb, age, depth_rect, intensity, depth );
}
else
{
face->move( rect, depth_rect, intensity, depth );
}
_faces.push_back( face );
}
catch( ... )
{
LOG(ERROR) << get_context(fs) << "Unhandled exception!!!";
}
}
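// Publish the tracked faces: each becomes a labeled object whose bounding boxes are
// normalized to the color frame and, if depth is available, to the depth frame as well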
for( auto && object : _faces )
{
auto face = std::dynamic_pointer_cast<detected_face>( object );
cv::Rect const & loc = face->get_location();
rs2::rect bbox { float( loc.x ), float( loc.y ), float( loc.width ), float( loc.height ) };
rs2::rect normalized_color_bbox = bbox.normalize( rs2::rect { 0, 0, float(color_intrin.width), float(color_intrin.height) } );
rs2::rect normalized_depth_bbox = normalized_color_bbox;
if( df )
{
cv::Rect const & depth_loc = face->get_depth_location();
rs2::rect depth_bbox { float( depth_loc.x ), float( depth_loc.y ), float( depth_loc.width ), float( depth_loc.height ) };
normalized_depth_bbox = depth_bbox.normalize( rs2::rect { 0, 0, float( df.get_width() ), float( df.get_height() ) } );
}
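// The label is a gender glyph (\uF183 for male, \uF182 for female) followed by the estimated age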
objects.emplace_back(
face->get_id(),
rs2::to_string() << (face->is_male() ? u8"\uF183" : u8"\uF182") << " " << face->get_age(),
normalized_color_bbox,
normalized_depth_bbox,
face->get_depth()
);
}
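// Hand the results over under the shared mutex; if the processing block has been disabled
// in the meantime, clear any previously published objects instead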
std::lock_guard< std::mutex > lock( _objects->mutex );
if( is_pb_enabled() )
{
if( _objects->sensor_is_on )
_objects->swap( objects );
}
else
{
_objects->clear();
}
}
catch( const std::exception & e )
{
LOG(ERROR) << get_context(fs) << e.what();
}
catch( ... )
{
LOG(ERROR) << get_context(fs) << "Unhandled exception caught in openvino_face_detection";
}
}
void on_processing_block_enable( bool e ) override
{
post_processing_worker_filter::on_processing_block_enable( e );
if( !e )
{
// Make sure all the objects go away!
std::lock_guard< std::mutex > lock( _objects->mutex );
_objects->clear();
}
}
};
static auto it = post_processing_filters_list::register_filter< openvino_face_detection >( "Face Detection : OpenVINO" );