diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 16bdfd0336..c040135c78 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -44,6 +44,7 @@ === Enhancements * Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].) +* Change outlier detection feature_influence format to array with nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].) === Bug Fixes diff --git a/lib/api/CDataFrameOutliersRunner.cc b/lib/api/CDataFrameOutliersRunner.cc index 6e28a35152..d785a2a5a7 100644 --- a/lib/api/CDataFrameOutliersRunner.cc +++ b/lib/api/CDataFrameOutliersRunner.cc @@ -52,7 +52,9 @@ const CDataFrameAnalysisConfigReader& parameterReader() { // Output const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"}; -const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."}; +const std::string FEATURE_NAME_FIELD_NAME{"feature_name"}; +const std::string FEATURE_INFLUENCE_FIELD_NAME{"feature_influence"}; +const std::string INFLUENCE_FIELD_NAME{"influence"}; } CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, @@ -93,11 +95,19 @@ void CDataFrameOutliersRunner::writeOneRow(const core::CDataFrame& frame, writer.StartObject(); writer.Key(OUTLIER_SCORE_FIELD_NAME); writer.Double(row[scoreColumn]); - if (row[scoreColumn] > m_FeatureInfluenceThreshold) { + if (row[scoreColumn] > m_FeatureInfluenceThreshold && numberFeatureScoreColumns > 0) { + writer.Key(FEATURE_INFLUENCE_FIELD_NAME); + writer.StartArray(); + for (std::size_t i = 0; i < numberFeatureScoreColumns; ++i) { - writer.Key(FEATURE_INFLUENCE_FIELD_NAME_PREFIX + frame.columnNames()[i]); + writer.StartObject(); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(frame.columnNames()[i]); + writer.Key(INFLUENCE_FIELD_NAME); writer.Double(row[beginFeatureScoreColumns + i]); + writer.EndObject(); } + writer.EndArray(); } writer.EndObject(); } diff --git a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc index 564ee5f0c3..020174ab7b 100644 --- a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc @@ -285,8 +285,7 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) { TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; - TStrVec expectedNames{"feature_influence.c1", "feature_influence.c2", "feature_influence.c3", - "feature_influence.c4", "feature_influence.c5"}; + TStrVec expectedNames{"c1", "c2", "c3", "c4", "c5"}; TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -301,12 +300,19 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) { auto expectedFeatureInfluence = expectedFeatureInfluences.begin(); for (const auto& result : results.GetArray()) { if (result.HasMember("row_results")) { + BOOST_TEST_REQUIRE(expectedFeatureInfluence != expectedFeatureInfluences.end()); - for (std::size_t i = 0; i < 5; ++i) { + for (int i = 0; i < 5; ++i) { + BOOST_REQUIRE_EQUAL( + expectedNames[i].c_str(), + result["row_results"]["results"]["ml"]["feature_influence"][i]["feature_name"] + .GetString()); + BOOST_REQUIRE_CLOSE_ABSOLUTE( (*expectedFeatureInfluence)[i], - result["row_results"]["results"]["ml"][expectedNames[i]].GetDouble(), + result["row_results"]["results"]["ml"]["feature_influence"][i]["influence"] + .GetDouble(), 1e-4 * (*expectedFeatureInfluence)[i]); } ++expectedFeatureInfluence;