From 684a022ed1dd5966c950fb9aac0549b352c6e048 Mon Sep 17 00:00:00 2001 From: Flor Elisa Chacon Ochoa Date: Mon, 23 May 2022 19:38:22 -0700 Subject: [PATCH 1/4] Improve ARP matching heuristic --- src/AppInstallerCLITests/ARPChanges.cpp | 1 + src/AppInstallerCLITests/Correlation.cpp | 28 ++- .../TestData/InputARPData.txt | 3 +- .../AppInstallerStrings.cpp | 29 +++ .../NameNormalization.cpp | 31 ++- .../Public/AppInstallerStrings.h | 3 + .../Public/winget/NameNormalization.h | 1 + .../ARPCorrelation.cpp | 139 +----------- .../ARPCorrelationAlgorithms.cpp | 197 ++++++++++++++++++ .../AppInstallerRepositoryCore.vcxproj | 2 + ...AppInstallerRepositoryCore.vcxproj.filters | 6 + .../Public/winget/ARPCorrelation.h | 24 --- .../Public/winget/ARPCorrelationAlgorithms.h | 62 ++++++ 13 files changed, 343 insertions(+), 183 deletions(-) create mode 100644 src/AppInstallerRepositoryCore/ARPCorrelationAlgorithms.cpp create mode 100644 src/AppInstallerRepositoryCore/Public/winget/ARPCorrelationAlgorithms.h diff --git a/src/AppInstallerCLITests/ARPChanges.cpp b/src/AppInstallerCLITests/ARPChanges.cpp index 6111bad98a..fe980d1841 100644 --- a/src/AppInstallerCLITests/ARPChanges.cpp +++ b/src/AppInstallerCLITests/ARPChanges.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include using namespace TestCommon; diff --git a/src/AppInstallerCLITests/Correlation.cpp b/src/AppInstallerCLITests/Correlation.cpp index 859fb076b8..ec06ae0337 100644 --- a/src/AppInstallerCLITests/Correlation.cpp +++ b/src/AppInstallerCLITests/Correlation.cpp @@ -5,6 +5,7 @@ #include "TestSource.h" #include +#include #include #include @@ -76,13 +77,18 @@ Manifest GetManifestFromTestCase(const TestCase& testCase) return manifest; } -ARPEntry GetARPEntryFromTestCase(const TestCase& testCase) +ARPEntry GetARPEntryFromTestCase(const TestCase& testCase, bool isNew) { Manifest arpManifest; arpManifest.DefaultLocalization.Add(testCase.ARPName); arpManifest.DefaultLocalization.Add(testCase.ARPPublisher); arpManifest.Localizations.push_back(arpManifest.DefaultLocalization); - return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), false }; + return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), isNew }; +} + +ARPEntry GetExistingARPEntryFromTestCase(const TestCase& testCase) +{ + return GetARPEntryFromTestCase(testCase, /* isNew */ false); } void ReportMatch(std::string_view label, std::string_view appName, std::string_view appPublisher, std::string_view arpName, std::string_view arpPublisher) @@ -105,7 +111,7 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf for (const auto& testCase : dataSet.TestCases) { - arpEntries.push_back(GetARPEntryFromTestCase(testCase)); + arpEntries.push_back(GetARPEntryFromTestCase(testCase, /* isNew */ true)); auto match = FindARPEntryForNewlyInstalledPackageWithHeuristics(GetManifestFromTestCase(testCase), arpEntries, correlationAlgorithm); arpEntries.pop_back(); @@ -225,10 +231,10 @@ DataSet GetDataSet_NoNoise() dataSet.TestCases = LoadTestData(); // Arbitrary values. We should refine them as the algorithm gets better. - dataSet.RequiredTrueMatchRatio = 0.7; - dataSet.RequiredFalseMatchRatio = 0.05; + dataSet.RequiredTrueMatchRatio = 0.75; + dataSet.RequiredFalseMatchRatio = 0; dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set - dataSet.RequiredFalseMismatchRatio = 0.3; + dataSet.RequiredFalseMismatchRatio = 0.25; return dataSet; } @@ -238,14 +244,14 @@ DataSet GetDataSet_WithNoise() DataSet dataSet; auto baseTestCases = LoadTestData(); - std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetARPEntryFromTestCase); + std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetExistingARPEntryFromTestCase); dataSet.TestCases = std::move(baseTestCases); // Arbitrary values. We should refine them as the algorithm gets better. - dataSet.RequiredTrueMatchRatio = 0.7; - dataSet.RequiredFalseMatchRatio = 0.05; + dataSet.RequiredTrueMatchRatio = 0.75; + dataSet.RequiredFalseMatchRatio = 0; // This should always stay at 0 dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set - dataSet.RequiredFalseMismatchRatio = 0.3; + dataSet.RequiredFalseMismatchRatio = 0.25; return dataSet; } @@ -256,7 +262,7 @@ DataSet GetDataSet_WithNoise() // performs well. TEMPLATE_TEST_CASE("Correlation_MeasureAlgorithmPerformance", "[correlation][.]", EmptyMatchConfidenceAlgorithm, - EditDistanceMatchConfidenceAlgorithm) + WordsEditDistanceMatchConfidenceAlgorithm) { // Each section loads a different data set, // and then they are all handled the same diff --git a/src/AppInstallerCLITests/TestData/InputARPData.txt b/src/AppInstallerCLITests/TestData/InputARPData.txt index fd5cb8f417..e28cb36a99 100644 --- a/src/AppInstallerCLITests/TestData/InputARPData.txt +++ b/src/AppInstallerCLITests/TestData/InputARPData.txt @@ -108,4 +108,5 @@ XPFP42J061BPC1|Documenti Lavori Cantiere|Esposito Software di M. G. Caputo|Docum XPFPFN4LT21PZJ|Studio Dentistico Pro|Esposito Software di M. G. Caputo|Studio Dentistico Pro Demo||Copyright Esposito Software|Studio Dentistico Pro Demo_is1 XPFPFWMVTR0WHP|Ashampoo UnInstaller 11|Ashampoo|Ashampoo UnInstaller 11|11.00.12|Ashampoo GmbH & Co. KG|{4209F371-B84B-F321-6BD3-1D91E2505732}_is1 XPFPFWV5JD80K2|BeeCut|网旭科技|BeeCut V1.7.7.22|1.7.7.22|Apowersoft LIMITED|{CA76BFA8-1862-49D7-B2C7-AE3D6CF40E53}_is1 -XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349} \ No newline at end of file +XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349} +XP8CDJNZKFM06W|Visual Studio Community 2019|Microsoft Corporation|Microsoft Visual Studio Installer|3.1.2196.8931|Microsoft Corporation|{6F320B93-EE3C-4826-85E0-ADF79F8D4C61} \ No newline at end of file diff --git a/src/AppInstallerCommonCore/AppInstallerStrings.cpp b/src/AppInstallerCommonCore/AppInstallerStrings.cpp index 984bfdb27c..9d1e6cd9e4 100644 --- a/src/AppInstallerCommonCore/AppInstallerStrings.cpp +++ b/src/AppInstallerCommonCore/AppInstallerStrings.cpp @@ -87,6 +87,12 @@ namespace AppInstaller::Utility return utext_char32At(m_text.get(), m_currentBrk); } + // Returns the status from the break rule that determined the most recently break position. + int32_t CurrentRuleStatus() + { + return ubrk_getRuleStatus(m_brk.get()); + } + private: wil::unique_any m_text; wil::unique_any m_brk; @@ -628,4 +634,27 @@ namespace AppInstaller::Utility return path.filename(); } + + std::vector SplitIntoWords(std::string_view input) + { + ICUBreakIterator itr{ input, UBRK_WORD }; + std::size_t currentOffset = 0; + + std::vector result; + while (itr.Next() != UBRK_DONE) + { + std::size_t nextOffset = itr.CurrentOffset(); + + // Ignore spaces and punctuation, accept words and numbers + if (itr.CurrentRuleStatus() != UBRK_WORD_NONE) + { + auto wordSize = nextOffset - currentOffset; + result.emplace_back(input, currentOffset, wordSize); + } + + currentOffset = nextOffset; + } + + return result; + } } diff --git a/src/AppInstallerCommonCore/NameNormalization.cpp b/src/AppInstallerCommonCore/NameNormalization.cpp index 63c9e0ff4e..dbbd2334c9 100644 --- a/src/AppInstallerCommonCore/NameNormalization.cpp +++ b/src/AppInstallerCommonCore/NameNormalization.cpp @@ -347,6 +347,8 @@ namespace AppInstaller::Utility // The folded and sorted version of LocaleViews. const std::vector LegalEntitySuffixes; + const bool PreserveWhiteSpace; + static std::vector FoldAndSort(const std::vector& input) { std::vector result; @@ -376,11 +378,14 @@ namespace AppInstaller::Utility // Repeatedly remove matches for the regexes to create the minimum name while (RemoveAll(ProgramNameRegexes, result.Name)); - auto tokens = Split(ProgramNameSplit, result.Name, LegalEntitySuffixes); - result.Name = Join(tokens); + if (!PreserveWhiteSpace) + { + auto tokens = Split(ProgramNameSplit, result.Name, LegalEntitySuffixes); + result.Name = Join(tokens); - // Drop all undesired characters - Remove(NonLettersAndDigits, result.Name); + // Drop all undesired characters + Remove(NonLettersAndDigits, result.Name); + } return result; } @@ -394,17 +399,20 @@ namespace AppInstaller::Utility while (RemoveAll(PublisherNameRegexes, result.Publisher)); - auto tokens = Split(PublisherNameSplit, result.Publisher, LegalEntitySuffixes, true); - result.Publisher = Join(tokens); + if (!PreserveWhiteSpace) + { + auto tokens = Split(PublisherNameSplit, result.Publisher, LegalEntitySuffixes, true); + result.Publisher = Join(tokens); - // Drop all undesired characters - Remove(NonLettersAndDigits, result.Publisher); + // Drop all undesired characters + Remove(NonLettersAndDigits, result.Publisher); + } return result; } public: - NormalizationInitial() : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews)) + NormalizationInitial(bool preserveWhiteSpace) : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews)), PreserveWhiteSpace(preserveWhiteSpace) { } @@ -448,7 +456,10 @@ namespace AppInstaller::Utility switch (version) { case AppInstaller::Utility::NormalizationVersion::Initial: - m_normalizer = std::make_unique(); + m_normalizer = std::make_unique(false); + break; + case AppInstaller::Utility::NormalizationVersion::InitialPreserveWhiteSpace: + m_normalizer = std::make_unique(true); break; default: THROW_HR(E_INVALIDARG); diff --git a/src/AppInstallerCommonCore/Public/AppInstallerStrings.h b/src/AppInstallerCommonCore/Public/AppInstallerStrings.h index dd2f09c6e1..d8e4f0216d 100644 --- a/src/AppInstallerCommonCore/Public/AppInstallerStrings.h +++ b/src/AppInstallerCommonCore/Public/AppInstallerStrings.h @@ -149,6 +149,9 @@ namespace AppInstaller::Utility // Gets the file name part of the given URI. std::filesystem::path GetFileNameFromURI(std::string_view uri); + // Splits the string into words. + std::vector SplitIntoWords(std::string_view input); + // Converts a container to a string representation of it. template std::string ConvertContainerToString(const T& container, U toString) diff --git a/src/AppInstallerCommonCore/Public/winget/NameNormalization.h b/src/AppInstallerCommonCore/Public/winget/NameNormalization.h index 2c2c4cf8dd..d53b0cbdfc 100644 --- a/src/AppInstallerCommonCore/Public/winget/NameNormalization.h +++ b/src/AppInstallerCommonCore/Public/winget/NameNormalization.h @@ -13,6 +13,7 @@ namespace AppInstaller::Utility enum class NormalizationVersion { Initial, + InitialPreserveWhiteSpace, }; struct NameNormalizer; diff --git a/src/AppInstallerRepositoryCore/ARPCorrelation.cpp b/src/AppInstallerRepositoryCore/ARPCorrelation.cpp index 7a1e54a7c9..5b81e4f96a 100644 --- a/src/AppInstallerRepositoryCore/ARPCorrelation.cpp +++ b/src/AppInstallerRepositoryCore/ARPCorrelation.cpp @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "pch.h" #include "winget/ARPCorrelation.h" +#include "winget/ARPCorrelationAlgorithms.h" #include "winget/Manifest.h" #include "winget/NameNormalization.h" #include "winget/RepositorySearch.h" @@ -19,7 +20,7 @@ namespace AppInstaller::Repository::Correlation IARPMatchConfidenceAlgorithm& InstanceInternal(std::optional algorithmOverride = {}) { - static EditDistanceMatchConfidenceAlgorithm s_algorithm; + static WordsEditDistanceMatchConfidenceAlgorithm s_algorithm; static IARPMatchConfidenceAlgorithm* s_override = nullptr; if (algorithmOverride.has_value()) @@ -36,69 +37,6 @@ namespace AppInstaller::Repository::Correlation return s_algorithm; } } - - // A simple matrix class to hold the edit distance table without having to allocate multiple arrays. - struct Matrix - { - Matrix(size_t rows, size_t columns) : m_rows(rows), m_columns(columns), m_data(rows * columns) {} - - double& At(size_t i, size_t j) - { - return m_data[i * m_columns + j]; - } - - private: - size_t m_rows; - size_t m_columns; - std::vector m_data; - }; - - double EditDistanceScore(std::u32string_view sv1, std::u32string_view sv2) - { - // Naive implementation of edit distance (scaled over the string size) - - // We may have empty values coming from the ARP - if (sv1.empty() || sv2.empty()) - { - return 0; - } - - // distance[i, j] = distance between sv1[0:i] and sv2[0:j] - // We don't need to hold more than two rows at a time, but it's simpler to keep the whole table. - Matrix distance(sv1.size(), sv2.size()); - - for (size_t i = 0; i < sv1.size(); ++i) - { - for (size_t j = 0; j < sv2.size(); ++j) - { - double& d = distance.At(i, j); - if (i == 0) - { - d = static_cast(j); - } - else if (j == 0) - { - d = static_cast(i); - } - else if (sv1[i] == sv2[j]) - { - d = distance.At(i - 1, j - 1); - } - else - { - d = std::min( - 1 + distance.At(i - 1, j - 1), - 1 + std::min(distance.At(i, j - 1), distance.At(i - 1, j))); - } - } - } - - // Maximum distance is equal to the length of the longest string. - // We use that to scale to [0,1]. - // A smaller distance represents a higher match, so we subtract from 1 for the final score - double editDistance = distance.At(sv1.size() - 1, sv2.size() - 1); - return 1 - editDistance / std::max(sv1.size(), sv2.size()); - } } IARPMatchConfidenceAlgorithm& IARPMatchConfidenceAlgorithm::Instance() @@ -118,79 +56,6 @@ namespace AppInstaller::Repository::Correlation } #endif - std::u32string EditDistanceMatchConfidenceAlgorithm::PrepareString(std::string_view s) const - { - return Utility::ConvertToUTF32(Utility::FoldCase(s)); - } - - std::u32string EditDistanceMatchConfidenceAlgorithm::NormalizeAndPrepareName(std::string_view name) const - { - return PrepareString(m_normalizer.NormalizeName(name).Name()); - } - - std::u32string EditDistanceMatchConfidenceAlgorithm::NormalizeAndPreparePublisher(std::string_view publisher) const - { - return PrepareString(m_normalizer.NormalizePublisher(publisher)); - } - - void EditDistanceMatchConfidenceAlgorithm::Init(const Manifest::Manifest& manifest) - { - // We will use the name and publisher from each localization. - m_namesAndPublishers.clear(); - - std::u32string defaultPublisher; - if (manifest.DefaultLocalization.Contains(Localization::Publisher)) - { - defaultPublisher = NormalizeAndPreparePublisher(manifest.DefaultLocalization.Get()); - } - - if (manifest.DefaultLocalization.Contains(Localization::PackageName)) - { - std::u32string defaultName = NormalizeAndPrepareName(manifest.DefaultLocalization.Get()); - m_namesAndPublishers.emplace_back(defaultName, defaultPublisher, defaultName + defaultPublisher); - - for (const auto& loc : manifest.Localizations) - { - if (loc.Contains(Localization::PackageName) || loc.Contains(Localization::Publisher)) - { - auto name = loc.Contains(Localization::PackageName) ? NormalizeAndPrepareName(loc.Get()) : defaultName; - auto publisher = loc.Contains(Localization::Publisher) ? NormalizeAndPreparePublisher(loc.Get()) : defaultPublisher; - auto nameAndPublisher = publisher + name; - - m_namesAndPublishers.emplace_back(std::move(name), std::move(publisher), std::move(nameAndPublisher)); - } - } - } - } - - double EditDistanceMatchConfidenceAlgorithm::ComputeConfidence(const ARPEntry& arpEntry) const - { - // Name and Publisher are available as multi properties, but for ARP entries there will only be 0 or 1 values. - auto arpName = NormalizeAndPrepareName(arpEntry.Entry->GetInstalledVersion()->GetProperty(PackageVersionProperty::Name).get()); - auto arpPublisher = NormalizeAndPreparePublisher(arpEntry.Entry->GetInstalledVersion()->GetProperty(PackageVersionProperty::Publisher).get()); - auto arpNamePublisher = arpPublisher + arpName; - - // Get the best score across all localizations - double bestMatchingScore = 0; - for (const auto& manifestNameAndPublisher : m_namesAndPublishers) - { - // Sometimes the publisher may be included in the name, for example Microsoft PowerToys as opposed to simply PowerToys. - // This may happen both in the ARP entry and the manifest. We try adding it in case it is in one but not in both. - auto nameDistance = std::max( - EditDistanceScore(std::get<0>(manifestNameAndPublisher), arpName), - std::max( - EditDistanceScore(std::get<2>(manifestNameAndPublisher), arpName), - EditDistanceScore(std::get<0>(manifestNameAndPublisher), arpNamePublisher))); - auto publisherDistance = EditDistanceScore(std::get<1>(manifestNameAndPublisher), arpPublisher); - - // TODO: Consider other ways of merging the two values - auto score = (2 * nameDistance + publisherDistance) / 3; - bestMatchingScore = std::max(bestMatchingScore, score); - } - - return bestMatchingScore; - } - ARPCorrelationResult FindARPEntryForNewlyInstalledPackage( const Manifest::Manifest& manifest, const std::vector& arpSnapshot, diff --git a/src/AppInstallerRepositoryCore/ARPCorrelationAlgorithms.cpp b/src/AppInstallerRepositoryCore/ARPCorrelationAlgorithms.cpp new file mode 100644 index 0000000000..f9aacdf996 --- /dev/null +++ b/src/AppInstallerRepositoryCore/ARPCorrelationAlgorithms.cpp @@ -0,0 +1,197 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +#include "pch.h" +#include "winget/ARPCorrelationAlgorithms.h" + +using namespace AppInstaller::Manifest; +using namespace AppInstaller::Repository; +using namespace AppInstaller::Utility; + +namespace AppInstaller::Repository::Correlation +{ + using WordSequence = WordsEditDistanceMatchConfidenceAlgorithm::WordSequence; + + namespace + { + // A simple matrix class to hold score tables without having to allocate multiple arrays. + struct Matrix + { + Matrix(size_t rows, size_t columns) : m_rows(rows), m_columns(columns), m_data(rows* columns) {} + + double& At(size_t i, size_t j) + { + return m_data[i * m_columns + j]; + } + + private: + size_t m_rows; + size_t m_columns; + std::vector m_data; + }; + + double EditDistanceScore(const std::vector& s1, const std::vector& s2) + { + // Naive implementation of edit distance (scaled over the sequence size). + // This considers only the operations of adding and removing elements. + + if (s1.empty() || s2.empty()) + { + return 0; + } + + // distance[i, j] = distance between s1[0:i] and s2[0:j] + // We don't need to hold more than two rows at a time, but it's simpler to keep the whole table. + Matrix distance(s1.size() + 1, s2.size() + 1); + + for (size_t i = 0; i < s1.size(); ++i) + { + for (size_t j = 0; j < s2.size(); ++j) + { + double& d = distance.At(i, j); + if (s1[i] == s2[j]) + { + // If the two elements are equal, the distance is the same as from one element before. + // In case we are on the first element of one of the two sequences, the distance is + // equal to the cost of adding all the previous elements in the other + if (i == 0) + { + d = static_cast(j); + } + else if (j == 0) + { + d = static_cast(i); + } + else + { + d = distance.At(i - 1, j - 1); + } + } + else + { + // If the two elements are distinct, the score is the cost of removing the last element + // in one sequence plus the cost of editing the remainder of both. + if (i > 0 && j > 0) + { + d = 1 + std::min(distance.At(i - 1, j), distance.At(i, j - 1)); + } + else if (i > 0) + { + d = 1 + distance.At(i - 1, j); + } + else if (j > 0) + { + d = 1 + distance.At(i, j - 1); + } + else + { + // Remove one and add the other + d = 2; + } + } + } + } + + // Maximum distance is equal to the sum of both lengths (removing all elements from one and adding all the elements from the other). + // We use that to scale to [0,1]. + // A smaller distance represents a higher match, so we subtract from 1 for the final score + double editDistance = distance.At(s1.size() - 1, s2.size() - 1); + return 1 - editDistance / (s1.size() + s2.size()); + } + } + + WordsEditDistanceMatchConfidenceAlgorithm::NameAndPublisher::NameAndPublisher(const WordSequence& name, const WordSequence& publisher) : Name(name), Publisher(publisher) + { + NamePublisher.insert(NamePublisher.end(), publisher.begin(), publisher.end()); + NamePublisher.insert(NamePublisher.end(), name.begin(), name.end()); + } + + WordsEditDistanceMatchConfidenceAlgorithm::NameAndPublisher::NameAndPublisher(WordSequence&& name, WordSequence&& publisher) : Name(std::move(name)), Publisher(std::move(publisher)) + { + NamePublisher.insert(NamePublisher.end(), publisher.begin(), publisher.end()); + NamePublisher.insert(NamePublisher.end(), name.begin(), name.end()); + + } + + void WordsEditDistanceMatchConfidenceAlgorithm::Init(const AppInstaller::Manifest::Manifest& manifest) + { + // We will use the name and publisher from each localization. + m_namesAndPublishers.clear(); + + WordSequence defaultPublisher; + if (manifest.DefaultLocalization.Contains(Manifest::Localization::Publisher)) + { + defaultPublisher = NormalizeAndPreparePublisher(manifest.DefaultLocalization.Get()); + } + + if (manifest.DefaultLocalization.Contains(Manifest::Localization::PackageName)) + { + WordSequence defaultName = NormalizeAndPrepareName(manifest.DefaultLocalization.Get()); + m_namesAndPublishers.emplace_back(defaultName, defaultPublisher); + + for (const auto& loc : manifest.Localizations) + { + if (loc.Contains(Manifest::Localization::PackageName) || loc.Contains(Manifest::Localization::Publisher)) + { + auto name = loc.Contains(Manifest::Localization::PackageName) ? NormalizeAndPrepareName(loc.Get()) : defaultName; + auto publisher = loc.Contains(Manifest::Localization::Publisher) ? NormalizeAndPreparePublisher(loc.Get()) : defaultPublisher; + + m_namesAndPublishers.emplace_back(std::move(name), std::move(publisher)); + } + } + } + } + + double WordsEditDistanceMatchConfidenceAlgorithm::ComputeConfidence(const ARPEntry& arpEntry) const + { + // Name and Publisher are available as multi properties, but for ARP entries there will only be 0 or 1 values. + NameAndPublisher arpNameAndPublisher( + NormalizeAndPrepareName(arpEntry.Entry->GetInstalledVersion()->GetProperty(PackageVersionProperty::Name).get()), + NormalizeAndPreparePublisher(arpEntry.Entry->GetInstalledVersion()->GetProperty(PackageVersionProperty::Publisher).get())); + + // Get the best score across all localizations + double bestMatchingScore = 0; + for (const auto& manifestNameAndPublisher : m_namesAndPublishers) + { + // Sometimes the publisher may be included in the name, for example Microsoft PowerToys as opposed to simply PowerToys. + // This may happen both in the ARP entry and the manifest. We try adding it in case it is in one but not in both. + auto nameScore = EditDistanceScore(manifestNameAndPublisher.Name, arpNameAndPublisher.Name); + + // Ignore cases where the name is not at all similar to avoid matching due to publisher only + if (nameScore < m_nameMatchingScoreMinThreshold) + { + continue; + } + + auto publisherScore = EditDistanceScore(manifestNameAndPublisher.Publisher, arpNameAndPublisher.Publisher); + auto namePublisherScore = std::max( + EditDistanceScore(manifestNameAndPublisher.NamePublisher, arpNameAndPublisher.Name), + EditDistanceScore(manifestNameAndPublisher.Name, arpNameAndPublisher.NamePublisher)); + + // Use the best between considering name and publisher as a single string or separately. + auto score = std::max( + nameScore * m_nameMatchingScoreWeight + publisherScore * (1 - m_nameMatchingScoreWeight), + namePublisherScore); + bestMatchingScore = std::max(bestMatchingScore, score); + } + + // Factor in whether this entry is new + auto result = bestMatchingScore * m_stringMatchingWeight + (arpEntry.IsNewOrUpdated ? 1 : 0) * (1 - m_stringMatchingWeight); + + return result; + } + + WordSequence WordsEditDistanceMatchConfidenceAlgorithm::PrepareString(std::string_view s) const + { + return Utility::SplitIntoWords(Utility::FoldCase(s)); + } + + WordSequence WordsEditDistanceMatchConfidenceAlgorithm::NormalizeAndPrepareName(std::string_view name) const + { + return PrepareString(m_normalizer.NormalizeName(name).Name()); + } + + WordSequence WordsEditDistanceMatchConfidenceAlgorithm::NormalizeAndPreparePublisher(std::string_view publisher) const + { + return PrepareString(m_normalizer.NormalizePublisher(publisher)); + } +} diff --git a/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj b/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj index 0f0df2c4b5..2eb1ba2a1c 100644 --- a/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj +++ b/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj @@ -273,6 +273,7 @@ + @@ -301,6 +302,7 @@ + NotUsing diff --git a/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj.filters b/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj.filters index 78d9b98747..509d4dbb87 100644 --- a/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj.filters +++ b/src/AppInstallerRepositoryCore/AppInstallerRepositoryCore.vcxproj.filters @@ -267,6 +267,9 @@ Public\winget + + Header Files + @@ -419,6 +422,9 @@ Source Files + + Source Files + diff --git a/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelation.h b/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelation.h index 9696495a04..8c828691e6 100644 --- a/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelation.h +++ b/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelation.h @@ -2,8 +2,6 @@ // Licensed under the MIT License. #pragma once -#include - namespace AppInstaller { namespace Manifest @@ -65,28 +63,6 @@ namespace AppInstaller::Repository::Correlation #endif }; - struct EmptyMatchConfidenceAlgorithm : public IARPMatchConfidenceAlgorithm - { - void Init(const AppInstaller::Manifest::Manifest&) override {} - double ComputeConfidence(const ARPEntry&) const override { return 0; } - }; - - // Measures the correlation with the edit distance between the normalized name and publisher strings. - struct EditDistanceMatchConfidenceAlgorithm : public IARPMatchConfidenceAlgorithm - { - void Init(const AppInstaller::Manifest::Manifest& manifest) override; - double ComputeConfidence(const ARPEntry& entry) const override; - - private: - std::u32string PrepareString(std::string_view s) const; - std::u32string NormalizeAndPrepareName(std::string_view name) const; - std::u32string NormalizeAndPreparePublisher(std::string_view publisher) const; - - AppInstaller::Utility::NameNormalizer m_normalizer{ AppInstaller::Utility::NormalizationVersion::Initial }; - // Each entry is a tuple { name, publisher, name + publisher } - std::vector> m_namesAndPublishers; - }; - // Finds the ARP entry in the ARP source that matches a newly installed package. // Takes the package manifest, a snapshot of the ARP before the installation, and the current ARP source. // Returns the entry in the ARP source, or nullptr if there was no match, plus some stats about the correlation. diff --git a/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelationAlgorithms.h b/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelationAlgorithms.h new file mode 100644 index 0000000000..970e8bb07a --- /dev/null +++ b/src/AppInstallerRepositoryCore/Public/winget/ARPCorrelationAlgorithms.h @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +#pragma once + +#include +#include +#include +#include +#include + +namespace AppInstaller::Repository::Correlation +{ + struct EmptyMatchConfidenceAlgorithm : public IARPMatchConfidenceAlgorithm + { + void Init(const AppInstaller::Manifest::Manifest&) override {} + double ComputeConfidence(const ARPEntry&) const override { return 0; } + }; + + // Algorithm that computes the match confidence by looking at the edit distance between + // the sequences of words in the name and publisher. + // The edit distance for this allows only adding or removing words, not editing them, + // as that would make any two names too similar. + struct WordsEditDistanceMatchConfidenceAlgorithm : public IARPMatchConfidenceAlgorithm + { + using WordSequence = std::vector; + + struct NameAndPublisher + { + NameAndPublisher(const WordSequence& name, const WordSequence& publisher); + NameAndPublisher(WordSequence&& name, WordSequence&& publisher); + + WordSequence Name; + WordSequence Publisher; + WordSequence NamePublisher; + }; + + void Init(const AppInstaller::Manifest::Manifest& manifest) override; + double ComputeConfidence(const ARPEntry& arpEntry) const override; + + private: + WordSequence PrepareString(std::string_view s) const; + WordSequence NormalizeAndPrepareName(std::string_view name) const; + WordSequence NormalizeAndPreparePublisher(std::string_view publisher) const; + + AppInstaller::Utility::NameNormalizer m_normalizer{ AppInstaller::Utility::NormalizationVersion::InitialPreserveWhiteSpace }; + std::vector m_namesAndPublishers; + + // Parameters for the algorithm + + // How much weight to give to the string matching score. + // The rest is assigned by whether the entry is new. + const double m_stringMatchingWeight = 0.8; + + // How much weight to give to the matching score of the package name; + // the rest is assigned to the publisher score. + const double m_nameMatchingScoreWeight = 2. / 3.; + + // Minimum score needed in the name for us to accept a match. + // This prevents us from being misled by a long match in the publisher. + const double m_nameMatchingScoreMinThreshold = 0.2; + }; +} From 86fed430a8110c890090ac0644ca3640bac6151d Mon Sep 17 00:00:00 2001 From: Flor Elisa Chacon Ochoa Date: Mon, 23 May 2022 20:04:59 -0700 Subject: [PATCH 2/4] Fix test --- src/AppInstallerCLITests/Correlation.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/AppInstallerCLITests/Correlation.cpp b/src/AppInstallerCLITests/Correlation.cpp index ec06ae0337..b7a3d79ed3 100644 --- a/src/AppInstallerCLITests/Correlation.cpp +++ b/src/AppInstallerCLITests/Correlation.cpp @@ -12,6 +12,7 @@ using namespace AppInstaller::Manifest; using namespace AppInstaller::Repository; using namespace AppInstaller::Repository::Correlation; +using namespace AppInstaller::Utility; using namespace TestCommon; @@ -120,7 +121,9 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf auto matchName = match->GetProperty(PackageVersionProperty::Name); auto matchPublisher = match->GetProperty(PackageVersionProperty::Publisher); - if (matchName == testCase.ARPName && matchPublisher == testCase.ARPPublisher) + // The strings get normalized when added to the manifest, so we have + // to normalize for the comparison. + if (matchName == NormalizedString(testCase.ARPName) && matchPublisher == NormalizedString(testCase.ARPPublisher)) { ++result.TrueMatches; } From 93dc2d8664546f850234e51b36d8082a01e07238 Mon Sep 17 00:00:00 2001 From: Flor Elisa Chacon Ochoa Date: Tue, 24 May 2022 13:00:03 -0700 Subject: [PATCH 3/4] Fix name normalization --- .../NameNormalization.cpp | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/AppInstallerCommonCore/NameNormalization.cpp b/src/AppInstallerCommonCore/NameNormalization.cpp index dbbd2334c9..364b42aaf4 100644 --- a/src/AppInstallerCommonCore/NameNormalization.cpp +++ b/src/AppInstallerCommonCore/NameNormalization.cpp @@ -215,12 +215,22 @@ namespace AppInstaller::Utility } // Joins all of the given strings into a single value - static std::wstring Join(const std::vector& values) + static std::wstring Join(const std::vector& values, const std::wstring& separator = {}) { std::wstring result; + bool isFirst = true; for (const auto& v : values) { + if (isFirst) + { + isFirst = false; + } + else + { + result += separator; + } + result += v; } @@ -246,6 +256,7 @@ namespace AppInstaller::Utility Regex::Expression KBNumbers{ R"(\((KB\d+)\))", reOptions }; Regex::Expression NonLettersAndDigits{ R"([^\p{L}\p{Nd}])", reOptions }; + Regex::Expression NonLetterDigitOrSpace{ R"([^\p{L}\p{Nd}\s])", reOptions }; Regex::Expression URIProtocol{ R"((? Date: Tue, 24 May 2022 14:30:24 -0700 Subject: [PATCH 4/4] Update tests --- src/AppInstallerCLITests/Correlation.cpp | 4 ++-- src/AppInstallerCLITests/NameNormalization.cpp | 8 ++++++++ src/AppInstallerCLITests/Strings.cpp | 10 ++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/AppInstallerCLITests/Correlation.cpp b/src/AppInstallerCLITests/Correlation.cpp index b7a3d79ed3..d3a99f6a6d 100644 --- a/src/AppInstallerCLITests/Correlation.cpp +++ b/src/AppInstallerCLITests/Correlation.cpp @@ -234,7 +234,7 @@ DataSet GetDataSet_NoNoise() dataSet.TestCases = LoadTestData(); // Arbitrary values. We should refine them as the algorithm gets better. - dataSet.RequiredTrueMatchRatio = 0.75; + dataSet.RequiredTrueMatchRatio = 0.81; dataSet.RequiredFalseMatchRatio = 0; dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set dataSet.RequiredFalseMismatchRatio = 0.25; @@ -251,7 +251,7 @@ DataSet GetDataSet_WithNoise() dataSet.TestCases = std::move(baseTestCases); // Arbitrary values. We should refine them as the algorithm gets better. - dataSet.RequiredTrueMatchRatio = 0.75; + dataSet.RequiredTrueMatchRatio = 0.81; dataSet.RequiredFalseMatchRatio = 0; // This should always stay at 0 dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set dataSet.RequiredFalseMismatchRatio = 0.25; diff --git a/src/AppInstallerCLITests/NameNormalization.cpp b/src/AppInstallerCLITests/NameNormalization.cpp index f45bf2f015..191e6f1a17 100644 --- a/src/AppInstallerCLITests/NameNormalization.cpp +++ b/src/AppInstallerCLITests/NameNormalization.cpp @@ -131,3 +131,11 @@ TEST_CASE("NameNorm_KBNumbers", "[name_norm]") REQUIRE(normer.Normalize("Fix for (KB42)", {}).Name() == "FixforKB42"); } + +TEST_CASE("NameNorm_Initial_PreserveWhitespace", "[name_norm]") +{ + NameNormalizer normer(NormalizationVersion::InitialPreserveWhiteSpace); + + REQUIRE(normer.NormalizeName("Some Name").Name() == "Some Name"); + REQUIRE(normer.NormalizePublisher("Some Publisher Corp") == "Some Publisher"); +} diff --git a/src/AppInstallerCLITests/Strings.cpp b/src/AppInstallerCLITests/Strings.cpp index ed4ef43e4a..547e5fd66c 100644 --- a/src/AppInstallerCLITests/Strings.cpp +++ b/src/AppInstallerCLITests/Strings.cpp @@ -193,4 +193,14 @@ TEST_CASE("GetFileNameFromURI", "[strings]") REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/pull/1722").u8string() == "1722"); REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/README.md").u8string() == "README.md"); REQUIRE(GetFileNameFromURI("https://microsoft.com/").u8string() == ""); +} + +TEST_CASE("SplitIntoWords", "[strings]") +{ + REQUIRE(SplitIntoWords("A B") == std::vector{ "A", "B" }); + REQUIRE(SplitIntoWords("Some-Thing") == std::vector{ "Some", "Thing" }); + + // 私のテスト = "My test" according to an online translator + // Split as "私" "の" "テスト" + REQUIRE(SplitIntoWords("\xe7\xa7\x81\xe3\x81\xae\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88") == std::vector{ "\xe7\xa7\x81", "\xe3\x81\xae", "\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" }); } \ No newline at end of file