microsoft · florelis · May 25, 2022 · May 24, 2022 · May 24, 2022 · May 24, 2022
@@ -7,6 +7,7 @@
 #include <Workflows/WorkflowBase.h>
 #include <Workflows/InstallFlow.h>
 #include <winget/Manifest.h>
+#include <winget/ARPCorrelationAlgorithms.h>
 #include <Microsoft/PredefinedInstalledSourceFactory.h>
 
 using namespace TestCommon;

@@ -5,12 +5,14 @@
 #include "TestSource.h"
 
 #include <winget/ARPCorrelation.h>
+#include <winget/ARPCorrelationAlgorithms.h>
 #include <winget/Manifest.h>
 #include <winget/RepositorySearch.h>
 
 using namespace AppInstaller::Manifest;
 using namespace AppInstaller::Repository;
 using namespace AppInstaller::Repository::Correlation;
+using namespace AppInstaller::Utility;
 
 using namespace TestCommon;
 
@@ -76,13 +78,18 @@ Manifest GetManifestFromTestCase(const TestCase& testCase)
     return manifest;
 }
 
-ARPEntry GetARPEntryFromTestCase(const TestCase& testCase)
+ARPEntry GetARPEntryFromTestCase(const TestCase& testCase, bool isNew)
 {
     Manifest arpManifest;
     arpManifest.DefaultLocalization.Add<Localization::PackageName>(testCase.ARPName);
     arpManifest.DefaultLocalization.Add<Localization::Publisher>(testCase.ARPPublisher);
     arpManifest.Localizations.push_back(arpManifest.DefaultLocalization);
-    return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), false };
+    return ARPEntry{ TestPackage::Make(arpManifest, TestPackage::MetadataMap{}), isNew };
+}
+
+ARPEntry GetExistingARPEntryFromTestCase(const TestCase& testCase)
+{
+    return GetARPEntryFromTestCase(testCase, /* isNew */ false);
 }
 
 void ReportMatch(std::string_view label, std::string_view appName, std::string_view appPublisher, std::string_view arpName, std::string_view arpPublisher)
@@ -105,7 +112,7 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf
 
     for (const auto& testCase : dataSet.TestCases)
     {
-        arpEntries.push_back(GetARPEntryFromTestCase(testCase));
+        arpEntries.push_back(GetARPEntryFromTestCase(testCase, /* isNew */ true));
         auto match = FindARPEntryForNewlyInstalledPackageWithHeuristics(GetManifestFromTestCase(testCase), arpEntries, correlationAlgorithm);
         arpEntries.pop_back();
 
@@ -114,7 +121,9 @@ ResultSummary EvaluateDataSetWithHeuristic(const DataSet& dataSet, IARPMatchConf
             auto matchName = match->GetProperty(PackageVersionProperty::Name);
             auto matchPublisher = match->GetProperty(PackageVersionProperty::Publisher);
 
-            if (matchName == testCase.ARPName && matchPublisher == testCase.ARPPublisher)
+            // The strings get normalized when added to the manifest, so we have
+            // to normalize for the comparison.
+            if (matchName == NormalizedString(testCase.ARPName) && matchPublisher == NormalizedString(testCase.ARPPublisher))
             {
                 ++result.TrueMatches;
             }
@@ -225,10 +234,10 @@ DataSet GetDataSet_NoNoise()
     dataSet.TestCases = LoadTestData();
 
     // Arbitrary values. We should refine them as the algorithm gets better.
-    dataSet.RequiredTrueMatchRatio = 0.7;
-    dataSet.RequiredFalseMatchRatio = 0.05;
+    dataSet.RequiredTrueMatchRatio = 0.81;
+    dataSet.RequiredFalseMatchRatio = 0;
     dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set
-    dataSet.RequiredFalseMismatchRatio = 0.3;
+    dataSet.RequiredFalseMismatchRatio = 0.25;
 
     return dataSet;
 }
@@ -238,14 +247,14 @@ DataSet GetDataSet_WithNoise()
     DataSet dataSet;
     auto baseTestCases = LoadTestData();
 
-    std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetARPEntryFromTestCase);
+    std::transform(baseTestCases.begin(), baseTestCases.end(), std::back_inserter(dataSet.ARPNoise), GetExistingARPEntryFromTestCase);
     dataSet.TestCases = std::move(baseTestCases);
 
     // Arbitrary values. We should refine them as the algorithm gets better.
-    dataSet.RequiredTrueMatchRatio = 0.7;
-    dataSet.RequiredFalseMatchRatio = 0.05;
+    dataSet.RequiredTrueMatchRatio = 0.81;
+    dataSet.RequiredFalseMatchRatio = 0; // This should always stay at 0
     dataSet.RequiredTrueMismatchRatio = 0; // There are no expected mismatches in this data set
-    dataSet.RequiredFalseMismatchRatio = 0.3;
+    dataSet.RequiredFalseMismatchRatio = 0.25;
 
     return dataSet;
 }
@@ -256,7 +265,7 @@ DataSet GetDataSet_WithNoise()
 // performs well.
 TEMPLATE_TEST_CASE("Correlation_MeasureAlgorithmPerformance", "[correlation][.]",
     EmptyMatchConfidenceAlgorithm,
-    EditDistanceMatchConfidenceAlgorithm)
+    WordsEditDistanceMatchConfidenceAlgorithm)
 {
     // Each section loads a different data set,
     // and then they are all handled the same

@@ -131,3 +131,11 @@ TEST_CASE("NameNorm_KBNumbers", "[name_norm]")
 
     REQUIRE(normer.Normalize("Fix for (KB42)", {}).Name() == "FixforKB42");
 }
+
+TEST_CASE("NameNorm_Initial_PreserveWhitespace", "[name_norm]")
+{
+    NameNormalizer normer(NormalizationVersion::InitialPreserveWhiteSpace);
+
+    REQUIRE(normer.NormalizeName("Some Name").Name() == "Some Name");
+    REQUIRE(normer.NormalizePublisher("Some Publisher Corp") == "Some Publisher");
+}
@@ -193,4 +193,14 @@ TEST_CASE("GetFileNameFromURI", "[strings]")
     REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/pull/1722").u8string() == "1722");
     REQUIRE(GetFileNameFromURI("https://github.com/microsoft/winget-cli/README.md").u8string() == "README.md");
     REQUIRE(GetFileNameFromURI("https://microsoft.com/").u8string() == "");
+}
+
+TEST_CASE("SplitIntoWords", "[strings]")
+{
+    REQUIRE(SplitIntoWords("A B") == std::vector<std::string>{ "A", "B" });
+    REQUIRE(SplitIntoWords("Some-Thing") == std::vector<std::string>{ "Some", "Thing" });
+
+    // 私のテスト = "My test" according to an online translator
+    // Split as "私" "の" "テスト"
+    REQUIRE(SplitIntoWords("\xe7\xa7\x81\xe3\x81\xae\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88") == std::vector<std::string>{ "\xe7\xa7\x81", "\xe3\x81\xae", "\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" });
 }
@@ -108,4 +108,5 @@ XPFP42J061BPC1|Documenti Lavori Cantiere|Esposito Software di M. G. Caputo|Docum
 XPFPFN4LT21PZJ|Studio Dentistico Pro|Esposito Software di M. G. Caputo|Studio Dentistico Pro Demo||Copyright Esposito Software|Studio Dentistico Pro Demo_is1
 XPFPFWMVTR0WHP|Ashampoo UnInstaller 11|Ashampoo|Ashampoo UnInstaller 11|11.00.12|Ashampoo GmbH & Co. KG|{4209F371-B84B-F321-6BD3-1D91E2505732}_is1
 XPFPFWV5JD80K2|BeeCut|网旭科技|BeeCut V1.7.7.22|1.7.7.22|Apowersoft LIMITED|{CA76BFA8-1862-49D7-B2C7-AE3D6CF40E53}_is1
-XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349}
+XPFPLCB36G8V8J|HttpMaster Professional|Borvid, Informacijske storitve, Janez Čas s.p.|HttpMaster Professional Edition 5.4.1|5.4.1|Borvid|{B61241AA-F5FC-42C9-A1F9-F6D72D654349}
+XP8CDJNZKFM06W|Visual Studio Community 2019|Microsoft Corporation|Microsoft Visual Studio Installer|3.1.2196.8931|Microsoft Corporation|{6F320B93-EE3C-4826-85E0-ADF79F8D4C61}
@@ -87,6 +87,12 @@ namespace AppInstaller::Utility
                 return utext_char32At(m_text.get(), m_currentBrk);
             }
 
+            // Returns the status from the break rule that determined the most recently break position.
+            int32_t CurrentRuleStatus()
+            {
+                return ubrk_getRuleStatus(m_brk.get());
+            }
+
         private:
             wil::unique_any<UText*, decltype(utext_close), &utext_close> m_text;
             wil::unique_any<UBreakIterator*, decltype(ubrk_close), &ubrk_close> m_brk;
@@ -628,4 +634,27 @@ namespace AppInstaller::Utility
 
         return path.filename();
     }
+
+    std::vector<std::string> SplitIntoWords(std::string_view input)
+    {
+        ICUBreakIterator itr{ input, UBRK_WORD };
+        std::size_t currentOffset = 0;
+
+        std::vector<std::string> result;
+        while (itr.Next() != UBRK_DONE)
+        {
+            std::size_t nextOffset = itr.CurrentOffset();
+
+            // Ignore spaces and punctuation, accept words and numbers
+            if (itr.CurrentRuleStatus() != UBRK_WORD_NONE)
+            {
+                auto wordSize = nextOffset - currentOffset;
+                result.emplace_back(input, currentOffset, wordSize);
+            }
+
+            currentOffset = nextOffset;
+        }
+
+        return result;
+    }
 }
@@ -215,12 +215,22 @@ namespace AppInstaller::Utility
             }
 
             // Joins all of the given strings into a single value
-            static std::wstring Join(const std::vector<std::wstring>& values)
+            static std::wstring Join(const std::vector<std::wstring>& values, const std::wstring& separator = {})
             {
                 std::wstring result;
 
+                bool isFirst = true;
                 for (const auto& v : values)
                 {
+                    if (isFirst)
+                    {
+                        isFirst = false;
+                    }
+                    else
+                    {
+                        result += separator;
+                    }
+
                     result += v;
                 }
 
@@ -246,6 +256,7 @@ namespace AppInstaller::Utility
             Regex::Expression KBNumbers{ R"(\((KB\d+)\))", reOptions };
 
             Regex::Expression NonLettersAndDigits{ R"([^\p{L}\p{Nd}])", reOptions };
+            Regex::Expression NonLetterDigitOrSpace{ R"([^\p{L}\p{Nd}\s])", reOptions };
             Regex::Expression URIProtocol{ R"((?<!\p{L})(?:http[s]?|ftp):\/\/)", reOptions }; // remove protocol from URIs
 
             Regex::Expression VersionDelimited{ R"(((?<!\p{L})(?:V|VER|VERSI(?:O|Ó)N|VERSÃO|VERSIE|WERSJA|BUILD|RELEASE|RC|SP)\P{L}?)?\p{Nd}+([\p{Po}\p{Pd}\p{Pc}]\p{Nd}?(RC|B|A|R|SP|K)?\p{Nd}+)+([\p{Po}\p{Pd}\p{Pc}]?[\p{L}\p{Nd}]+)*)", reOptions };
@@ -347,6 +358,8 @@ namespace AppInstaller::Utility
             // The folded and sorted version of LocaleViews.
             const std::vector<std::wstring> LegalEntitySuffixes;
 
+            const bool PreserveWhiteSpace;
+
             static std::vector<std::wstring> FoldAndSort(const std::vector<std::wstring_view>& input)
             {
                 std::vector<std::wstring> result;
@@ -377,10 +390,18 @@ namespace AppInstaller::Utility
                 while (RemoveAll(ProgramNameRegexes, result.Name));
 
                 auto tokens = Split(ProgramNameSplit, result.Name, LegalEntitySuffixes);
-                result.Name = Join(tokens);
 
-                // Drop all undesired characters
-                Remove(NonLettersAndDigits, result.Name);
+                // Re-join the tokens and drop all undesired characters
+                if (PreserveWhiteSpace)
+                {
+                    result.Name = Join(tokens, L" ");
+                    Remove(NonLetterDigitOrSpace, result.Name);
+                }
+                else
+                {
+                    result.Name = Join(tokens);
+                    Remove(NonLettersAndDigits, result.Name);
+                }
 
                 return result;
             }
@@ -395,16 +416,24 @@ namespace AppInstaller::Utility
                 while (RemoveAll(PublisherNameRegexes, result.Publisher));
 
                 auto tokens = Split(PublisherNameSplit, result.Publisher, LegalEntitySuffixes, true);
-                result.Publisher = Join(tokens);
 
-                // Drop all undesired characters
-                Remove(NonLettersAndDigits, result.Publisher);
+                // Re-join the tokens and drop all undesired characters
+                if (PreserveWhiteSpace)
+                {
+                    result.Publisher = Join(tokens, L" ");
+                    Remove(NonLetterDigitOrSpace, result.Publisher);
+                }
+                else
+                {
+                    result.Publisher = Join(tokens);
+                    Remove(NonLettersAndDigits, result.Publisher);
+                }
 
                 return result;
             }
 
         public:
-            NormalizationInitial() : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews))
+            NormalizationInitial(bool preserveWhiteSpace) : Locales(FoldAndSort(LocaleViews)), LegalEntitySuffixes(FoldAndSort(LegalEntitySuffixViews)), PreserveWhiteSpace(preserveWhiteSpace)
             {
             }
 
@@ -448,7 +477,10 @@ namespace AppInstaller::Utility
         switch (version)
         {
         case AppInstaller::Utility::NormalizationVersion::Initial:
-            m_normalizer = std::make_unique<NormalizationInitial>();
+            m_normalizer = std::make_unique<NormalizationInitial>(false);
+            break;
+        case AppInstaller::Utility::NormalizationVersion::InitialPreserveWhiteSpace:
+            m_normalizer = std::make_unique<NormalizationInitial>(true);
             break;
         default:
             THROW_HR(E_INVALIDARG);

@@ -149,6 +149,9 @@ namespace AppInstaller::Utility
     // Gets the file name part of the given URI.
     std::filesystem::path GetFileNameFromURI(std::string_view uri);
 
+    // Splits the string into words.
+    std::vector<std::string> SplitIntoWords(std::string_view input);
+
     // Converts a container to a string representation of it.
     template <typename T, typename U>
     std::string ConvertContainerToString(const T& container, U toString)

@@ -13,6 +13,7 @@ namespace AppInstaller::Utility
     enum class NormalizationVersion
     {
         Initial,
+        InitialPreserveWhiteSpace,
     };
 
     struct NameNormalizer;