From 9d893c26dc7ddb422630fb7cb103ae4f0e73fdad Mon Sep 17 00:00:00 2001 From: Gabe Stocco <98900+gfs@users.noreply.github.com> Date: Wed, 8 Nov 2023 11:28:09 -0800 Subject: [PATCH 1/3] Fixes an issue with setting correct index values when matching a property of an xml tag with an xpath --- AppInspector.RulesEngine/TextContainer.cs | 53 ++++++--- .../RuleProcessor/XmlAndJsonTests.cs | 101 +++++++++++++++++- 2 files changed, 135 insertions(+), 19 deletions(-) diff --git a/AppInspector.RulesEngine/TextContainer.cs b/AppInspector.RulesEngine/TextContainer.cs index 1b631e68..8918d6ae 100644 --- a/AppInspector.RulesEngine/TextContainer.cs +++ b/AppInspector.RulesEngine/TextContainer.cs @@ -221,26 +221,47 @@ public TextContainer(string content, string language, Languages languages, ILogg continue; } - // First we find the name + // First we find the name, absolute position index var nameIndex = FullContent[minIndex..].IndexOf(nodeIter.Current.Name, StringComparison.Ordinal) + minIndex; // Then we grab the index of the end of this tag. // We can't use OuterXML because the parser will inject the namespace if present into the OuterXML so it doesn't match the original text. - var endTagIndex = FullContent[nameIndex..].IndexOf('>'); - // We also look for self-closing tag - var selfClosedTag = FullContent[endTagIndex-1] == '/'; - // If the tag is self closing innerxml will be empty string, so the finding is located at the end of the tag and is empty string - // Otherwise the finding is the content of the xml tag - var offset = selfClosedTag ? endTagIndex : FullContent[nameIndex..].IndexOf(nodeIter.Current.InnerXml, StringComparison.Ordinal) + nameIndex; - // Move the minimum index up in case there are multiple instances of identical OuterXML - // This ensures we won't re-find the same one - var totalOffset = minIndex + nameIndex + endTagIndex; - minIndex = totalOffset; - var location = new Boundary + // Position relative to nameIndex + var endTagIndex = FullContent[nameIndex..].IndexOf('>') + nameIndex; + // If we are matching a tag itself, the previous char should be the open tag + // If its a property it won't be + var isProp = FullContent[(nameIndex - 1)] != '<'; + // Check for self-closing tag + var selfClosedTag = FullContent[endTagIndex - 1] == '/'; + + // This is for when we're capturing the value of a property of the tag rather than the tag itself + if (isProp) { - Index = offset, - Length = nodeIter.Current.InnerXml.Length - }; - yield return (nodeIter.Current.Value, location); + // Move the offset to the end of the opening tag + var nextClosingIndexAfterName = FullContent[nameIndex..].IndexOf('>', StringComparison.Ordinal)+ nameIndex+1; + var offset = selfClosedTag ? endTagIndex : FullContent[nextClosingIndexAfterName..].IndexOf('>') + nextClosingIndexAfterName + 1; + // Move the minimum index up to the end of the closing tag + minIndex = selfClosedTag ? offset : FullContent[offset..].IndexOf('>') + offset + 1; + var location = new Boundary + { + // +2 for the \" before the value for the property + Index = nameIndex + nodeIter.Current.Name.Length + 2, + Length = nodeIter.Current.InnerXml.Length + }; + yield return (nodeIter.Current.Value, location); + } + else + { + // Move the offset to the end of the opening tag + var offset = selfClosedTag ? endTagIndex : FullContent[nameIndex..].IndexOf(nodeIter.Current.InnerXml, StringComparison.Ordinal) + nameIndex; + // Move the minimum index up to the end of the closing tag + minIndex = selfClosedTag ? offset : FullContent[offset..].IndexOf('>') + offset + 1; + var location = new Boundary + { + Index = offset, + Length = nodeIter.Current.InnerXml.Length + }; + yield return (nodeIter.Current.Value, location); + } } } diff --git a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs index d942cf04..045e1431 100644 --- a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs +++ b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs @@ -83,6 +83,82 @@ public class XmlAndJsonTests } ]"; + private const string xmlStringRuleForPropWithData = @"[ + { + ""id"": ""SA000005"", + ""name"": ""Testing.Rules.XML"", + ""tags"": [ + ""Testing.Rules.XML"" + ], + ""severity"": ""Critical"", + ""description"": ""This rule checks the value of the property property to be true"", + ""patterns"": [ + { + ""pattern"": ""true"", + ""type"": ""string"", + ""confidence"": ""High"", + ""scopes"": [ + ""code"" + ], + ""xpaths"" : [""/bookstore/book/title/@*[name()='property']""] + } + ], + ""_comment"": """" + } +]"; + + private const string xmlStringRuleForPropWithDataForData = @"[ + { + ""id"": ""SA000005"", + ""name"": ""Testing.Rules.XML"", + ""tags"": [ + ""Testing.Rules.XML"" + ], + ""severity"": ""Critical"", + ""description"": ""This rule checks the value of the title tag when it has a property"", + ""patterns"": [ + { + ""pattern"": ""Franklin"", + ""type"": ""regex"", + ""confidence"": ""High"", + ""scopes"": [ + ""code"" + ], + ""xpaths"" : [""/bookstore/book/title""] + } + ], + ""_comment"": """" + } +]"; + + private const string xmlDataPropsWithTagValue = + @" + + + The Autobiography of Benjamin Franklin + + Benjamin + Franklin + + 8.99 + + + The Confidence Man + + Herman + Melville + + 11.99 + + + The Gorgias + + Plato + + 9.99 + + "; + private const string jsonData = @"{ ""books"": @@ -228,14 +304,14 @@ public void XmlAttributeTest() { ""xpaths"": [""system.web/trace/@enabled""], ""pattern"": ""true"", - ""type"": ""regex"" + ""type"": ""string"" } ], ""must-match"": [ - ""\n\n"" + ""\n\n"" ], ""must-not-match"": [ - ""\n\n"" + ""\n\n"" ] }]"; RuleSet rules = new(); @@ -268,6 +344,25 @@ public void JsonStringRule(string rule) Assert.Fail(); } } + [DataRow(xmlStringRuleForPropWithDataForData)] + [DataRow(xmlStringRuleForPropWithData)] + [DataTestMethod] + public void XmlTagWithPropsAndValue(string rule) + { + RuleSet rules = new(); + rules.AddString(rule, "XmlTestRules"); + Microsoft.ApplicationInspector.RulesEngine.RuleProcessor processor = new(rules, + new RuleProcessorOptions { AllowAllTagsInBuildFiles = true }); + if (_languages.FromFileNameOut("test.xml", out var info)) + { + var matches = processor.AnalyzeFile(xmlDataPropsWithTagValue, new FileEntry("test.xml", new MemoryStream()), info); + Assert.AreEqual(1, matches.Count); + } + else + { + Assert.Fail(); + } + } [DataRow(xmlStringRule)] [DataRow(jsonAndXmlStringRule)] From 972296d16be61260d3c3f59dbbe9c45a94fd1447 Mon Sep 17 00:00:00 2001 From: Gabe Stocco <98900+gfs@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:43:21 -0800 Subject: [PATCH 2/3] Update comments --- AppInspector.RulesEngine/TextContainer.cs | 31 +++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/AppInspector.RulesEngine/TextContainer.cs b/AppInspector.RulesEngine/TextContainer.cs index 8918d6ae..a93a3612 100644 --- a/AppInspector.RulesEngine/TextContainer.cs +++ b/AppInspector.RulesEngine/TextContainer.cs @@ -172,11 +172,13 @@ public TextContainer(string content, string language, Languages languages, ILogg } /// - /// If this file is a JSON, XML or YML file, returns the string contents of the specified path. + /// If this file is XML, attempts to return the the string contents of the specified XPath applied to the file. /// If the path does not exist, or the file is not JSON, XML or YML returns null. + /// Method contains some heuristic behavior and may not cover all cases. + /// Please report any issues with a sample XML and XPATH to reproduce. /// - /// - /// + /// XPath to query document with + /// Enumeration of string and Boundary tuples for the XPath matches. Boundary locations refer to the locations in the original document on disk. internal IEnumerable<(string, Boundary)> GetStringFromXPath(string Path, Dictionary xpathNameSpaces) { lock (_xpathLock) @@ -221,14 +223,22 @@ public TextContainer(string content, string language, Languages languages, ILogg continue; } + // We have to heuristically calculate the original indexes of the locations in the original document because the internal representation differs + // For example it will convert to + // First we find the name, absolute position index var nameIndex = FullContent[minIndex..].IndexOf(nodeIter.Current.Name, StringComparison.Ordinal) + minIndex; - // Then we grab the index of the end of this tag. - // We can't use OuterXML because the parser will inject the namespace if present into the OuterXML so it doesn't match the original text. - // Position relative to nameIndex - var endTagIndex = FullContent[nameIndex..].IndexOf('>') + nameIndex; + // Then we calculate the absolute index of the end of the tag. + // We can't use OuterXML property because the parser will inject the namespace if present into the OuterXML so it doesn't match the original text. + var endTagIndex = FullContent[nameIndex..].IndexOf('>', StringComparison.Ordinal) + nameIndex; // If we are matching a tag itself, the previous char should be the open tag + // | + // v + // // If its a property it won't be + // | + // v + // var isProp = FullContent[(nameIndex - 1)] != '<'; // Check for self-closing tag var selfClosedTag = FullContent[endTagIndex - 1] == '/'; @@ -236,10 +246,11 @@ public TextContainer(string content, string language, Languages languages, ILogg // This is for when we're capturing the value of a property of the tag rather than the tag itself if (isProp) { - // Move the offset to the end of the opening tag - var nextClosingIndexAfterName = FullContent[nameIndex..].IndexOf('>', StringComparison.Ordinal)+ nameIndex+1; + // Find the index of character after the next end tag index after the name + var nextClosingIndexAfterName = endTagIndex+1; + // If we have a self closing tag, we can use that index, otherwise we need the closure of this tag var offset = selfClosedTag ? endTagIndex : FullContent[nextClosingIndexAfterName..].IndexOf('>') + nextClosingIndexAfterName + 1; - // Move the minimum index up to the end of the closing tag + // Move the minimum index up to the end of the closing tag to avoid additioanl matches of the same values minIndex = selfClosedTag ? offset : FullContent[offset..].IndexOf('>') + offset + 1; var location = new Boundary { From 40ce3bcd40ce22b031f71c25529a6c416b9f757b Mon Sep 17 00:00:00 2001 From: Gabe Stocco <98900+gfs@users.noreply.github.com> Date: Thu, 9 Nov 2023 09:48:03 -0800 Subject: [PATCH 3/3] Improve robustness of new test cases. Also check value and index location of matches. --- AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs index 045e1431..29725d98 100644 --- a/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs +++ b/AppInspector.Tests/RuleProcessor/XmlAndJsonTests.cs @@ -344,10 +344,10 @@ public void JsonStringRule(string rule) Assert.Fail(); } } - [DataRow(xmlStringRuleForPropWithDataForData)] - [DataRow(xmlStringRuleForPropWithData)] + [DataRow(xmlStringRuleForPropWithDataForData, "Franklin", 212)] + [DataRow(xmlStringRuleForPropWithData, "true", 176)] [DataTestMethod] - public void XmlTagWithPropsAndValue(string rule) + public void XmlTagWithPropsAndValue(string rule, string expectedValue, int expectedIndex) { RuleSet rules = new(); rules.AddString(rule, "XmlTestRules"); @@ -357,6 +357,9 @@ public void XmlTagWithPropsAndValue(string rule) { var matches = processor.AnalyzeFile(xmlDataPropsWithTagValue, new FileEntry("test.xml", new MemoryStream()), info); Assert.AreEqual(1, matches.Count); + var match = matches[0]; + Assert.AreEqual(expectedValue, match.Sample); + Assert.AreEqual(expectedIndex, match.Boundary.Index); } else {