Skip to content

Commit

Permalink
fix: search indexer incorrectly adds space for inline HTML tags
Browse files Browse the repository at this point in the history
  • Loading branch information
yufeih committed Jan 30, 2023
1 parent 8790fd3 commit 48a4ac5
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,29 @@ namespace Microsoft.DocAsCode.Build.Engine
{
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Composition;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Composition;
using System.Collections.Immutable;

using HtmlAgilityPack;
using Microsoft.DocAsCode.Common;
using Microsoft.DocAsCode.MarkdownLite;
using Microsoft.DocAsCode.Plugins;

using HtmlAgilityPack;
using Newtonsoft.Json;

[Export(nameof(ExtractSearchIndex), typeof(IPostProcessor))]
public class ExtractSearchIndex : IPostProcessor
{
private static readonly Regex RegexWhiteSpace = new Regex(@"\s+", RegexOptions.Compiled);
private static readonly Regex s_regexWhiteSpace = new(@"\s+", RegexOptions.Compiled);
// Tag names that ExtractTextFromNode treats as inline: no space is inserted around their content.
// Lookups are case-insensitive (StringComparer.OrdinalIgnoreCase). Any tag name that is NOT listed
// here (for example div, p, li or br) is handled as block-level and padded with spaces.
private static readonly HashSet<string> s_htmlInlineTags = new(StringComparer.OrdinalIgnoreCase)
{
"a", "area", "del", "ins", "link", "map", "meta", "abbr", "audio", "b", "bdo", "button", "canvas", "cite", "code", "command", "data",
"datalist", "dfn", "em", "embed", "i", "iframe", "img", "input", "kbd", "keygen", "label", "mark", "math", "meter", "noscript", "object",
"output", "picture", "progress", "q", "ruby", "samp", "script", "select", "small", "span", "strong", "sub", "sup", "svg", "textarea", "time",
"var", "video", "wbr",
};

public string Name => nameof(ExtractSearchIndex);
public const string IndexFileName = "index.json";
Expand Down Expand Up @@ -132,28 +137,34 @@ private string NormalizeContent(string str)
{
return string.Empty;
}
str = StringHelper.HtmlDecode(str);
return RegexWhiteSpace.Replace(str, " ").Trim();
str = WebUtility.HtmlDecode(str);
return s_regexWhiteSpace.Replace(str, " ").Trim();
}

// Recursively appends the text found under an HTML node to contentBuilder.
// NOTE(review): this span is a rendered diff, so removed and added lines are interleaved without
// +/- markers; the lines that use `root` look like the removed version and the lines that use
// `node` look like the added version — confirm against the commit before editing.
private void ExtractTextFromNode(HtmlNode root, StringBuilder contentBuilder)
private void ExtractTextFromNode(HtmlNode node, StringBuilder contentBuilder)
{
if (root == null)
if (node == null)
{
return;
}

// The `root` variant appends InnerText followed by a space for every childless node.
if (!root.HasChildNodes)
// The `node` variant copies text and comment nodes verbatim and adds no separator of its own.
if (node.NodeType is HtmlNodeType.Text or HtmlNodeType.Comment)
{
contentBuilder.Append(root.InnerText);
contentBuilder.Append(" ");
contentBuilder.Append(node.InnerText);
return;
}
else

// Element and document nodes are descended into child by child.
if (node.NodeType is HtmlNodeType.Element or HtmlNodeType.Document)
{
foreach (var node in root.ChildNodes)
{
ExtractTextFromNode(node, contentBuilder);
}
// Any tag name missing from s_htmlInlineTags is treated as block-level.
var isBlock = !s_htmlInlineTags.Contains(node.Name);
// Block-level content gets a space on each side so its text stays separated from neighbouring text.
if (isBlock)
contentBuilder.Append(' ');

foreach (var childNode in node.ChildNodes)
ExtractTextFromNode(childNode, contentBuilder);

if (isBlock)
contentBuilder.Append(' ');
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ This is article title
html.LoadHtml(rawHtml);
var href = "http://dotnet.github.io/docfx";
var item = _extractor.ExtractItem(html, href);
Assert.True(item.Equals(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Hello World, Microsoft This is article title docfx can do anything..." }));
Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Hello World, Microsoft This is article title docfx can do anything..." }, item);
}

[Fact]
Expand All @@ -62,7 +62,7 @@ public void TestSearchableClass()
html.LoadHtml(rawHtml);
var href = "http://dotnet.github.io/docfx";
var item = _extractor.ExtractItem(html, href);
Assert.True(item.Equals(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Cooooooool!" }));
Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Cooooooool!" }, item);
}

[Fact]
Expand Down Expand Up @@ -110,7 +110,7 @@ Only index once.
html.LoadHtml(rawHtml);
var href = "http://dotnet.github.io/docfx";
var item = _extractor.ExtractItem(html, href);
Assert.True(item.Equals(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Only index once."}));
Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Only index once."}, item);
}

[Fact]
Expand Down Expand Up @@ -153,7 +153,27 @@ public void TestEmptyItem()
html.LoadHtml(rawHtml);
var href = "http://dotnet.github.io/docfx";
var item = _extractor.ExtractItem(html, href);
Assert.True(item.Equals(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = string.Empty }));
Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = string.Empty }, item);
}

// Verifies that text separated by block-level tags (br, div, p, li) is indexed as separate words,
// while text separated only by inline tags (a, em, b, i) is concatenated without spaces.
[Fact]
public void TestBlockTagsVsInlineTags()
{
    // Arrange: one div mixing block-level children, one div mixing inline children.
    const string pageUrl = "http://dotnet.github.io/docfx";
    var document = new HtmlDocument();
    document.LoadHtml(@"
<html>
<body>
<article>
<div>Insert<br>space<div>in</div>block<p>level</p>html<li>tags</li></div>
<div>Do<a>not</a>insert<em>space</em>in<b>inline</b>html<i>tags</i></div>
</article>
</body>
</html>
");
    var expected = new SearchIndexItem
    {
        Href = pageUrl,
        Title = "",
        Keywords = "Insert space in block level html tags Donotinsertspaceininlinehtmltags",
    };

    // Act
    var actual = _extractor.ExtractItem(document, pageUrl);

    // Assert
    Assert.Equal(expected, actual);
}

[Fact]
Expand Down
Loading

0 comments on commit 48a4ac5

Please sign in to comment.