-
Notifications
You must be signed in to change notification settings - Fork 560
/
AngleSharpHyperLinkParser.cs
85 lines (69 loc) · 3.09 KB
/
AngleSharpHyperLinkParser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
using Abot2.Poco;
using System;
using System.Collections.Generic;
using System.Linq;
using AngleSharp.Dom;
namespace Abot2.Core
{
/// <summary>
/// Parser that uses AngleSharp https://github.com/AngleSharp/AngleSharp to parse page links
/// </summary>
public class AngleSharpHyperlinkParser : HyperLinkParser
{
    /// <summary>
    /// Creates a parser through the base class's parameterless constructor.
    /// </summary>
    public AngleSharpHyperlinkParser()
    {
    }

    /// <summary>
    /// Creates a parser, forwarding both arguments unchanged to the base class constructor.
    /// </summary>
    /// <param name="config">Crawl configuration passed to the base class.</param>
    /// <param name="cleanUrlFunc">Url cleaning function passed to the base class.</param>
    public AngleSharpHyperlinkParser(CrawlConfiguration config, Func<string, string> cleanUrlFunc)
        : base(config, cleanUrlFunc)
    {
    }

    /// <summary>
    /// Name identifying this parser implementation; always "AngleSharp".
    /// </summary>
    protected override string ParserType
    {
        get { return "AngleSharp"; }
    }

    /// <summary>
    /// Collects the raw (unresolved) links found in the page's AngleSharp document.
    /// </summary>
    /// <param name="crawledPage">The page whose parsed document is inspected.</param>
    /// <returns>
    /// <c>null</c> when <c>HasRobotsNoFollow</c> reports true for the page; otherwise a lazily
    /// evaluated sequence made of the "a" and "area" elements that have a non-blank href and
    /// are not excluded by <see cref="HasRelNoFollow"/>, followed by any "link" elements
    /// accepted by <see cref="HasRelCanonicalPointingToDifferentUrl"/>.
    /// </returns>
    protected override IEnumerable<HyperLink> GetRawHyperLinks(CrawledPage crawledPage)
    {
        if (HasRobotsNoFollow(crawledPage))
        {
            return null;
        }

        var document = crawledPage.AngleSharpHtmlDocument;

        var hrefValues = document.QuerySelectorAll("a, area")
            .Where(e => !HasRelNoFollow(e))
            .Select(y => new HyperLink() { RawHrefValue = y.GetAttribute("href"), RawHrefText = y.Text() })
            .Where(e => !string.IsNullOrWhiteSpace(e.RawHrefValue));

        // The page url is read inside the lambda, so it is only evaluated when the
        // sequence is enumerated and a "link" element is actually present.
        var canonicalHref = document.QuerySelectorAll("link")
            .Where(e => HasRelCanonicalPointingToDifferentUrl(e, crawledPage.Uri.ToString()))
            .Select(e => new HyperLink() { RawHrefValue = e.GetAttribute("href"), RawHrefText = e.Text() });

        return hrefValues.Concat(canonicalHref);
    }

    /// <summary>
    /// Reads the href of the first "base" element in the page's document.
    /// </summary>
    /// <param name="crawledPage">The page whose parsed document is inspected.</param>
    /// <returns>
    /// The trimmed href value, or an empty string when there is no "base" element or it
    /// has no href attribute.
    /// </returns>
    protected override string GetBaseHrefValue(CrawledPage crawledPage)
    {
        var baseTag = crawledPage.AngleSharpHtmlDocument.QuerySelector("base");
        if (baseTag == null)
        {
            return "";
        }

        var baseTagValue = baseTag.Attributes["href"];
        if (baseTagValue == null)
        {
            return "";
        }

        return baseTagValue.Value.Trim();
    }

    /// <summary>
    /// Reads the content of the first "meta" element whose name is "robots" (case-insensitive).
    /// </summary>
    /// <param name="crawledPage">The page whose parsed document is inspected.</param>
    /// <returns>
    /// The content attribute value, or an empty string when no such element exists or the
    /// element has no content attribute.
    /// </returns>
    protected override string GetMetaRobotsValue(CrawledPage crawledPage)
    {
        var robotsMeta = crawledPage.AngleSharpHtmlDocument
            .QuerySelectorAll("meta[name]")
            .FirstOrDefault(d => string.Equals(d.GetAttribute("name"), "robots", StringComparison.OrdinalIgnoreCase));

        if (robotsMeta == null)
        {
            return "";
        }

        // A robots meta tag without a content attribute yields "" as well, so both
        // "nothing to report" paths of this method return the same value.
        return robotsMeta.GetAttribute("content") ?? "";
    }

    /// <summary>
    /// Tells whether an element declares rel "canonical" and has a non-blank href that
    /// differs (ordinal, case-insensitive) from the given url.
    /// </summary>
    /// <param name="e">The element to inspect.</param>
    /// <param name="orginalUrl">
    /// The url the href is compared against. The parameter name's spelling is kept as-is
    /// so existing callers using named arguments keep compiling.
    /// </param>
    /// <returns>True when the element is a canonical link to a different url.</returns>
    protected virtual bool HasRelCanonicalPointingToDifferentUrl(IElement e, string orginalUrl)
    {
        if (!RelContainsToken(e, "canonical"))
        {
            return false;
        }

        var href = e.GetAttribute("href");
        return !string.IsNullOrWhiteSpace(href) &&
               !string.Equals(href, orginalUrl, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Tells whether an element must be skipped because of a rel "nofollow" token.
    /// </summary>
    /// <param name="e">The element to inspect.</param>
    /// <returns>
    /// True only when <c>Config.IsRespectAnchorRelNoFollowEnabled</c> is set and the element's
    /// rel attribute contains the "nofollow" token (case-insensitive), alone or among other
    /// tokens such as in rel="nofollow noopener".
    /// </returns>
    protected virtual bool HasRelNoFollow(IElement e)
    {
        return Config.IsRespectAnchorRelNoFollowEnabled && RelContainsToken(e, "nofollow");
    }

    /// <summary>
    /// Checks whether the element's rel attribute, read as a whitespace-separated list of
    /// tokens, contains the given token (ordinal, case-insensitive).
    /// </summary>
    /// <param name="e">The element whose rel attribute is read.</param>
    /// <param name="token">The link type to look for, e.g. "nofollow".</param>
    /// <returns>False when the attribute is missing or blank; otherwise whether the token is present.</returns>
    private static bool RelContainsToken(IElement e, string token)
    {
        var rel = e.GetAttribute("rel");
        if (string.IsNullOrWhiteSpace(rel))
        {
            return false;
        }

        // A null separator array makes Split break on any white-space character.
        return rel.Split((char[])null, StringSplitOptions.RemoveEmptyEntries)
            .Any(t => string.Equals(t, token, StringComparison.OrdinalIgnoreCase));
    }
}
}