-
Notifications
You must be signed in to change notification settings - Fork 560
/
Scheduler.cs
130 lines (110 loc) · 3.35 KB
/
Scheduler.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
using Abot2.Poco;
using System;
using System.Collections.Generic;
namespace Abot2.Core
{
/// <summary>
/// Handles managing the priority of what pages need to be crawled, and tracks which Uris are
/// already known so callers can ask whether a Uri has been seen.
/// </summary>
public interface IScheduler : IDisposable
{
/// <summary>
/// Number of pages that are currently scheduled and have not yet been handed out.
/// </summary>
int Count { get; }
/// <summary>
/// Schedules a single page to be crawled.
/// </summary>
/// <param name="page">The page to schedule.</param>
void Add(PageToCrawl page);
/// <summary>
/// Schedules every page in the given sequence to be crawled.
/// </summary>
/// <param name="pages">The pages to schedule.</param>
void Add(IEnumerable<PageToCrawl> pages);
/// <summary>
/// Gets the next page to crawl.
/// </summary>
/// <returns>
/// The next scheduled page. NOTE(review): what is returned when nothing is scheduled is
/// implementation-defined and not visible from this interface; confirm against the implementation in use.
/// </returns>
PageToCrawl GetNext();
/// <summary>
/// Clears all currently scheduled pages.
/// </summary>
void Clear();
/// <summary>
/// Adds the Uri to the list of known Uris without scheduling it to be crawled.
/// </summary>
/// <param name="uri">The Uri to record as known.</param>
void AddKnownUri(Uri uri);
/// <summary>
/// Returns whether or not the specified Uri was already scheduled to be crawled or simply added to the
/// list of known Uris.
/// </summary>
/// <param name="uri">The Uri to look up.</param>
/// <returns>True if the Uri is known; otherwise false.</returns>
bool IsUriKnown(Uri uri);
}
/// <summary>
/// <see cref="IScheduler"/> implementation that keeps pages waiting to be crawled in an
/// <see cref="IPagesToCrawlRepository"/> and records the Uris it has seen in an
/// <see cref="ICrawledUrlRepository"/>.
/// </summary>
public class Scheduler : IScheduler
{
    private bool _allowUriRecrawling;
    private ICrawledUrlRepository _crawledUrlRepo;
    private IPagesToCrawlRepository _pagesToCrawlRepo;

    /// <summary>
    /// Creates a scheduler with Uri recrawling disabled and the default repositories.
    /// </summary>
    public Scheduler()
        : this(false, null, null)
    {
    }

    /// <summary>
    /// Creates a scheduler.
    /// </summary>
    /// <param name="allowUriRecrawling">
    /// When true, pages are scheduled without consulting the crawled-url repository.
    /// </param>
    /// <param name="crawledUrlRepo">
    /// Repository of known Uris; a <see cref="CompactCrawledUrlRepository"/> is created when null.
    /// </param>
    /// <param name="pagesToCrawlRepo">
    /// Repository of scheduled pages; a <see cref="FifoPagesToCrawlRepository"/> is created when null.
    /// </param>
    public Scheduler(bool allowUriRecrawling, ICrawledUrlRepository crawledUrlRepo, IPagesToCrawlRepository pagesToCrawlRepo)
    {
        _allowUriRecrawling = allowUriRecrawling;

        if (crawledUrlRepo == null)
        {
            _crawledUrlRepo = new CompactCrawledUrlRepository();
        }
        else
        {
            _crawledUrlRepo = crawledUrlRepo;
        }

        if (pagesToCrawlRepo == null)
        {
            _pagesToCrawlRepo = new FifoPagesToCrawlRepository();
        }
        else
        {
            _pagesToCrawlRepo = pagesToCrawlRepo;
        }
    }

    /// <summary>
    /// Number of pages currently held by the pages-to-crawl repository.
    /// </summary>
    public int Count
    {
        get
        {
            return _pagesToCrawlRepo.Count();
        }
    }

    /// <summary>
    /// Schedules a page to be crawled. Retry pages, and every page when Uri recrawling is allowed,
    /// are queued unconditionally. Any other page is queued only when the crawled-url repository
    /// reports its Uri as new.
    /// </summary>
    /// <param name="page">The page to schedule.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
    public void Add(PageToCrawl page)
    {
        if (page == null)
        {
            throw new ArgumentNullException("page");
        }

        // The crawled-url repository is only consulted (and updated) when neither bypass applies.
        bool queueUnconditionally = _allowUriRecrawling || page.IsRetry;
        if (!queueUnconditionally && !_crawledUrlRepo.AddIfNew(page.Uri))
        {
            return;
        }

        _pagesToCrawlRepo.Add(page);
    }

    /// <summary>
    /// Schedules each page of the sequence, in enumeration order, via <see cref="Add(PageToCrawl)"/>.
    /// </summary>
    /// <param name="pages">The pages to schedule.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="pages"/> is null, or when one of its elements is null.
    /// </exception>
    public void Add(IEnumerable<PageToCrawl> pages)
    {
        if (pages == null)
        {
            throw new ArgumentNullException("pages");
        }

        foreach (var candidate in pages)
        {
            Add(candidate);
        }
    }

    /// <summary>
    /// Returns whatever the pages-to-crawl repository yields as its next page.
    /// </summary>
    public PageToCrawl GetNext()
    {
        return _pagesToCrawlRepo.GetNext();
    }

    /// <summary>
    /// Clears the pages-to-crawl repository. The crawled-url repository is not touched.
    /// </summary>
    public void Clear()
    {
        _pagesToCrawlRepo.Clear();
    }

    /// <summary>
    /// Records the Uri in the crawled-url repository without scheduling a page for it.
    /// </summary>
    /// <param name="uri">The Uri to record.</param>
    public void AddKnownUri(Uri uri)
    {
        _crawledUrlRepo.AddIfNew(uri);
    }

    /// <summary>
    /// Reports whether the crawled-url repository contains the Uri.
    /// </summary>
    /// <param name="uri">The Uri to look up.</param>
    /// <returns>The result of the repository's Contains check.</returns>
    public bool IsUriKnown(Uri uri)
    {
        return _crawledUrlRepo.Contains(uri);
    }

    /// <summary>
    /// Disposes the crawled-url repository and then the pages-to-crawl repository.
    /// </summary>
    public void Dispose()
    {
        ICrawledUrlRepository knownUris = _crawledUrlRepo;
        if (knownUris != null)
        {
            knownUris.Dispose();
        }

        IPagesToCrawlRepository pendingPages = _pagesToCrawlRepo;
        if (pendingPages != null)
        {
            pendingPages.Dispose();
        }
    }
}
}