-
Notifications
You must be signed in to change notification settings - Fork 1k
/
CnblogsSpider.cs
118 lines (106 loc) · 4.04 KB
/
CnblogsSpider.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
using System;
using System.ComponentModel;
using System.Globalization;
using System.Threading;
using System.Threading.Tasks;
using DotnetSpider.DataFlow;
using DotnetSpider.DataFlow.Parser;
using DotnetSpider.Http;
using DotnetSpider.Selector;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Serilog;
namespace DotnetSpider.Sample.samples;
/// <summary>
/// Sample spider for news.cnblogs.com: walks the paged news listing, then follows
/// every listed article and emits one <see cref="News"/> item per article page.
/// </summary>
[DisplayName("博客园爬虫")]
public class CnBlogsSpider(
    IOptions<SpiderOptions> options,
    DependenceServices services,
    ILogger<Spider> logger)
    : Spider(options, services, logger)
{
    /// <summary>
    /// Builds a host for this spider with Serilog logging and runs it to completion.
    /// </summary>
    public static async Task RunAsync()
    {
        var builder = Builder.CreateDefaultBuilder<CnBlogsSpider>(x =>
        {
            x.Speed = 2;
        });
        builder.UseSerilog();
        await builder.Build().RunAsync();
    }

    /// <summary>
    /// Registers the two parsers and enqueues the first listing page as the seed request.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the host.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
    {
        AddDataFlow<ListNewsParser>();
        AddDataFlow<NewsParser>();
        // Left disabled in the sample; enable to add the default storage data flow.
        // AddDataFlow(GetDefaultStorage);
        var request = new Request("https://news.cnblogs.com/n/page/1");
        // NOTE(review): the User-Agent is deliberately set to an empty string here;
        // presumably to override a framework default — confirm before changing.
        request.Headers.UserAgent = "";
        await AddRequestsAsync(request);
    }

    /// <summary>
    /// Parses a listing page: extracts the summary fields of each news entry and
    /// schedules a follow-up request for the article page, carrying those fields
    /// along in the request properties.
    /// </summary>
    protected class ListNewsParser : DataParser
    {
        /// <summary>
        /// Restricts this parser to listing-page URLs and follows the pager links.
        /// </summary>
        public override Task InitializeAsync()
        {
            AddRequiredValidator("news\\.cnblogs\\.com/n/page");
            // Follow the links inside the pager so that every listing page is collected.
            AddFollowRequestQuerier(Selectors.XPath(".//div[@class='pager']"));
            return Task.CompletedTask;
        }

        /// <summary>
        /// Creates one follow-up request per news entry found on the listing page.
        /// Entries without a usable link are skipped.
        /// </summary>
        /// <param name="context">Data-flow context of the listing-page response.</param>
        protected override Task ParseAsync(DataFlowContext context)
        {
            var newsList = context.Selectable.SelectList(Selectors.XPath(".//div[@class='news_block']"));
            if (newsList is null)
            {
                // Nothing matched on this page (layout change, empty page, error page).
                return Task.CompletedTask;
            }

            foreach (var news in newsList)
            {
                var url = news.Select(Selectors.XPath(".//h2[@class='news_entry']/a/@href"))?.Value;
                if (string.IsNullOrWhiteSpace(url))
                {
                    continue;
                }

                // Resolve against the current page so relative hrefs ("/n/123/") work too;
                // an absolute href is returned unchanged by this overload.
                if (!Uri.TryCreate(context.Request.RequestUri, url, out var detailUri))
                {
                    continue;
                }

                var title = news.Select(Selectors.XPath(".//h2[@class='news_entry']"))?.Value;
                var summary = news.Select(Selectors.XPath(".//div[@class='entry_summary']"))?.Value;
                // Value itself may be null even when the node exists, hence the second "?.".
                var views = news.Select(Selectors.XPath(".//span[@class='view']"))?.Value?.Replace(" 人浏览", "");

                var request = context.CreateNewRequest(detailUri);
                request.Properties.Add("title", title);
                request.Properties.Add("url", url);
                request.Properties.Add("summary", summary);
                request.Properties.Add("views", views);
                context.AddFollowRequests(request);
            }

            return Task.CompletedTask;
        }
    }

    /// <summary>
    /// Parses an article page and combines its body with the fields that
    /// <see cref="ListNewsParser"/> attached to the request.
    /// </summary>
    protected class NewsParser : DataParser
    {
        /// <summary>
        /// Restricts this parser to article URLs (numeric id under /n/).
        /// </summary>
        public override Task InitializeAsync()
        {
            AddRequiredValidator("news\\.cnblogs\\.com/n/\\d+");
            return Task.CompletedTask;
        }

        /// <summary>
        /// Builds a <see cref="News"/> item from the request properties and the page
        /// body and adds it to the context under the item's full type name.
        /// </summary>
        /// <param name="context">Data-flow context of the article-page response.</param>
        protected override Task ParseAsync(DataFlowContext context)
        {
            var typeName = typeof(News).FullName;
            var url = context.Request.RequestUri.ToString();
            var title = context.Request.Properties["title"]?.ToString()?.Trim();
            var summary = context.Request.Properties["summary"]?.ToString()?.Trim();

            // The view counter is scraped text and may be missing, empty or non-numeric;
            // fall back to 0 instead of throwing and losing the whole item.
            string viewsText = context.Request.Properties["views"]?.ToString()?.Trim();
            if (!int.TryParse(viewsText, NumberStyles.Integer, CultureInfo.InvariantCulture, out int views))
            {
                views = 0;
            }

            var content = context.Selectable.Select(Selectors.XPath(".//div[@id='news_body']"))?.Value
                ?.Trim();

            context.AddData(typeName,
                new News
                {
                    Url = url,
                    Title = title,
                    Summary = summary,
                    Views = views,
                    Content = content
                });
            return Task.CompletedTask;
        }
    }

    /// <summary>
    /// One scraped news article.
    /// </summary>
    protected class News
    {
        /// <summary>Headline text taken from the listing entry.</summary>
        public string Title { get; set; }

        /// <summary>URL of the article page the item was parsed from.</summary>
        public string Url { get; set; }

        /// <summary>Summary text taken from the listing entry.</summary>
        public string Summary { get; set; }

        /// <summary>View count taken from the listing entry; 0 when it could not be parsed.</summary>
        public int Views { get; set; }

        /// <summary>Content of the article's <c>news_body</c> element.</summary>
        public string Content { get; set; }
    }
}