-
Notifications
You must be signed in to change notification settings - Fork 1k
/
TestSpider2.cs
94 lines (80 loc) · 3.17 KB
/
TestSpider2.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
using System;
using System.ComponentModel.DataAnnotations;
using System.Threading;
using System.Threading.Tasks;
using DotnetSpider.DataFlow.Parser;
using DotnetSpider.DataFlow.Parser.Formatters;
using DotnetSpider.DataFlow.Storage.Entity;
using DotnetSpider.Http;
using DotnetSpider.Infrastructure;
using DotnetSpider.Selector;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Serilog;
namespace DotnetSpider.Sample.samples;
/// <summary>
/// Sample spider that seeds a crawl at the first page of the cnblogs news
/// listing, parses responses into <see cref="CnblogsEntity"/> instances and
/// forwards them to the data flow produced by <c>GetDefaultStorage</c>.
/// </summary>
/// <param name="options">Spider options forwarded to the base <c>Spider</c>.</param>
/// <param name="services">Dependence services forwarded to the base <c>Spider</c>.</param>
/// <param name="logger">Logger forwarded to the base <c>Spider</c>.</param>
public class TestSpider2(
    IOptions<SpiderOptions> options,
    DependenceServices services,
    ILogger<Spider> logger)
    : Spider(options, services, logger)
{
    /// <summary>
    /// Builds a host for <see cref="TestSpider2"/> with <c>Speed</c> set to 1,
    /// enables Serilog on the builder and runs the host to completion.
    /// </summary>
    /// <returns>A task that completes when the host has finished running.</returns>
    public static async Task RunAsync()
    {
        // NOTE(review): Speed = 1 is presumably a request-rate limit; confirm against SpiderOptions.
        var builder = Builder.CreateDefaultBuilder<TestSpider2>(options =>
        {
            options.Speed = 1;
        });
        builder.UseSerilog();
        await builder.Build().RunAsync();
    }

    /// <summary>
    /// Queues the seed request and registers the data flows: first the entity
    /// parser for <see cref="CnblogsEntity"/>, then the default storage.
    /// </summary>
    /// <param name="stoppingToken">
    /// Cancellation token supplied by the base class; this override does not consult it.
    /// </param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken)
    {
        // Plain literal: the URL has no interpolation holes, so no '$' prefix is needed.
        await AddRequestsAsync(new Request("https://news.cnblogs.com/n/page/1/"));

        // Registration order is kept as-is: parser first, storage second.
        AddDataFlow<DataParser<CnblogsEntity>>();
        AddDataFlow(GetDefaultStorage);
    }

    /// <summary>
    /// Creates the identifier for this spider: a freshly generated object id
    /// paired with the fixed display name "测试爬虫 1".
    /// </summary>
    /// <returns>A new <c>SpiderId</c>.</returns>
    protected override SpiderId GenerateSpiderId()
        => new(ObjectId.CreateId().ToString(), "测试爬虫 1");

    /// <summary>
    /// Entity extracted from each <c>div.news_block</c> element of a listing page
    /// and mapped to schema ("cnblogs", "news"). Follow-up requests are taken from
    /// the <c>div.pager</c> element, restricted to URLs matching
    /// <c>news\.cnblogs\.com/n/page</c>.
    /// </summary>
    [Schema("cnblogs", "news")]
    [EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)]
    [GlobalValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)]
    [FollowRequestSelector(Expressions = ["//div[@class='pager']"],
        Patterns = ["news\\.cnblogs\\.com/n/page"])]
    public class CnblogsEntity : EntityBase<CnblogsEntity>
    {
        /// <summary>
        /// Declares an index on <see cref="Title"/> and a composite index on
        /// (<see cref="WebSite"/>, <see cref="Guid"/>).
        /// </summary>
        protected override void Configure()
        {
            HasIndex(x => x.Title);

            // NOTE(review): the trailing 'true' presumably marks this index as unique; confirm against EntityBase.HasIndex.
            HasIndex(x => new { x.WebSite, x.Guid }, true);
        }

        /// <summary>Numeric identifier; no selector is attached to it.</summary>
        public int Id { get; set; }

        /// <summary>
        /// Read from the environment value "类别", the name under which the
        /// class-level <c>GlobalValueSelector</c> publishes the current pager link.
        /// </summary>
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "类别", Type = SelectorType.Environment)]
        public string Category { get; set; }

        /// <summary>Read from the environment value "网站".</summary>
        // NOTE(review): nothing visible in this class supplies an environment value
        // named "网站" (no GlobalValueSelector declares it and the seed request is
        // created without properties), yet the property is [Required]; verify where
        // the value is expected to come from.
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "网站", Type = SelectorType.Environment)]
        public string WebSite { get; set; }

        /// <summary>
        /// Value selected by <c>//title</c>, with " - 博客园" replaced by an empty string.
        /// </summary>
        [StringLength(200)]
        [ValueSelector(Expression = "//title")]
        [ReplaceFormatter(NewValue = "", OldValue = " - 博客园")]
        public string Title { get; set; }

        /// <summary>Read from the environment value "GUID".</summary>
        [StringLength(40)]
        [ValueSelector(Expression = "GUID", Type = SelectorType.Environment)]
        public string Guid { get; set; }

        /// <summary>Value of the anchor inside the entry's <c>h2.news_entry</c> heading.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
        public string News { get; set; }

        /// <summary>The <c>href</c> attribute of the anchor inside <c>h2.news_entry</c>.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
        public string Url { get; set; }

        /// <summary>Value of the entry's <c>div.entry_summary</c> element.</summary>
        [ValueSelector(Expression = ".//div[@class='entry_summary']")]
        public string PlainText { get; set; }

        /// <summary>Read from the environment value "DATETIME".</summary>
        [ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)]
        public DateTime CreationTime { get; set; }
    }
}