-
Notifications
You must be signed in to change notification settings - Fork 1k
/
EntitySpider.cs
113 lines (98 loc) · 3.8 KB
/
EntitySpider.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
using System;
using System.Collections.Generic;
using System.ComponentModel.DataAnnotations;
using System.Threading;
using System.Threading.Tasks;
using DotnetSpider.DataFlow.Parser;
using DotnetSpider.DataFlow.Parser.Formatters;
using DotnetSpider.DataFlow.Storage.Entity;
using DotnetSpider.Http;
using DotnetSpider.Infrastructure;
using DotnetSpider.MySql.Scheduler;
using DotnetSpider.Selector;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Serilog;
namespace DotnetSpider.Sample.samples;
/// <summary>
/// Sample spider that seeds the cnblogs news listing page and registers a
/// <see cref="CnblogsEntry"/> parser plus the default storage as its data flows.
/// </summary>
public class EntitySpider(
    IOptions<SpiderOptions> options,
    DependenceServices services,
    ILogger<Spider> logger)
    : Spider(options, services, logger)
{
    /// <summary>
    /// Builds a host for this spider with <c>Speed</c> set to 1, enables Serilog,
    /// calls <c>IgnoreServerCertificateError</c> on the builder, and runs the host.
    /// </summary>
    /// <returns>A task that completes when the host run finishes.</returns>
    public static async Task RunAsync()
    {
        var spiderBuilder = Builder.CreateDefaultBuilder<EntitySpider>(
            spiderOptions => spiderOptions.Speed = 1);
        spiderBuilder.UseSerilog();
        spiderBuilder.IgnoreServerCertificateError();

        var host = spiderBuilder.Build();
        await host.RunAsync();
    }

    /// <summary>
    /// Same setup as <see cref="RunAsync"/>, but additionally registers the MySql
    /// queue BFS scheduler before building and running the host.
    /// </summary>
    /// <remarks>
    /// The scheduler connection string is read from the configuration key
    /// <c>SchedulerConnectionString</c>.
    /// </remarks>
    /// <returns>A task that completes when the host run finishes.</returns>
    public static async Task RunMySqlQueueAsync()
    {
        var spiderBuilder = Builder.CreateDefaultBuilder<EntitySpider>(
            spiderOptions => spiderOptions.Speed = 1);
        spiderBuilder.UseSerilog();
        spiderBuilder.IgnoreServerCertificateError();
        spiderBuilder.UseMySqlQueueBfsScheduler(
            (hostContext, schedulerOptions) =>
                schedulerOptions.ConnectionString = hostContext.Configuration["SchedulerConnectionString"]);

        var host = spiderBuilder.Build();
        await host.RunAsync();
    }

    /// <summary>
    /// Registers the entity parser and the default storage, then enqueues the
    /// first listing page as the seed request.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the caller; not used by this override.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken = default)
    {
        AddDataFlow<DataParser<CnblogsEntry>>();
        AddDataFlow(GetDefaultStorage);

        // The "网站" property travels with the request; CnblogsEntry.WebSite uses the
        // same key with an Environment selector (presumably resolved from these properties).
        var seedProperties = new Dictionary<string, object> { ["网站"] = "博客园" };
        var seedRequest = new Request("https://news.cnblogs.com/n/page/1", seedProperties);

        await AddRequestsAsync(seedRequest);
    }

    /// <summary>
    /// Creates the spider identity from a freshly generated object id and the name "博客园".
    /// </summary>
    protected override SpiderId GenerateSpiderId()
        => new SpiderId(ObjectId.CreateId().ToString(), "博客园");

    /// <summary>
    /// Entity describing one news item selected from a <c>news_block</c> element.
    /// Declared with schema arguments <c>"cnblogs"</c> and <c>"news"</c>.
    /// </summary>
    [Schema("cnblogs", "news")]
    [EntitySelector(Type = SelectorType.XPath, Expression = ".//div[@class='news_block']")]
    [GlobalValueSelector(Name = "类别", Type = SelectorType.XPath, Expression = ".//a[@class='current']")]
    [GlobalValueSelector(Name = "Title", Type = SelectorType.XPath, Expression = "//title")]
    [FollowRequestSelector(Expressions = ["//div[@class='pager']"])]
    public class CnblogsEntry : EntityBase<CnblogsEntry>
    {
        /// <summary>
        /// Declares an index on <see cref="Title"/> and a second index over
        /// <see cref="WebSite"/> and <see cref="Guid"/>.
        /// </summary>
        protected override void Configure()
        {
            HasIndex(entry => entry.Title);
            // Second argument is presumably the "unique" flag - confirm against EntityBase.HasIndex.
            HasIndex(entry => new { entry.WebSite, entry.Guid }, true);
        }

        /// <summary>Numeric identifier of the row.</summary>
        public int Id { get; set; }

        /// <summary>Value of the global selector named "类别".</summary>
        [Required, StringLength(200)]
        [ValueSelector(Type = SelectorType.Environment, Expression = "类别")]
        public string Category { get; set; }

        /// <summary>Value of the "网站" environment entry.</summary>
        [Required, StringLength(200)]
        [ValueSelector(Type = SelectorType.Environment, Expression = "网站")]
        public string WebSite { get; set; }

        /// <summary>Value of the global "Title" selector with the " - 博客园" suffix replaced by an empty string.</summary>
        [StringLength(200)]
        [ValueSelector(Type = SelectorType.Environment, Expression = "Title")]
        [ReplaceFormatter(OldValue = " - 博客园", NewValue = "")]
        public string Title { get; set; }

        /// <summary>Value of the "GUID" environment entry.</summary>
        [StringLength(40)]
        [ValueSelector(Type = SelectorType.Environment, Expression = "GUID")]
        public string Guid { get; set; }

        /// <summary>Content of the anchor inside the <c>news_entry</c> heading.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
        public string News { get; set; }

        /// <summary>The <c>href</c> attribute of the anchor inside the <c>news_entry</c> heading.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
        public string Url { get; set; }

        /// <summary>Content of the <c>entry_summary</c> element, passed through <c>TrimFormatter</c>.</summary>
        [ValueSelector(Expression = ".//div[@class='entry_summary']")]
        [TrimFormatter]
        public string PlainText { get; set; }

        /// <summary>Value of the "DATETIME" environment entry.</summary>
        [ValueSelector(Type = SelectorType.Environment, Expression = "DATETIME")]
        public DateTime CreationTime { get; set; }
    }
}