-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-all-magzione.rb
138 lines (126 loc) · 5.13 KB
/
get-all-magzione.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
require 'rest-client'
require 'nokogiri'
# Scrapes the magazine index of qingmang.me and the article lists of each
# magazine, writing the results to tab-separated text files in the current
# working directory.
class Test
  BASE_URL = 'http://qingmang.me'.freeze
  USER_AGENT = 'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; HUAWEI RIO-AL00 Build/HuaweiRIO-AL00) ' \
               'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 ' \
               'UCBrowser/11.4.5.937 Mobile Safari/537.36'.freeze
  # Highest "?page=N" value requested for one magazine.
  MAX_PAGES = 50
  # Pause (seconds) after each article page, to avoid 504 timeouts from the server.
  PAGE_DELAY = 0.35
  # Pause (seconds) after each magazine.
  MAGAZINE_DELAY = 0.8

  # Fetches the magazine index page and writes one tab-separated line per
  # magazine (links, title, description, background image URL) to
  # "magzines.txt", truncating any previous content.
  #
  # @return [Array<Array<String>>] the list of hrefs found in each magazine
  #   entry, sorted
  def start
    urls = []
    response = RestClient::Request.execute(method: :get, url: "#{BASE_URL}/magazines/", timeout: 500)
    magazines = Nokogiri::HTML(response).css('li.magazine')
    # Block form closes the file even if a malformed entry raises.
    File.open('magzines.txt', 'w+') do |f|
      magazines.each do |node|
        hrefs = node.css('a').map { |anchor| anchor['href'] }
        title = node.css('a h3')[0].text
        summary = node.css('a p').text
        f.puts [hrefs, title, summary, background_image_url(node)].join("\t")
        urls.push hrefs
      end
    end
    urls.sort
  end

  # Requests +url+ with a GET and returns the rest-client response.
  #
  # NOTE(review): rest-client takes request headers from the :headers option;
  # `user_agent:` is passed here as a top-level option, so it is probably not
  # sent as a header. Left unchanged on purpose: moving it under `headers:`
  # could change the markup the site serves -- confirm before changing.
  #
  # @param url [String] absolute URL to fetch
  def request_url(url)
    RestClient::Request.execute(method: :get, url: url, timeout: 500, user_agent: USER_AGENT)
  end

  # Reads the magazine list stored in file +f+ (the format written by #start)
  # and returns the absolute magazine URLs.
  #
  # Lines with fewer than four tab-separated fields are ignored. Any error
  # while reading is reported on stdout and whatever was collected so far is
  # returned (an empty list when the file cannot be read at all).
  #
  # @param f [#to_s] path of the list file
  # @return [Array<String>] unique URLs, sorted
  def readFile(f)
    urls = []
    begin
      File.readlines(f.to_s).each do |line|
        fields = line.split("\t")
        urls << BASE_URL + fields[0] if fields.length > 3
      end
    rescue StandardError => e
      puts "file #{f} not exists ,exexception :", e
    end
    urls.uniq.sort
  end

  # Fetches every article page of each magazine and hands the article nodes
  # to #parse_content, which appends them to the magazine's text file.
  #
  # For each magazine the bare URL is fetched first, then "?page=1" up to
  # "?page=#{MAX_PAGES}", stopping at the first page without articles.
  # TODO: decide when to stop from the previous/next page links instead.
  # NOTE(review): if the site serves the bare URL as page 1, that page is
  # parsed (and appended to the file) twice -- confirm against the site.
  #
  # @param urls [Array<String>] absolute magazine URLs
  # @return [Array<String>] +urls+; the return value carries no information
  def each_magazines(urls)
    urls.each do |url|
      parse_content(fetch_articles(url), url)

      (1..MAX_PAGES).each do |page_index|
        articles = fetch_articles("#{url}?page=#{page_index}")
        break if articles.empty?

        parse_content(articles, url)
        sleep PAGE_DELAY
      end

      sleep MAGAZINE_DELAY
    end
  end

  # Extracts the index data of each article node and appends it as one
  # tab-separated line to "<magazine id>.txt", where the id is taken from the
  # "magazines/<id>/" part of +url+.
  #
  # An article whose extraction raises a StandardError is reported on stdout
  # and skipped; interrupts and exit requests are not swallowed.
  #
  # @param articles [Enumerable] article nodes (each must respond to #css)
  # @param url [String] URL of the magazine the articles belong to
  # @return [Hash{String => Array}] article URL => extracted fields
  def parse_content(articles, url)
    article_maps = {}
    magziname = url[%r{magazines/(.*)/}, 1].to_s + '.txt'
    File.open(magziname, 'a+') do |f|
      articles.each do |node|
        begin
          fields = article_fields(node)
          f.puts fields.join("\t")
          article_maps[fields.first] = fields
        rescue StandardError => e
          puts " [#{magziname}] fetch err #{e}"
        end
      end
    end
    article_maps
  end

  private

  # Downloads +url+ and returns its "li.article-compact" nodes.
  def fetch_articles(url)
    Nokogiri::HTML(request_url(url)).css('li.article-compact')
  end

  # Builds the field list of one article node: absolute link, title, lead
  # text, background image URL, meta image sources, meta text with all
  # whitespace removed, and the title attributes of its time elements.
  def article_fields(node)
    href = node.css('a').map { |anchor| anchor['href'] }.first
    [
      BASE_URL + href.to_s,
      node.css('a h3')[0].text,
      node.css("a p[class='lead']").text,
      background_image_url(node),
      node.css('a p img').map { |img| img['src'] },
      node.css("a p[class='meta']").text.to_s.gsub(/\s/, ''),
      node.css('a time').map { |time| time['title'] }
    ]
  end

  # Returns the text between the last "(" and the following ")" of the style
  # attribute of the node's first "a div" (the url(...) of its background
  # image), "" when there is no such div, or nil when the style is empty.
  def background_image_url(node)
    div = node.css('a div').first
    return '' unless div

    div['style'].to_s.split('(').last.to_s.split(')').first
  end
end
# Step 1: build the list of all magazines and store it in magzines.txt
# (disabled; uncomment the `start` line below to refresh the list).
scraper = Test.new
# puts scraper.start
# Step 2: fetch the article list of every magazine read back from that file.
scraper.each_magazines(scraper.readFile("magzines.txt"))
# Step 3: resume helper. Finds the position of a magazine in +urls+ and
# returns the part of the list starting at that magazine.
#
# Prints the index found (a blank line when the magazine is not in the list),
# then the remaining URLs and their count.
#
# @param urls [Array<String>] absolute magazine URLs
# @param magazine_id [#to_s] id as it appears in ".../magazines/<id>/"
# @return [Array<String>] URLs from the matching magazine to the end of the
#   list; empty when the magazine is not in the list
def interupt(urls, magazine_id)
  target = "http://qingmang.me/magazines/#{magazine_id}/"
  # If the URL occurs more than once, the last occurrence is used.
  idx = urls.rindex(target)
  puts idx
  # Without this guard a nil index would build the range nil..-1, which
  # selects the whole list on recent Rubies instead of signalling "not found".
  remaining = idx ? urls[idx..-1] : []
  puts remaining, remaining.length
  remaining
end