# crawl.rb
# Forked from blinry/anki-german-sign-language.
require "open-uri"
# Requires the nokogiri gem (install with: gem install nokogiri)
require "nokogiri"
# Crawl signdict.org and write one tab-separated line per sign video to the
# "entries" file that lives next to this script:
#
#   <video href>\t<headline>[\t<note>]
#
# The file is emptied first, filled from the per-letter listing pages, and then
# topped up from the entry URLs listed in the "Umlaute" file.

# Fetches a single entry page and builds one output line per video variant
# linked from the page's sidebar.
#
# @param entry_url [String] URL of an entry page
# @return [Array<String>] lines without trailing newline; the note column is
#   only present when the paragraph directly after the headline is non-empty
def scrape_entry_lines(entry_url)
  # Block form so the underlying handle is closed as soon as the page is parsed.
  URI.open(entry_url) do |page|
    doc = Nokogiri::HTML(page.read)
    headline = doc.css(".so-video-details--headline").text
    note = doc.css(".so-video-details--headline + p").text
    note_column = note.empty? ? "" : "\t#{note}"
    doc.css(".sc-sidebar .so-video-list--item--thumbnail a").map do |link|
      "#{link['href']}\t#{headline}#{note_column}"
    end
  end
end

# Appends the given lines, each terminated by "\n", to the entries file.
#
# @param entries_path [String] path of the file to append to
# @param lines [Array<String>] lines to write
# @return [void]
def append_entry_lines(entries_path, lines)
  return if lines.empty?

  # Opened and closed per entry so already-crawled data is on disk even if a
  # later request fails.
  File.open(entries_path, "a") do |file|
    lines.each { |line| file.write("#{line}\n") }
  end
end

entries_path = File.expand_path("entries", File.dirname(__FILE__))
File.write(entries_path, "") # start from an empty file

# Captures the numeric entry id (and the link text, which is not used) from
# every search-result link on a listing page.
entry_link = /"so-search-result--link" href="\/entry\/(\d+)-[^"]+">([^<]+)</

(("A".."Z").to_a + ["0-9"]).each do |letter|
  previous_listing = ""
  1.upto(100) do |pageno|
    listing = URI.open("https://signdict.org/entry?letter=#{letter}&page=#{pageno}", &:read)
    # Stop paging once the server answers with the same body as the previous page.
    break if listing == previous_listing

    previous_listing = listing
    entry_ids = listing.scan(entry_link).map(&:first)
    entry_ids.each do |id|
      lines = scrape_entry_lines("https://signdict.org/entry/#{id}")
      append_entry_lines(entries_path, lines)
      # Progress output: exactly the lines that were written to the file.
      lines.each { |line| puts line }
      STDOUT.flush
    end
    break if entry_ids.empty?
  end
end

# Add the entries that are still missing because they start with an umlaut;
# their entry URLs are listed one per line in the "Umlaute" file.
umlaut_path = File.expand_path('Umlaute', File.dirname(__FILE__))
umlautlinks = IO.readlines(umlaut_path, chomp: true).map(&:strip).reject(&:empty?).uniq
umlautlinks.each do |ulink|
  append_entry_lines(entries_path, scrape_entry_lines(ulink))
end
# Remove duplicate lines from the "entries" file next to this script, keeping
# the first occurrence of each line in its original order.
entries_path = File.expand_path("entries", File.dirname(__FILE__))

# Line terminators are stripped before comparing so that a final line without
# a trailing newline is still recognised as a duplicate of an earlier line.
entries = File.readlines(entries_path, chomp: true).uniq

# Mode "w" truncates and rewrites in a single open; puts re-adds one newline
# per line.
File.open(entries_path, "w") do |f|
  entries.each { |element| f.puts(element) }
end