-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_decks.rb
144 lines (111 loc) · 3.92 KB
/
make_decks.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# encoding: utf-8
require "bundler/setup"
require "fileutils"
require "json"
require "anki"
require "nokogiri"
require "tradsim"
DATA_FILE = "web_page_archive_20160630.html"
# Reopens String with in-place helpers for cleaning up entries scraped from
# the character table. Every method here mutates the receiver.
class String
  # Matches a parenthesised simplified variant such as "(Sx)".
  SIMPLIFIED_REGEX = /\(S.+?\)/
  # Matches an optional non-whitespace character followed by a parenthesised
  # traditional variant such as "x(FX)"; group 1 captures the text after "F".
  TRADITIONAL_REGEX = /\S?\(F(.+?)\)/

  # Truncates the string to its first character, dropping alternate and
  # traditional forms (the first character is always the simplified form).
  # An empty string is left empty.
  #
  # Returns self.
  def isolate_first_version!
    replace(self[0].to_s)
  end

  # Replaces every "x(FX)" occurrence with its captured traditional text,
  # removing the character in front of the parenthesis.
  #
  # The block form is used so each match is replaced with its own capture;
  # passing $1 as a replacement string would reuse the first match's capture
  # for every occurrence.
  #
  # Returns self, or nil when nothing matched.
  def promote_traditional!
    gsub!(TRADITIONAL_REGEX) { Regexp.last_match(1) }
  end

  # Deletes every parenthesised simplified variant.
  #
  # Returns self, or nil when nothing matched.
  def remove_simplified!
    gsub!(SIMPLIFIED_REGEX, "")
  end
end
# Converts the inner HTML of a description cell into text for an Anki card.
#
# html - String of markup taken from a table cell (not modified).
#
# Returns a new String.
def description_html_to_card_text(html)
  # inner_html is used upstream because #text eats <BR> without converting it
  # to a newline, so entities have to be decoded by hand here. "&amp;" is
  # decoded last so that an escaped entity like "&amp;lt;" ends up as the
  # literal text "&lt;" rather than "<".
  text = html.gsub(/&lt;/i, "<").gsub(/&gt;/i, ">")
  text = text.gsub(/&amp;/i, "&")
  text = text.gsub(/<br>/i, "\n")
  # particles are described in <explanatory text> which Anki interprets as
  # HTML, so replace <> with {} instead. The closing ">" is part of the match
  # so it is not left behind.
  text = text.gsub(/<([^>]+)>/, "{\\1}")
  text.gsub("\n", "<br /><br />")
end

# Reads data/DATA_FILE, parses it with Nokogiri, and returns an Enumerator
# over the rows matched by "//blockquote/table/tr" (header row excluded).
# The file is read and parsed immediately; rows are processed on enumeration.
#
# options - Hash that must contain :simplified (KeyError otherwise).
#           Truthy: the character is cut down to its first character, and the
#           description has its "(F...)" forms promoted and is passed through
#           Tradsim.to_trad (presumably a simplified-to-traditional
#           conversion; confirm against the tradsim gem).
#           Falsy: the character has its "(F...)" form promoted and any
#           "(S...)" variant removed.
#
# Returns an Enumerator yielding Hashes with the String keys "character" and
# "description".
def most_common_chinese_characters(options)
  data = File.read("data/#{DATA_FILE}")
  doc = Nokogiri::HTML(data)
  simplified = options.fetch(:simplified)

  Enumerator.new do |enum|
    # drop(1) skips the header row on every enumeration; a flag kept outside
    # this block would only skip it the first time the Enumerator is consumed.
    doc.xpath("//blockquote/table/tr").drop(1).each do |row|
      cells = row.xpath("td")
      # The character is read from the second cell and the description from
      # the third; rows without both are skipped.
      next if cells.length < 3

      # ----- CHARACTER -------------------------------------------------------
      character = cells[1].text.strip
      if simplified
        # extract the first character and delete everything else
        character.isolate_first_version!
      else
        character.promote_traditional!
        character.remove_simplified!
      end

      # ----- DESCRIPTION -----------------------------------------------------
      description = description_html_to_card_text(cells[2].inner_html)
      if simplified
        description.promote_traditional!
        description = Tradsim.to_trad(description)
      end

      entry = {
        "character" => character,
        "description" => description
      }
      enum.yield(entry)
    end
  end
end
# Builds an Anki deck from the first num_cards rows yielded by
# most_common_chinese_characters and writes it to
# decks/<type>/top-<num_cards>-<type>-chinese-characters.txt.
#
# defaults to traditional characters
# pass in "simplified: true" to get simplified characters
#
# num_cards - Integer number of table rows to read. Rows that share a
#             character are merged into one card, so the deck can contain
#             fewer than num_cards cards.
# options   - Hash; only :simplified is read here. The caller's hash is left
#             unmodified.
#
# Returns whatever Anki::Deck#generate_deck returns.
# Raises ArgumentError if num_cards is not an Integer.
def build_deck_of_top_n_cards(num_cards, options = {})
  raise ArgumentError, "num_cards must be an integer" unless num_cards.is_a?(Integer)

  # Apply the default on a copy so the hash passed in by the caller is not mutated.
  options = options.merge(simplified: options[:simplified] || false)
  type = options[:simplified] ? "simplified" : "traditional"
  headers = %w[front back]
  output_deck_filename = "top-#{num_cards}-#{type}-chinese-characters.txt"
  puts "Generating: #{output_deck_filename}"

  # since there can be multiple entries for some characters, collect the
  # descriptions per character (in first-seen order) and join them later.
  descriptions_by_character = Hash.new { |hash, key| hash[key] = [] }
  most_common_chinese_characters(options).take(num_cards).each do |pair|
    descriptions_by_character[pair["character"]] << pair["description"]
  end

  cards = descriptions_by_character.map do |character, descriptions|
    { "front" => character, "back" => descriptions.join("<br /><br />") }
  end

  deck = Anki::Deck.new(card_headers: headers, card_data: cards, field_separator: "|")

  # mkdir_p creates missing parents and is a no-op for existing directories.
  output_dir = "decks/#{type}"
  FileUtils.mkdir_p(output_dir)
  deck.generate_deck(file: "#{output_dir}/#{output_deck_filename}")
end
# When run as a script (not required as a library), generate a simplified and
# a traditional deck for each deck size.
if $PROGRAM_NAME == __FILE__
  deck_sizes = [100, 250, 500, 1000, 2000, 9999]
  deck_sizes.each do |size|
    build_deck_of_top_n_cards(size, simplified: true)
    build_deck_of_top_n_cards(size)
  end
end