-
Notifications
You must be signed in to change notification settings - Fork 12
/
Rakefile
129 lines (97 loc) · 2.99 KB
/
Rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'rubygems' unless ENV['NO_RUBYGEMS']
require 'bundler'
require 'digest'
require 'fileutils'
require 'rubygems/package_task'
require './lib/cskit-strongs'
require 'cskit'
# Register Bundler's standard gem packaging tasks (e.g. build, install, release).
Bundler::GemHelper.install_tasks
# Breaks one language's lexicon XML file into numbered JSON files.
#
# The splitter's entries are grouped BLOCK_SIZE at a time; group number N is
# written to "<resource_dir>/lexicon/<lexicon_lang>/N.json" as a JSON object
# that maps each entry's first element to its last element.
class LexiconUpdater
  # Number of lexicon entries stored in each JSON file.
  BLOCK_SIZE = 100

  attr_reader :lexicon_lang, :splitter_class

  # lexicon_lang   - language name used to build the input file name and the
  #                  output directory (the rake tasks pass "greek"/"hebrew").
  # splitter_class - class instantiated with the input file path; the instance
  #                  must respond to #each_slice.
  def initialize(lexicon_lang, splitter_class)
    @lexicon_lang = lexicon_lang
    @splitter_class = splitter_class
  end

  # Reads the lexicon through the splitter and writes every block to disk,
  # printing a start and a finish message to standard output.
  def update
    source = splitter_class.new(input_file)
    FileUtils.mkdir_p(output_dir)
    puts "Processing #{lexicon_lang} lexicon..."

    source.each_slice(BLOCK_SIZE).each_with_index do |block, block_number|
      target = File.join(output_dir, "#{block_number}.json")
      File.open(target, "w+") { |file| file.write(block_to_hash(block).to_json) }
    end

    puts "Done."
  end

  private

  # Builds a hash keyed by each entry's first element, valued by its last.
  # When two entries share a key, the later one wins.
  def block_to_hash(block)
    block.each_with_object({}) do |entry, table|
      table[entry.first] = entry.last
    end
  end

  # Path of the lexicon XML file inside the vendor directory (memoized).
  def input_file
    @input_file ||= File.join(CSKitStrongs.vendor_dir, "#{lexicon_lang}_lexicon.xml")
  end

  # Directory that receives this language's JSON files (memoized).
  def output_dir
    @output_dir ||= File.join(CSKitStrongs.resource_dir, "lexicon/#{lexicon_lang}")
  end
end
# Splits the vendored concordance text file into one JSON file per chapter.
#
# Every entry produced by the concordance splitter is filed under
# "<resource_dir>/concordance/<book>/<chapter>.json". Each file holds a JSON
# object keyed by verse number whose values are arrays of
# {"text": ..., "number": ...} records.
class ConcordanceUpdater
  # A progress line is printed once per this many entries.
  PROGRESS_INTERVAL = 1000

  # Reads the whole concordance into memory, then writes all chapter files,
  # reporting progress on standard output.
  def update
    puts "Processing concordance..."
    file_contents = split
    puts "\nWriting to disk..."
    write_to_disk(file_contents)
    puts "Done."
  end

  private

  # Writes each path => contents pair as JSON, creating parent directories
  # as needed.
  def write_to_disk(file_contents)
    file_contents.each do |path, contents|
      FileUtils.mkdir_p(File.dirname(path))
      File.open(path, "w+") { |f| f.write(contents.to_json) }
    end
  end

  # Groups the splitter's entries by output path and verse.
  #
  # Returns a Hash of { path => { verse => [{ :text, :number }, ...] } }.
  def split
    total_lines = count_lines(input_file)
    splitter = CSKitStrongs::Splitters::ConcordanceSplitter.new(input_file)
    file_contents = {}

    splitter.each_with_index do |(_language, number, text, citation), index|
      # Only the first chapter and the first verse of the citation are used
      # to decide where the entry is filed.
      first_chapter = citation.chapter_list.first
      book = citation.book.strip.downcase.gsub(" ", "_")
      chapter = first_chapter.chapter_number
      verse = first_chapter.verse_list.first.start.to_i
      path = File.join(output_dir, book, "#{chapter}.json")

      ((file_contents[path] ||= {})[verse] ||= []) << {
        :text => text,
        :number => number
      }

      report_progress(index, total_lines) if (index % PROGRESS_INTERVAL).zero?
    end

    file_contents
  end

  # Counts the lines of the file in Ruby instead of shelling out to `wc -l`,
  # so paths containing spaces or shell metacharacters are handled and no
  # external tool is required.
  def count_lines(path)
    File.foreach(path).count
  end

  # Rewrites the current terminal line with the entry index, the line total
  # and the percentage done. The line total is only used as an estimate of
  # the number of entries (presumably one entry per line; verify against the
  # splitter).
  def report_progress(index, total_lines)
    # Guard the division so an empty total prints 0.0 rather than NaN.
    percent = total_lines.zero? ? 0.0 : ((index.to_f / total_lines.to_f) * 100).round(2)
    $stdout.write("\rProcessing #{index} of #{total_lines} (#{percent})")
  end

  # Path of the concordance text file inside the vendor directory (memoized).
  def input_file
    @input_file ||= File.join(CSKitStrongs.vendor_dir, "concordance.txt")
  end

  # Root directory that receives the per-book chapter files (memoized).
  def output_dir
    @output_dir ||= File.join(CSKitStrongs.resource_dir, "concordance")
  end
end
# Tasks that regenerate the JSON resources from the vendored source files.
namespace :update do
  # Regenerates everything: the concordance and both lexicons.
  task :default => [:concordance, :lexicon]

  # Rebuilds the per-chapter concordance files.
  task :concordance do
    ConcordanceUpdater.new.update
  end

  # Aggregate task for both lexicons. :default lists :lexicon as a
  # prerequisite, so it has to be defined or `rake update:default` aborts
  # with "Don't know how to build task".
  task :lexicon => [:greek_lexicon, :hebrew_lexicon]

  # Rebuilds the Greek lexicon files.
  task :greek_lexicon do
    splitter_class = CSKitStrongs::Splitters::GreekLexiconSplitter
    LexiconUpdater.new("greek", splitter_class).update
  end

  # Rebuilds the Hebrew lexicon files.
  task :hebrew_lexicon do
    splitter_class = CSKitStrongs::Splitters::HebrewLexiconSplitter
    LexiconUpdater.new("hebrew", splitter_class).update
  end
end