forked from echen/sarah-palin-lda
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path4-get-email-dates.rb
43 lines (37 loc) · 942 Bytes
/
4-get-email-dates.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
begin
  require 'iconv' # left the standard library in Ruby 2.0; loaded only when installed
rescue LoadError
  # Iconv is unavailable on this Ruby; continue without it.
end
require 'time'
EMAILS_DIRECTORY = "emails/"
# Core extensions used to sanitise raw email text before pattern matching.
class String
  # Legacy converter that drops bytes which are not valid UTF-8. Iconv was
  # removed from the standard library in Ruby 2.0, so the constant is only
  # defined when the library is actually loaded.
  IC = Iconv.new('UTF-8//IGNORE', 'UTF-8') if defined?(Iconv)

  # Returns a copy of the string, interpreted as UTF-8, with every invalid
  # byte sequence removed. The receiver is never modified.
  #
  # @return [String] valid UTF-8 text
  def fix_encoding
    if respond_to?(:scrub)
      # Reinterpret the raw bytes as UTF-8 (as Iconv did) and drop whatever
      # does not decode.
      dup.force_encoding('UTF-8').scrub('')
    else
      # Rubies without String#scrub: the padding space works around Iconv
      # failing to ignore an invalid sequence at the very end of the input.
      IC.iconv(self + ' ')[0..-2]
    end
  end

  # Normalises the string for further processing; currently this only
  # repairs the encoding.
  #
  # @return [String] cleaned copy of the receiver
  def clean
    fix_encoding
  end
end
# Load the per-document topic weights. Every non-blank line has the form
# "<doc id>,<weight>,<weight>,..." and is stored as
# weights[doc id as Integer] = [weights as Floats].
weights = {}
File.foreach("document-topic-distributions.txt") do |raw_line|
  record = raw_line.strip
  next if record.empty?

  fields = record.split(",")
  weights[fields.first.to_i] = fields[1..-1].map { |value| value.to_f }
end
# For every email file, print one tab-separated row:
#   doc id, year, month, day, hour, weekday, then the document's topic weights.
# An email is skipped (and tallied in `count`) when it has no "Sent:" line,
# when that line cannot be parsed as a time, or when no topic weights were
# loaded for its id. The number of skipped emails is printed last.
count = 0
Dir.glob(EMAILS_DIRECTORY + "*.txt") do |filename|
  # Strip only the trailing extension; to_i yields the integer key used in `weights`.
  base = File.basename(filename, ".txt").to_i

  # Text after the first "sent:" label, up to the end of that line (nil if absent).
  sent = File.read(filename).clean[/sent\s*:(.+?)$/i, 1]

  # Best-effort parsing: only a failure to parse the date is swallowed here,
  # so errors while looking up weights or writing output are not miscounted
  # as bad dates.
  date =
    begin
      Time.parse(sent) if sent
    rescue StandardError
      nil
    end

  topic_weights = weights[base]

  if date && topic_weights
    row = [base, date.year, date.month, date.day, date.hour, date.wday] + topic_weights
    puts row.join("\t")
  else
    count += 1
  end
end
puts count