-
Notifications
You must be signed in to change notification settings - Fork 0
/
ngram_calcs.PL
56 lines (52 loc) · 2.23 KB
/
ngram_calcs.PL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/Users/cat/perl
use strict;
use warnings;
use IPC::System::Simple qw(system capture);
use File::Basename;
use List::MoreUtils qw(natatime);
# Hard-coded project layout; every other path is derived from the root.
my $proj_dir = '/Volumes/2TB/Final_project';

# Directory passed to count_bigrams_from_idlist as the story-file location.
my $fanfic_dir = join '/', $proj_dir, 'Fanfic_all';

# Directory scanned for ID-list files (note the space in the name).
my $fanfic_list_dir = join '/', $proj_dir, 'Fanfic lists';

# Directory passed to count_bigrams_from_idlist as the destination.
my $bigram_dir = join '/', $proj_dir, 'Bigrams';
# Count bigrams for every story named in an ID-list file.
#
# Arguments:
#   $list_filename - path to a text file holding one story ID per line
#   $fanfic_dir    - directory containing the "<id>.txt" story files
#   $dest_dir      - directory handed to the huge-*.pl scripts; the final
#                    count file is moved to
#                    "$dest_dir/results/<list name>_count.txt"
#
# Prints the list's base name (without ".txt") to STDOUT as progress output.
# Dies if the list cannot be opened or if a rename fails. External commands
# are run through whichever `system` is in scope; presumably the
# IPC::System::Simple version imported by this file, which dies on failure.
sub count_bigrams_from_idlist {
    my ($list_filename, $fanfic_dir, $dest_dir) = @_;

    # Three-argument open: the path is never parsed for a mode prefix.
    open(my $in, '<', $list_filename)
        or die "Could not open $list_filename, $!\n";
    my $short_filename = basename($list_filename, ".txt");
    print("$short_filename\n");

    my @filenames = ();
    while (my $line = <$in>) {
        chomp($line);
        # A blank line would otherwise turn into the bogus path
        # "$fanfic_dir/.txt".
        next if $line eq '';
        push(@filenames, "$fanfic_dir/$line.txt");
    }
    # The list is fully read; release the handle before the long-running
    # external commands start.
    close $in;

    my $chunk_size = 4000;    # guessing at what the argument size limit is
    if (@filenames > $chunk_size) {
        my $counter = 0;
        my $iter = natatime($chunk_size, @filenames);
        while (my @tmp = $iter->()) {
            # Count one chunk, sort its output, then move the sorted file
            # aside under a numbered name so the next chunk does not
            # overwrite it.
            system($^X, "/Users/cat/perl5/bin/huge-count.pl", "--token=valid_tokens.txt",
                "--tokenlist", $dest_dir, @tmp);
            system($^X, "/Users/cat/perl5/bin/huge-sort.pl", "$dest_dir/complete-huge-count.output");
            rename("$dest_dir/complete-huge-count.output-sorted",
                "$dest_dir/${short_filename}_count$counter.sorted") or die "Rename failed, $!\n";
            $counter += 1;
        }
        system($^X, "/Users/cat/perl5/bin/huge-merge.pl", $dest_dir);
    }

    # NOTE(review): this full-list run also executes after the chunked branch
    # above, passing every file name in one command line again. That looks
    # unintended (it defeats the chunking), but the expected final output for
    # the merged case is not visible here, so the behavior is left unchanged.
    # TODO confirm and turn into an else-branch if appropriate.
    system($^X, "/Users/cat/perl5/bin/huge-count.pl", "--token=valid_tokens.txt",
        "--tokenlist", $dest_dir, @filenames);
    rename("$dest_dir/complete-huge-count.output", "$dest_dir/results/${short_filename}_count.txt")
        or die "Final rename failed, $!\n";
}
# Driver: run the bigram count once for every ID-list file in
# $fanfic_list_dir.
opendir(my $list_dh, $fanfic_list_dir)
    or die "opendir($fanfic_list_dir) failed: $!";
my @idfiles = readdir($list_dh);
closedir($list_dh);

foreach my $idfilename (@idfiles) {
    # Escaped dot plus end anchor: only names that really end in ".txt"
    # qualify. The previous pattern /.txt/ also matched names such as
    # "mytxtnotes" or "ids.txt.bak".
    if ($idfilename =~ /\.txt\z/) {
        count_bigrams_from_idlist("$fanfic_list_dir/$idfilename", $fanfic_dir,
            $bigram_dir);
    }
}
# # statistic.pl -score 6.00 -frequency 5 ll.pm holmes1.ll h.cnt
# system($^X, "/Users/cat/perl5/bin/statistic.pl", "--score=6.63", "ll.pm", "lab01_java_bigrams.ll", "/Users/cat/Perl_scripts/214_Final_project/lab01_java_bigrams.cnt");