-
Notifications
You must be signed in to change notification settings - Fork 11
/
wc_stop
executable file
·94 lines (84 loc) · 2.2 KB
/
wc_stop
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env perl
#===============================================================================
#
# FILE: wc_stop
#
# USAGE: wc_stop -s stoplist_file file1 file2 file3 file4...
#
# DESCRIPTION: Count words in file1 file2, ..., not counting words in
# stoplist_file. Outputs results as comma-separated values to stdout:
# filename,wordcount
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Andrew Goldstone (agoldst), [email protected]
# ORGANIZATION: Rutgers University, New Brunswick
# VERSION: 1.0
# CREATED: 12/04/2012 07:59:30
# REVISION: ---
#===============================================================================
use v5.14; # entails strict, unicode_strings
use autodie;
use utf8; # source code itself is in utf-8
use warnings;
use warnings FATAL => "utf8"; # Unicode encode errors are fatal
use open qw( :std :utf8 ); # default utf8 layer
# Usage text shown when the command line is malformed.
my $USAGE = <<"EOM";
Usage:
wc_stop -s stoplist_file file1 file2 file3 file4...
EOM

# The first two arguments must be "-s" and the stoplist path; every remaining
# argument is a file to count.
my $first    = shift;
my $stoplist = shift;
unless (defined $first && $first eq "-s" && defined $stoplist) {
    # Keep the usage message off stdout, which carries the CSV results, and
    # signal the bad invocation through the exit status.
    print STDERR $USAGE;
    exit 1;
}

# Load the stoplist, one word per line, into the lookup hash read by count()
# and count_csv().  Three-argument open keeps characters such as "<", ">" or
# "|" in a path from being interpreted as an open mode; autodie reports any
# failure with the file name and system error.
my %STOPLIST = ();
open my $stop_fh, '<', $stoplist;
while (my $word = <$stop_fh>) {
    $word =~ s/\R\z//;    # strip LF or CRLF so DOS-format stoplists still match
    $STOPLIST{$word} = 1;
}
close $stop_fh;

# Emit one "filename,wordcount" line per input file.  Names ending in .csv
# are handed to count_csv(); everything else goes to count().
foreach my $filename (@ARGV) {
    open my $fh, '<', $filename;
    my $count = $filename =~ /\.csv$/i ? count_csv($fh) : count($fh);
    close $fh;
    print "$filename,$count\n";
}
# experimental, untested
# Tally up a wordcountsXXX.CSV file from JSTOR DfR, skipping stoplisted words.
#
# Argument: a filehandle opened for reading, positioned at the header line.
# Returns:  the sum of the second column over every row whose first column is
#           not a key of the file-level %STOPLIST.
# Dies:     when the first line is not the "WORDCOUNTS,WEIGHT" header.
sub count_csv {
    my ($fh) = @_;

    my $header = <$fh>;
    # Compare without the line ending so that CRLF files, and a header-only
    # file with no trailing newline, are accepted as well as plain LF files.
    $header =~ s/\R\z// if defined $header;
    unless (defined $header && $header eq "WORDCOUNTS,WEIGHT") {
        die "unexpected header found in csv file";
    }

    my $result = 0;
    # A named lexical is used for the line so the caller's $_ is left alone.
    while (my $line = <$fh>) {
        $line =~ s/\R\z//;    # drop LF or CRLF so the weight stays numeric
        my ($word, $weight) = split /,/, $line;

        # Blank or truncated rows carry no weight; skip them rather than
        # adding undef and triggering uninitialized-value warnings.
        next unless defined $weight && length $weight;

        $result += $weight unless $STOPLIST{$word};
    }
    return $result;
}
# Count the whitespace-separated words readable from a filehandle, leaving out
# every word that is a true-valued key of the file-level %STOPLIST.
#
# Argument: a filehandle opened for reading.
# Returns:  the number of non-stoplisted words (0 for empty input).
sub count {
    my ($handle) = @_;
    my $kept = 0;

    while (<$handle>) {
        # split ' ' ignores leading and trailing whitespace, newline included;
        # grep in scalar context yields how many words survive the filter.
        $kept += grep { !$STOPLIST{$_} } split ' ', $_;
    }

    return $kept;
}