-
Notifications
You must be signed in to change notification settings - Fork 11
/
subset_txt
executable file
·49 lines (40 loc) · 1.39 KB
/
subset_txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env perl
#===============================================================================
#
# FILE: subset_txt
#
# USAGE: ./subset_txt subset_file < input.txt > output.txt
#
# DESCRIPTION: filters lines from stdin to stdout, keeping only those
# whose first word is listed in subset_file. Use in conjunction with the
# function write.filenames in metadata.R to extract a subset of a bag of
# words file.
#
# AUTHOR: Andrew Goldstone (agoldst), [email protected]
# ORGANIZATION: Rutgers University, New Brunswick
# VERSION: 1.0
# CREATED: 11/15/2012 16:09:43
# REVISION: ---
#===============================================================================
use v5.14; # entails strict, unicode_strings
use autodie;
use utf8; # source code itself is in utf-8
use warnings;
use warnings FATAL => "utf8"; # Unicode encode errors are fatal
use open qw( :std :utf8 ); # default utf8 layer
# Load the subset list: one name per line in the file given as the first
# command-line argument. Names are stored as hash keys for fast lookup.
my $subset_file = shift;
die "Usage: $0 subset_file < input > output\n"
    unless defined $subset_file;

# autodie is in effect for this file, so a failed open/close throws an
# exception that already names the file and the system error.
open my $subset_fh, '<:encoding(UTF-8)', $subset_file;
my %in_subset;
while (my $name = <$subset_fh>) {
    chomp $name;
    $in_subset{$name} = 1;
}
close $subset_fh;

# Filter stdin to stdout: a line is kept only when its first
# whitespace-delimited word (starting at column 0) is in the subset list.
while (my $line = <STDIN>) {
    my ($countfile) = $line =~ /^(\S+)/;

    # Test definedness, not truth, so that a first word of "0" is accepted.
    # A line that is empty or begins with whitespace has no first word and
    # is treated as a fatal input error; $. is the current stdin line number.
    die "ill-formed line $. of input: $line"
        unless defined $countfile;

    print $line if $in_subset{$countfile};
}