subset_txt

#!/usr/bin/env perl 
#===============================================================================
#
#         FILE: subset_txt
#
#        USAGE: ./subset_txt <subset_file> < input_file > output_file
#
#  DESCRIPTION: filters lines from stdin to stdout, keeping only those
# whose first word is listed in subset_file. Use in conjunction with the
# function write.filenames in metadata.R to extract a subset of a bag of
# words file.  
#
#       AUTHOR: Andrew Goldstone (agoldst), andrew.goldstone@gmail.com
# ORGANIZATION: Rutgers University, New Brunswick
#      VERSION: 1.0
#      CREATED: 11/15/2012 16:09:43
#     REVISION: ---
#===============================================================================

use v5.14;                                  # entails strict, unicode_strings 
use autodie;
use utf8;                                   # source code itself is in utf-8
use warnings;
use warnings FATAL => "utf8";               # Unicode encode errors are fatal
use open qw( :std :utf8 );                  # default utf8 layer

# Load the subset list into a hash for fast lookup: one entry per line of
# the subset file, keyed by the whole line with its trailing newline removed.

my $subset_file = shift @ARGV;
die "Usage: $0 <subset_file>\n" unless defined $subset_file;

my %subset;

{
    # A lexical handle avoids a package-global bareword filehandle.
    # No "or die" is needed here: autodie is in effect for this file, so a
    # failing open or close throws an exception that includes the OS error.
    open my $subset_fh, "<:encoding(UTF-8)", $subset_file;

    while (my $name = <$subset_fh>) {
        chomp $name;
        $subset{$name} = 1;
    }

    close $subset_fh;
}


# Filter: copy to stdout every stdin line whose first whitespace-delimited
# word has a true entry in %subset; all other lines are dropped.

while (my $line = <STDIN>) {
    my ($countfile) = $line =~ /^(\S+)/;

    # A blank line, or one that begins with whitespace, has no leading word.
    # Test definedness rather than truth so that a first word of "0" is
    # accepted, and report the offending input line itself: $! (errno) has
    # nothing to do with a failed pattern match.
    die "ill-formed line $.: $line" unless defined $countfile;

    print $line if $subset{$countfile};
}