Skip to content

Commit

Permalink
Cleanup after import:
Browse files Browse the repository at this point in the history
- Switch encoding from Latin-1 to UTF-8.
- Replace <TAB> characters with spaces and remove extra whitespace.
- Update Copyright notices.
- Include `FindBin` to simplify local run.
- Add README.md, from https://www.cs.upc.edu/~egonzalez/autopan.html with minimal updates.
  • Loading branch information
edgar-gip committed May 30, 2023
1 parent ec13f70 commit 2554a2a
Show file tree
Hide file tree
Showing 12 changed files with 821 additions and 669 deletions.
14 changes: 7 additions & 7 deletions Alignment.pm
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
# Copyright (C) Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
# Copyright (C) 2005 Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
#
# This file is part of AutoPan
#
#
# AutoPan is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

use strict;
Expand Down Expand Up @@ -43,7 +43,7 @@ sub new {
# Set my text
#
# Stores in slot 9 of the object the space-joined run of tokens selected by
# the index range kept in slots 1 and 2 (both ends inclusive).
#
#   $this   - array-ref based object; slots 1 and 2 are read, slot 9 is written
#   $tokens - reference to the array of tokens to take the slice from
#
# NOTE(review): slots 1/2 presumably hold the first and last token index of
# the aligned span -- confirm against the constructor.
sub setMyText {
    my ($this, $tokens) = @_;

    my ($first, $last) = ($this->[1], $this->[2]);
    my @span = @{$tokens}[$first .. $last];

    $this->[9] = join(' ', @span);
}

Expand Down
87 changes: 42 additions & 45 deletions EngTok.pm
Original file line number Diff line number Diff line change
@@ -1,86 +1,86 @@
# Copyright (C) Erik Tjong Kim Sang
# Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
# Copyright (C) 2005 Erik Tjong Kim Sang
# Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
#
# This file is part of AutoPan
#
#
# AutoPan is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# English Tokenizer
# Class EngTok

package EngTok;
package EngTok;

use strict;
use strict;


# Constructor
#
# Builds a tokenizer object: an array reference blessed into $pkg whose
# slot 0 is a hash used as a set of known abbreviations (each key maps to 1).
#
#   $pkg - class name to bless the new object into
#
# Returns the blessed array reference.
sub new {
    my ($pkg) = @_;

    ## abbreviations
    my @abbrev = qw( apr aug av bldg co dec dr calif corp feb fla inc jan jr jul jun lt ltd mar mr mrs ms mt no nov oct rev sep sept st vol vols vs );

    # Build the lookup set directly; this avoids declaring a lexical $a,
    # which would shadow the special variable used by sort blocks.
    my %is_abbrev = map { $_ => 1 } @abbrev;

    return bless [ \%is_abbrev ], $pkg;
}


# Tokenize a string
sub tokenizeString {
my ($this, $line, $lf) = @_;
my ($this, $line, $lf) = @_;

# Whether to split sentences or not (defaults to no)
$lf = 0 if !defined($lf);

$line =~ s/^\s*//;
$line =~ s/\s*$//;
my @T = split(/\s+/,$line);

my $i = 0;
while ($i <= $#T) {
# Remove double quotes '' or `` and substitute by "
# at the beginning of word

# remove sentence breaking punctuation with quote from end of word
if ($T[$i] =~ /^([`][`]|[']['])(.+)$/) {
splice(@T,$i,1,'`',$2);
$i++;
# remove punctuation from start of word
} elsif ($T[$i] =~ /^(["])(.+)$/) {
splice(@T,$i,1,'`',$2);
$i++;
# change sentence breaking punctuation with double quote '' from end of word
} elsif ($T[$i] =~ /^([`'\(\)\[\]\$:;,\/\%])(.+)$/ and
# Remove double quotes '' or `` and substitute by "
# at the beginning of word

# remove sentence breaking punctuation with quote from end of word
if ($T[$i] =~ /^([`][`]|[']['])(.+)$/) {
splice(@T,$i,1,'`',$2);
$i++;

# remove punctuation from start of word
} elsif ($T[$i] =~ /^(["])(.+)$/) {
splice(@T,$i,1,'`',$2);
$i++;

# change sentence breaking punctuation with double quote '' from end of word
} elsif ($T[$i] =~ /^([`'\(\)\[\]\$:;,\/\%])(.+)$/ and
$T[$i] !~ /^'[dsm]$/i and $T[$i] !~ /^'re$/i and
$T[$i] !~ /^'ve$/i and $T[$i] !~ /^'ll$/i) {
splice(@T,$i,1,$1,$2);
$i++;

# change sentence breaking punctuation with double quote '' from end of word
} elsif ($T[$i] =~ /^(.+)([?!\.])(['][']|[`][`])$/) {
if ($lf) { splice(@T,$i,1,"\n",$2); }
if ($lf) { splice(@T,$i,1,"\n",$2); }
else { splice(@T,$i,1,'`',$2); }
splice(@T,$i,1,$1,$2,'`');
splice(@T,$i,1,$1,$2,'`');

# remove sentence breaking punctuation with quote from end of word
# remove sentence breaking punctuation with quote from end of word
} elsif ($T[$i] =~ /^(.+)([?!\.])(['])$/) {
if ($lf) { splice(@T,$i,1,$1,"$2$3","\n"); }
else { splice(@T,$i,1,$1,$2,$3); }
Expand All @@ -105,8 +105,8 @@ sub tokenizeString {
# remove sentence-breaking punctuation (not period) from end of word
} elsif ($T[$i] =~ /^(.+)([?!])$/ or
$T[$i] =~ /^(.+[^\.])(\.\.+)$/) {
if ($lf) { splice(@T,$i,1,$1,$2,"\n"); }
else { splice(@T,$i,1,$1,$2); }
if ($lf) { splice(@T,$i,1,$1,$2,"\n"); }
else { splice(@T,$i,1,$1,$2); }

# separate currency symbol from value
} elsif ($T[$i] =~ /^([A-Za-z]+\$)(.+)$/i) {
Expand Down Expand Up @@ -155,7 +155,7 @@ sub tokenizeString {
splice(@T,$i,1,$1,$2);

# split words containing a slash if they are not a URI
} elsif ($T[$i] !~ /^(ht|f)tps*/i and
} elsif ($T[$i] !~ /^(ht|f)tps*/i and
$T[$i] =~ /[^0-9\/\-]/ and
$T[$i] =~ /^(.+)\/(.+)$/) {
splice(@T,$i,1,$1,"/",$2);
Expand All @@ -165,21 +165,21 @@ sub tokenizeString {
$T[$i] !~ /^[0-9]+\./) {
my $word = $1;
if ($i != $#T and $this->abbrev($word)) { $i++; }
else {
else {
if ($lf) { splice(@T,$i,1,$1,$2,"\n"); }
else { splice(@T,$i,1,$1,$2); }
}
} else { $i++; }
}
return @T;

return @T;
}


# Is it an abbreviation
sub abbrev {
my ($this, $word) = @_;

$word =~ tr/[A-Z]/[a-z]/;
if ($word =~ /\./ and $word !~ /[0-9]/) { return(1); };
if ($word =~ /^[a-z]$/) { return(1); };
Expand All @@ -189,6 +189,3 @@ sub abbrev {

# Return true
1;



68 changes: 29 additions & 39 deletions Porter.pm
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# Copyright (C) Martin Porter
# Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
# Copyright (C) 2005 Martin Porter
# Edgar Gonzàlez i Pellicer
# Maria Fuentes Fort
#
# This file is part of AutoPan
#
#
# AutoPan is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# Porter Stemmer
Expand All @@ -27,11 +27,10 @@ use strict;

use IO::File;


package Porter;

use vars qw( %step2list %step3list
$c $v $C $V $mgr0 $meq1 $mgr1 $_v );
$c $v $C $V $mgr0 $meq1 $mgr1 $_v );

# Static vars
%step2list =
Expand All @@ -44,10 +43,9 @@ use vars qw( %step2list %step3list
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');

%step3list =
('icate'=>'ic', 'ative'=>'', 'alize'=>'al',
('icate'=>'ic', 'ative'=>'', 'alize'=>'al',
'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>'');


$c = qr/[^aeiou]/; # consonant
$v = qr/[aeiouy]/; # vowel
$C = qr/${c}[^aeiouy]*/; # consonant sequence
Expand All @@ -58,7 +56,6 @@ $meq1 = qr/^(${C})?${V}${C}(${V})?$/; # [C]VC[V] is m=1
$mgr1 = qr/^(${C})?${V}${C}${V}${C}/; # [C]VCVC... is m>1
$_v = qr/^(${C})?${v}/; # vowel in stem


# Constructor
sub new {
my ($class, $stopWordFile) = @_;
Expand All @@ -68,22 +65,21 @@ sub new {

# Load it
if ($stopWordFile) {
my $fin = new IO::File("< $stopWordFile")
or die "Can't Open Stop Word File $stopWordFile\n";
my $line;
while ($line = $fin->getline()) {
chomp($line);
$stopList->{lc($line)} = 1 if $line;
}
$fin->close();
my $fin = new IO::File("< $stopWordFile")
or die "Can't Open Stop Word File $stopWordFile\n";

my $line;
while ($line = $fin->getline()) {
chomp($line);
$stopList->{lc($line)} = 1 if $line;
}
$fin->close();
}

# Add the cache part and bless
return bless([ $stopList, {}], $class);
}


# Stem
sub stem {
my ($this, $w) = @_;
Expand All @@ -103,7 +99,7 @@ sub stem {

# Look in the cache
return $this->[1]{$w} if exists($this->[1]{$w});

# Save starting $w
my $initw = $w;

Expand All @@ -117,16 +113,16 @@ sub stem {
# Step 1a
if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
# Step 1b

# Step 1b
if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
elsif ($w =~ /(ed|ing)$/) {
$stem = $`;
if ($stem =~ /$_v/o) {
$w = $stem;
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
elsif ($w =~ /(ed|ing)$/) {
$stem = $`;
if ($stem =~ /$_v/o) {
$w = $stem;
if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
}
}

Expand Down Expand Up @@ -155,7 +151,6 @@ sub stem {
if ($stem =~ /$mgr1/o) { $w = $stem; }
}


# Step 5
if ($w =~ /e$/) {
$stem = $`;
Expand All @@ -174,24 +169,19 @@ sub stem {
return $w;
}


# Is a stop word
#
# A word counts as a stop word when it consists only of non-word characters,
# or when it is a key of the stop list kept in slot 0 of the object.
#
#   $this - array-ref based object; slot 0 is the stop-list hash
#   $w    - the word to check (looked up exactly as given)
#
# Returns 1 for a pure non-word string, otherwise the stop-list entry for
# $w (undefined when the word is not in the list).
sub isStopWord {
    my ($this, $w) = @_;

    # Punctuation-only tokens are always treated as stop words
    return 1 if $w =~ /^\W+$/;

    return $this->[0]{$w};
}



# Is a non word
#
# Tells whether $w is made up exclusively of non-word characters (\W), i.e.
# it contains no letter, digit or underscore. A single trailing newline is
# tolerated because the pattern is anchored with '$'.
#
#   $w - the string to check (second argument; the invocant is not used)
#
# Returns the result of the pattern match itself, so it is true on a match
# and false (an empty list in list context) otherwise.
sub isNonWord {
    my $w = $_[1];

    return $w =~ /^\W+$/;
}


# Return true
1;

Loading

0 comments on commit 2554a2a

Please sign in to comment.