Skip to content

Commit

Permalink
feature/RowProcessor-replaceNewlinesWithSpaces (#137)
Browse files Browse the repository at this point in the history
* new optional feature RowProcessor.replaceNewlinesWithSpaces and a basic unit test

* code review cleanup per @Craigacp

* replaceNewlinesWithSpaces=true unit test for @Craigacp

* weird that passed for me locally even after rebasing onto fresh `main`!?
  • Loading branch information
nezda authored Apr 29, 2021
1 parent 42ff2a0 commit fdf0ef4
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 4 deletions.
46 changes: 42 additions & 4 deletions Data/src/main/java/org/tribuo/data/columnar/RowProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ public class RowProcessor<T extends Output<T>> implements Configurable, Provenan
@Config(description="A map from a regex to field processors to apply to fields matching the regex.")
protected Map<String,FieldProcessor> regexMappingProcessors = new HashMap<>();

@Config(description="Replace newlines with spaces in values before passing them to field processors.")
protected boolean replaceNewlinesWithSpaces = true;

protected boolean configured;

/**
Expand Down Expand Up @@ -154,7 +157,36 @@ public RowProcessor(List<FieldExtractor<?>> metadataExtractors, ResponseProcesso
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
                    ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
                    Set<FeatureProcessor> featureProcessors) {
    // Delegate to the canonical constructor with no regex mappings and newline
    // replacement enabled (replaceNewlinesWithSpaces defaults to true).
    this(metadataExtractors,weightExtractor,responseProcessor,fieldProcessorMap,Collections.emptyMap(),featureProcessors, true);
}

/**
* Constructs a RowProcessor using the supplied responseProcessor to extract the response variable,
* and the supplied fieldProcessorMap to control which fields are parsed and how they are parsed.
* <p>
* In addition this processor can instantiate field processors which match the regexes supplied in
* the regexMappingProcessors. If a regex matches a field which already has a fieldProcessor assigned to
* it, it throws an IllegalArgumentException.
* <p>
* After extraction the features are then processed using the supplied set of feature processors.
* These processors can be used to insert conjunction features which are triggered when
* multiple features appear, or to filter out unnecessary features.
* <p>
* Additionally this processor can extract a weight from each row and insert it into the example, along
* with more general metadata fields (e.g., the row number, date stamps). The weightExtractor can be null,
* and if so the weights are left unset.
* <p>
* This overload defaults {@code replaceNewlinesWithSpaces} to {@code true}, so newlines in
* field values are replaced with spaces before being passed to the field processors.
* @param metadataExtractors The metadata extractors to run per example. If two metadata extractors emit
* the same metadata name then the constructor throws a PropertyException.
* @param weightExtractor The weight extractor, if null the weights are left unset at their default.
* @param responseProcessor The response processor to use.
* @param fieldProcessorMap The keys are the field names and the values are the field processors to apply to those fields.
* @param regexMappingProcessors A set of field processors which can be instantiated if the regexes match the field names.
* @param featureProcessors The feature processors to run on each extracted feature list.
*/
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
Map<String,FieldProcessor> regexMappingProcessors, Set<FeatureProcessor> featureProcessors) {
// replaceNewlinesWithSpaces defaults to true for backwards compatibility.
this(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, true);
}

/**
Expand All @@ -179,16 +211,19 @@ public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<F
* @param fieldProcessorMap The keys are the field names and the values are the field processors to apply to those fields.
* @param regexMappingProcessors A set of field processors which can be instantiated if the regexes match the field names.
* @param featureProcessors The feature processors to run on each extracted feature list.
* @param replaceNewlinesWithSpaces Replace newlines with spaces in values before passing them to field processors.
*/
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
                    ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
                    Map<String,FieldProcessor> regexMappingProcessors, Set<FeatureProcessor> featureProcessors,
                    boolean replaceNewlinesWithSpaces) {
    // Defensive copies of the supplied collections; immutable empties where the input is empty.
    this.metadataExtractors = metadataExtractors.isEmpty() ? Collections.emptyList() : new ArrayList<>(metadataExtractors);
    this.weightExtractor = weightExtractor;
    this.responseProcessor = responseProcessor;
    // NOTE(review): kept as a mutable HashMap even when empty — presumably regex matches
    // are expanded into this map later (see partialExpandRegexMapping); confirm before changing.
    this.fieldProcessorMap = new HashMap<>(fieldProcessorMap);
    this.regexMappingProcessors = regexMappingProcessors.isEmpty() ? Collections.emptyMap() : new HashMap<>(regexMappingProcessors);
    this.featureProcessors.addAll(featureProcessors);
    this.replaceNewlinesWithSpaces = replaceNewlinesWithSpaces;
    // Validate the configuration (runs the same checks as OLCUT config instantiation).
    postConfig();
}

Expand Down Expand Up @@ -349,7 +384,10 @@ public List<ColumnarFeature> generateFeatures(Map<String,String> row) {
for (Map.Entry<String,FieldProcessor> e : fieldProcessorMap.entrySet()) {
String value = row.get(e.getKey());
if (value != null) {
value = value.replace('\n', ' ').trim();
if (replaceNewlinesWithSpaces) {
value = value.replace('\n', ' ');
}
value = value.trim();
features.addAll(e.getValue().process(value));
}
}
Expand Down Expand Up @@ -524,7 +562,7 @@ protected Set<String> partialExpandRegexMapping(Collection<String> fieldNames) {
*/
@Deprecated
public RowProcessor<T> copy() {
    // Copies the full configuration, including the replaceNewlinesWithSpaces flag,
    // so the copy processes values identically to this instance.
    return new RowProcessor<>(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, replaceNewlinesWithSpaces);
}

@Override
Expand Down
141 changes: 141 additions & 0 deletions Data/src/test/java/org/tribuo/data/columnar/RowProcessorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.tribuo.data.columnar;

import com.oracle.labs.mlrg.olcut.config.PropertyException;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import org.tribuo.Example;
import org.tribuo.Feature;
import org.tribuo.data.columnar.extractors.DateExtractor;
Expand All @@ -26,8 +27,14 @@
import org.tribuo.data.columnar.extractors.IntExtractor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.field.IdentityProcessor;
import org.tribuo.data.columnar.processors.field.TextFieldProcessor;
import org.tribuo.data.text.impl.TokenPipeline;
import org.tribuo.test.MockOutput;
import org.junit.jupiter.api.Test;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Token.TokenType;
import org.tribuo.util.tokens.Tokenizer;
import org.tribuo.util.tokens.impl.BreakIteratorTokenizer;

import java.time.LocalDate;
import java.time.LocalTime;
Expand All @@ -40,9 +47,15 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.fail;

Expand Down Expand Up @@ -170,4 +183,132 @@ public void metadataExtractorTest() {
assertThrows(PropertyException.class, () -> new RowProcessor<>(badExtractors,weightExtractor,response,fixed,Collections.emptySet()));
}

@Test
public void replaceNewlinesWithSpacesTest() {
// Matches runs of blank (or whitespace/dash-only) lines; used below to make
// surviving newlines visible as distinct "*" tokens in the extracted features.
final Pattern BLANK_LINES = Pattern.compile("(\n[\\s-]*\n)+");

final Function<CharSequence, CharSequence> newLiner = (CharSequence charSequence) -> {
if (charSequence == null || charSequence.length() == 0) {
return charSequence;
}
return BLANK_LINES.splitAsStream(charSequence).collect(Collectors.joining(" *\n\n"));
};

// Tokenizer that munges the input text (via newLiner) before tokenizing; bigram pipeline.
Tokenizer tokenizer = new MungingTokenizer(new BreakIteratorTokenizer(Locale.US), newLiner);
TokenPipeline textPipeline = new TokenPipeline(tokenizer, 2, false);

final Map<String, FieldProcessor> fieldProcessors = new HashMap<>();
fieldProcessors.put("order_text", new TextFieldProcessor("order_text", textPipeline));

MockResponseProcessor response = new MockResponseProcessor("Label");

Map<String,String> row = new HashMap<>();
row.put("order_text", "Jimmy\n\n\n\nHoffa");
row.put("Label", "Sheep");

// replaceNewlinesWithSpaces=false: the blank lines reach the munging tokenizer intact.
RowProcessor<MockOutput> processor = new RowProcessor<>(Collections.emptyList(),null,response,fieldProcessors,Collections.emptyMap(),Collections.emptySet(), false);

Example<MockOutput> example = processor.generateExample(row,true).get();

// Check example is extracted correctly
assertEquals(5, example.size());
assertEquals("Sheep", example.getOutput().label);
Iterator<Feature> featureIterator = example.iterator();
Feature a = featureIterator.next();
assertEquals("order_text@1-N=*", a.getName());
assertEquals(1.0, a.getValue());
a = featureIterator.next();
assertEquals("order_text@1-N=Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@1-N=Jimmy", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=*/Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=Jimmy/*", a.getName());
assertFalse(featureIterator.hasNext());

// same input with replaceNewlinesWithSpaces=true (the default) produces different features
processor = new RowProcessor<>(Collections.emptyList(),null,response,fieldProcessors,Collections.emptyMap(),Collections.emptySet(), true);

example = processor.generateExample(row,true).get();

// Check example is extracted correctly
assertEquals(3, example.size());
assertEquals("Sheep", example.getOutput().label);
featureIterator = example.iterator();
a = featureIterator.next();
assertEquals("order_text@1-N=Hoffa", a.getName());
assertEquals(1.0, a.getValue());
a = featureIterator.next();
assertEquals("order_text@1-N=Jimmy", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=Jimmy/Hoffa", a.getName());
assertFalse(featureIterator.hasNext());
}

/**
 * A test-only {@code Tokenizer} decorator which applies a munging {@code Function} to the
 * input text before delegating tokenization to the wrapped tokenizer. All other calls are
 * forwarded unchanged to the delegate.
 * <p>
 * NOTE(review): the streaming API ({@link #reset}) does not apply the munger, so only
 * {@link #tokenize} and {@link #split} see munged text — confirm this is intentional.
 */
static class MungingTokenizer implements Tokenizer {
    private final Tokenizer tokenizer;
    private final Function<CharSequence, CharSequence> munger;

    /**
     * Wraps the supplied tokenizer, munging input text with {@code munger} before tokenization.
     * @param tokenizer The tokenizer to delegate to.
     * @param munger The text transformation applied before {@code tokenize}/{@code split}.
     */
    MungingTokenizer(final Tokenizer tokenizer, final Function<CharSequence, CharSequence> munger) {
        this.tokenizer = tokenizer;
        this.munger = munger;
    }

    /** Returns the wrapped tokenizer; all forwarding methods go through this accessor. */
    protected Tokenizer delegate() {
        return tokenizer;
    }

    @Override
    public List<Token> tokenize(CharSequence cs) {
        // Munge first, then tokenize the transformed text.
        return delegate().tokenize(munger.apply(cs));
    }

    @Override
    public List<String> split(CharSequence cs) {
        // Munge first, then split the transformed text.
        return delegate().split(munger.apply(cs));
    }

    @Override
    public Tokenizer clone() throws CloneNotSupportedException {
        // Clone the delegate; the munger function is shared as it carries no mutable state here.
        return new MungingTokenizer(delegate().clone(), munger);
    }

    @Override
    public void reset(final CharSequence cs) {
        delegate().reset(cs);
    }

    @Override
    public boolean advance() {
        return delegate().advance();
    }

    @Override
    public String getText() {
        return delegate().getText();
    }

    @Override
    public int getStart() {
        return delegate().getStart();
    }

    @Override
    public int getEnd() {
        return delegate().getEnd();
    }

    @Override
    public TokenType getType() {
        return delegate().getType();
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return delegate().getProvenance();
    }
} // end class MungingTokenizer

}

0 comments on commit fdf0ef4

Please sign in to comment.