Skip to content

Commit

Permalink
code review cleanup per @Craigacp
Browse files Browse the repository at this point in the history
  • Loading branch information
nezda committed Apr 27, 2021
1 parent d357c7d commit d8347af
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 50 deletions.
6 changes: 3 additions & 3 deletions Data/src/main/java/org/tribuo/data/columnar/RowProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public class RowProcessor<T extends Output<T>> implements Configurable, Provenan
protected Map<String,FieldProcessor> regexMappingProcessors = new HashMap<>();

@Config(description="Replace newlines with spaces in values before passing them to field processors.")
protected boolean replaceNewlinesWithSpaces;
protected boolean replaceNewlinesWithSpaces = true;

protected boolean configured;

Expand Down Expand Up @@ -385,7 +385,7 @@ public List<ColumnarFeature> generateFeatures(Map<String,String> row) {
String value = row.get(e.getKey());
if (value != null) {
if (replaceNewlinesWithSpaces) {
value = value.replace('\n', ' ');
value = value.replace('\n', ' ');
}
value = value.trim();
features.addAll(e.getValue().process(value));
Expand Down Expand Up @@ -562,7 +562,7 @@ protected Set<String> partialExpandRegexMapping(Collection<String> fieldNames) {
*/
@Deprecated
public RowProcessor<T> copy() {
return new RowProcessor<>(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, replaceNewlinesWithSpaces);
return new RowProcessor<>(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, replaceNewlinesWithSpaces);
}

@Override
Expand Down
81 changes: 34 additions & 47 deletions Data/src/test/java/org/tribuo/data/columnar/RowProcessorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public void replaceNewlinesWithSpacesTest() {
if (charSequence == null || charSequence.length() == 0) {
return charSequence;
}
return BLANK_LINES.splitAsStream(charSequence).collect(Collectors.joining(" \n\n"));
return BLANK_LINES.splitAsStream(charSequence).collect(Collectors.joining(" *\n\n"));
};

Tokenizer tokenizer = new MungingTokenizer(new BreakIteratorTokenizer(Locale.US), newLiner);
Expand All @@ -215,23 +215,47 @@ public void replaceNewlinesWithSpacesTest() {
assertEquals("Sheep",example.getOutput().label);
Iterator<Feature> featureIterator = example.iterator();
Feature a = featureIterator.next();
assertEquals("order_text@1-N=Hoffa", a.getName());
assertEquals("order_text@1-N=*", a.getName());
assertEquals(1.0, a.getValue());
a = featureIterator.next();
assertEquals("order_text@1-N=Jimmy", a.getName());
assertEquals("order_text@1-N=Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@1-N=", a.getName());
assertEquals("order_text@1-N=Jimmy", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=Jimmy/⏎", a.getName());
assertEquals("order_text@2-N=*/Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=⏎/Hoffa", a.getName());
assertEquals("order_text@2-N=Jimmy/*", a.getName());
assertFalse(featureIterator.hasNext());
a = featureIterator.next(); // weird this doesn't throw NoSuchElementException?
assertNull(a.getName());
}

public abstract static class ForwardingTokenizer implements Tokenizer {
protected abstract Tokenizer delegate();
static class MungingTokenizer implements Tokenizer {
private final Tokenizer tokenizer;
private final Function<CharSequence, CharSequence> munger;

MungingTokenizer(final Tokenizer tokenizer, final Function<CharSequence, CharSequence> munger) {
this.tokenizer = tokenizer;
this.munger = munger;
}

protected Tokenizer delegate() {
return tokenizer;
}

@Override
public List<Token> tokenize(CharSequence cs) {
return tokenizer.tokenize(munger.apply(cs));
}

@Override
public List<String> split(CharSequence cs) {
return tokenizer.split(munger.apply(cs));
}

@Override
public Tokenizer clone() throws CloneNotSupportedException {
Tokenizer copy = tokenizer.clone();
return new MungingTokenizer(copy, munger);
}

@Override
public void reset(final CharSequence cs) {
Expand Down Expand Up @@ -267,43 +291,6 @@ public TokenType getType() {
public ConfiguredObjectProvenance getProvenance() {
return delegate().getProvenance();
}

@Override
public Tokenizer clone() throws CloneNotSupportedException {
// must be overridden in subclass but can't be left out here or this class won't compile
throw new CloneNotSupportedException();
}
} // end class ForwardingTokenizer

static class MungingTokenizer extends ForwardingTokenizer {
private final Tokenizer tokenizer;
private final Function<CharSequence, CharSequence> munger;

MungingTokenizer(final Tokenizer tokenizer, final Function<CharSequence, CharSequence> munger) {
this.tokenizer = tokenizer;
this.munger = munger;
}

@Override
protected Tokenizer delegate() {
return tokenizer;
}

@Override
public List<Token> tokenize(CharSequence cs) {
return tokenizer.tokenize(munger.apply(cs));
}

@Override
public List<String> split(CharSequence cs) {
return tokenizer.split(munger.apply(cs));
}

@Override
public Tokenizer clone() throws CloneNotSupportedException {
Tokenizer copy = tokenizer.clone();
return new MungingTokenizer(copy, munger);
}
} // end class MungingTokenizer

}

0 comments on commit d8347af

Please sign in to comment.