Skip to content

Commit

Permalink
feature/RowProcessor-replaceNewlinesWithSpaces (#137)
Browse files Browse the repository at this point in the history
* new optional feature RowProcessor.replaceNewlinesWithSpaces and a basic unit test

* code review cleanup per @Craigacp

* replaceNewlinesWithSpaces=true unit test for @Craigacp

* weird that passed for me locally even after rebasing onto fresh `main`!?
  • Loading branch information
nezda authored Apr 29, 2021
1 parent 42ff2a0 commit fdf0ef4
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 4 deletions.
46 changes: 42 additions & 4 deletions Data/src/main/java/org/tribuo/data/columnar/RowProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ public class RowProcessor<T extends Output<T>> implements Configurable, Provenan
@Config(description="A map from a regex to field processors to apply to fields matching the regex.")
protected Map<String,FieldProcessor> regexMappingProcessors = new HashMap<>();

@Config(description="Replace newlines with spaces in values before passing them to field processors.")
protected boolean replaceNewlinesWithSpaces = true;

protected boolean configured;

/**
Expand Down Expand Up @@ -154,7 +157,36 @@ public RowProcessor(List<FieldExtractor<?>> metadataExtractors, ResponseProcesso
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
                    ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
                    Set<FeatureProcessor> featureProcessors) {
    // Delegate to the canonical constructor with no regex mappings and newline
    // replacement enabled (replaceNewlinesWithSpaces defaults to true).
    this(metadataExtractors,weightExtractor,responseProcessor,fieldProcessorMap,Collections.emptyMap(),featureProcessors, true);
}

/**
* Constructs a RowProcessor using the supplied responseProcessor to extract the response variable,
* and the supplied fieldProcessorMap to control which fields are parsed and how they are parsed.
* <p>
* In addition this processor can instantiate field processors which match the regexes supplied in
* the regexMappingProcessors. If a regex matches a field which already has a fieldProcessor assigned to
* it, it throws an IllegalArgumentException.
* <p>
* After extraction the features are then processed using the supplied set of feature processors.
* These processors can be used to insert conjunction features which are triggered when
* multiple features appear, or to filter out unnecessary features.
* <p>
* Additionally this processor can extract a weight from each row and insert it into the example, along
* with more general metadata fields (e.g., the row number, date stamps). The weightExtractor can be null,
* and if so the weights are left unset.
* <p>
* This overload defaults {@code replaceNewlinesWithSpaces} to {@code true}, so newlines in
* field values are replaced with spaces before being passed to the field processors.
* @param metadataExtractors The metadata extractors to run per example. If two metadata extractors emit
* the same metadata name then the constructor throws a PropertyException.
* @param weightExtractor The weight extractor, if null the weights are left unset at their default.
* @param responseProcessor The response processor to use.
* @param fieldProcessorMap The keys are the field names and the values are the field processors to apply to those fields.
* @param regexMappingProcessors A set of field processors which can be instantiated if the regexes match the field names.
* @param featureProcessors The feature processors to run on each extracted feature list.
*/
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
Map<String,FieldProcessor> regexMappingProcessors, Set<FeatureProcessor> featureProcessors) {
// replaceNewlinesWithSpaces defaults to true for backwards compatibility.
this(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, true);
}

/**
Expand All @@ -179,16 +211,19 @@ public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<F
* @param fieldProcessorMap The keys are the field names and the values are the field processors to apply to those fields.
* @param regexMappingProcessors A set of field processors which can be instantiated if the regexes match the field names.
* @param featureProcessors The feature processors to run on each extracted feature list.
* @param replaceNewlinesWithSpaces Replace newlines with spaces in values before passing them to field processors.
*/
public RowProcessor(List<FieldExtractor<?>> metadataExtractors, FieldExtractor<Float> weightExtractor,
                    ResponseProcessor<T> responseProcessor, Map<String,FieldProcessor> fieldProcessorMap,
                    Map<String,FieldProcessor> regexMappingProcessors, Set<FeatureProcessor> featureProcessors,
                    boolean replaceNewlinesWithSpaces) {
    // Defensive copies of the supplied collections; immutable empties where the input is empty.
    this.metadataExtractors = metadataExtractors.isEmpty() ? Collections.emptyList() : new ArrayList<>(metadataExtractors);
    this.weightExtractor = weightExtractor;
    this.responseProcessor = responseProcessor;
    // NOTE(review): kept as a mutable HashMap even when empty — presumably regex matches
    // are expanded into this map later (see partialExpandRegexMapping); confirm before changing.
    this.fieldProcessorMap = new HashMap<>(fieldProcessorMap);
    this.regexMappingProcessors = regexMappingProcessors.isEmpty() ? Collections.emptyMap() : new HashMap<>(regexMappingProcessors);
    this.featureProcessors.addAll(featureProcessors);
    this.replaceNewlinesWithSpaces = replaceNewlinesWithSpaces;
    // Validate the configuration (runs the same checks as OLCUT config instantiation).
    postConfig();
}

Expand Down Expand Up @@ -349,7 +384,10 @@ public List<ColumnarFeature> generateFeatures(Map<String,String> row) {
for (Map.Entry<String,FieldProcessor> e : fieldProcessorMap.entrySet()) {
String value = row.get(e.getKey());
if (value != null) {
value = value.replace('\n', ' ').trim();
if (replaceNewlinesWithSpaces) {
value = value.replace('\n', ' ');
}
value = value.trim();
features.addAll(e.getValue().process(value));
}
}
Expand Down Expand Up @@ -524,7 +562,7 @@ protected Set<String> partialExpandRegexMapping(Collection<String> fieldNames) {
*/
@Deprecated
public RowProcessor<T> copy() {
    // Copies the full configuration, including the replaceNewlinesWithSpaces flag,
    // so the copy processes values identically to this instance.
    return new RowProcessor<>(metadataExtractors, weightExtractor, responseProcessor, fieldProcessorMap, regexMappingProcessors, featureProcessors, replaceNewlinesWithSpaces);
}

@Override
Expand Down
141 changes: 141 additions & 0 deletions Data/src/test/java/org/tribuo/data/columnar/RowProcessorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.tribuo.data.columnar;

import com.oracle.labs.mlrg.olcut.config.PropertyException;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import org.tribuo.Example;
import org.tribuo.Feature;
import org.tribuo.data.columnar.extractors.DateExtractor;
Expand All @@ -26,8 +27,14 @@
import org.tribuo.data.columnar.extractors.IntExtractor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.field.IdentityProcessor;
import org.tribuo.data.columnar.processors.field.TextFieldProcessor;
import org.tribuo.data.text.impl.TokenPipeline;
import org.tribuo.test.MockOutput;
import org.junit.jupiter.api.Test;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Token.TokenType;
import org.tribuo.util.tokens.Tokenizer;
import org.tribuo.util.tokens.impl.BreakIteratorTokenizer;

import java.time.LocalDate;
import java.time.LocalTime;
Expand All @@ -40,9 +47,15 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.fail;

Expand Down Expand Up @@ -170,4 +183,132 @@ public void metadataExtractorTest() {
assertThrows(PropertyException.class, () -> new RowProcessor<>(badExtractors,weightExtractor,response,fixed,Collections.emptySet()));
}

@Test
public void replaceNewlinesWithSpacesTest() {
// Matches runs of blank (or whitespace/dash-only) lines; used below to make
// surviving newlines visible as distinct "*" tokens in the extracted features.
final Pattern BLANK_LINES = Pattern.compile("(\n[\\s-]*\n)+");

final Function<CharSequence, CharSequence> newLiner = (CharSequence charSequence) -> {
if (charSequence == null || charSequence.length() == 0) {
return charSequence;
}
return BLANK_LINES.splitAsStream(charSequence).collect(Collectors.joining(" *\n\n"));
};

// Tokenizer that munges the input text (via newLiner) before tokenizing; bigram pipeline.
Tokenizer tokenizer = new MungingTokenizer(new BreakIteratorTokenizer(Locale.US), newLiner);
TokenPipeline textPipeline = new TokenPipeline(tokenizer, 2, false);

final Map<String, FieldProcessor> fieldProcessors = new HashMap<>();
fieldProcessors.put("order_text", new TextFieldProcessor("order_text", textPipeline));

MockResponseProcessor response = new MockResponseProcessor("Label");

Map<String,String> row = new HashMap<>();
row.put("order_text", "Jimmy\n\n\n\nHoffa");
row.put("Label", "Sheep");

// replaceNewlinesWithSpaces=false: the blank lines reach the munging tokenizer intact.
RowProcessor<MockOutput> processor = new RowProcessor<>(Collections.emptyList(),null,response,fieldProcessors,Collections.emptyMap(),Collections.emptySet(), false);

Example<MockOutput> example = processor.generateExample(row,true).get();

// Check example is extracted correctly
assertEquals(5, example.size());
assertEquals("Sheep", example.getOutput().label);
Iterator<Feature> featureIterator = example.iterator();
Feature a = featureIterator.next();
assertEquals("order_text@1-N=*", a.getName());
assertEquals(1.0, a.getValue());
a = featureIterator.next();
assertEquals("order_text@1-N=Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@1-N=Jimmy", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=*/Hoffa", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=Jimmy/*", a.getName());
assertFalse(featureIterator.hasNext());

// same input with replaceNewlinesWithSpaces=true (the default) produces different features
processor = new RowProcessor<>(Collections.emptyList(),null,response,fieldProcessors,Collections.emptyMap(),Collections.emptySet(), true);

example = processor.generateExample(row,true).get();

// Check example is extracted correctly
assertEquals(3, example.size());
assertEquals("Sheep", example.getOutput().label);
featureIterator = example.iterator();
a = featureIterator.next();
assertEquals("order_text@1-N=Hoffa", a.getName());
assertEquals(1.0, a.getValue());
a = featureIterator.next();
assertEquals("order_text@1-N=Jimmy", a.getName());
a = featureIterator.next();
assertEquals("order_text@2-N=Jimmy/Hoffa", a.getName());
assertFalse(featureIterator.hasNext());
}

/**
 * A test-only {@code Tokenizer} decorator which applies a munging {@code Function} to the
 * input text before delegating tokenization to the wrapped tokenizer. All other calls are
 * forwarded unchanged to the delegate.
 * <p>
 * NOTE(review): the streaming API ({@link #reset}) does not apply the munger, so only
 * {@link #tokenize} and {@link #split} see munged text — confirm this is intentional.
 */
static class MungingTokenizer implements Tokenizer {
    private final Tokenizer tokenizer;
    private final Function<CharSequence, CharSequence> munger;

    /**
     * Wraps the supplied tokenizer, munging input text with {@code munger} before tokenization.
     * @param tokenizer The tokenizer to delegate to.
     * @param munger The text transformation applied before {@code tokenize}/{@code split}.
     */
    MungingTokenizer(final Tokenizer tokenizer, final Function<CharSequence, CharSequence> munger) {
        this.tokenizer = tokenizer;
        this.munger = munger;
    }

    /** Returns the wrapped tokenizer; all forwarding methods go through this accessor. */
    protected Tokenizer delegate() {
        return tokenizer;
    }

    @Override
    public List<Token> tokenize(CharSequence cs) {
        // Munge first, then tokenize the transformed text.
        return delegate().tokenize(munger.apply(cs));
    }

    @Override
    public List<String> split(CharSequence cs) {
        // Munge first, then split the transformed text.
        return delegate().split(munger.apply(cs));
    }

    @Override
    public Tokenizer clone() throws CloneNotSupportedException {
        // Clone the delegate; the munger function is shared as it carries no mutable state here.
        return new MungingTokenizer(delegate().clone(), munger);
    }

    @Override
    public void reset(final CharSequence cs) {
        delegate().reset(cs);
    }

    @Override
    public boolean advance() {
        return delegate().advance();
    }

    @Override
    public String getText() {
        return delegate().getText();
    }

    @Override
    public int getStart() {
        return delegate().getStart();
    }

    @Override
    public int getEnd() {
        return delegate().getEnd();
    }

    @Override
    public TokenType getType() {
        return delegate().getType();
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return delegate().getProvenance();
    }
} // end class MungingTokenizer

}

0 comments on commit fdf0ef4

Please sign in to comment.