From 67fde341abb0cf350cc226b12c997559c2c5f8a4 Mon Sep 17 00:00:00 2001 From: Przemko Robakowski Date: Fri, 22 Nov 2019 22:26:54 +0100 Subject: [PATCH] CSV Processor for Ingest This change adds new ingest processor that breaks line from CSV file into separate fields. By default it conforms to RFC 4180 but can be tweaked. Closes #49113 --- .../ingest/common/CsvParser.java | 172 ++++++++++++++++ .../ingest/common/CsvProcessor.java | 104 ++++++++++ .../ingest/common/IngestCommonPlugin.java | 3 +- .../ingest/common/CsvProcessorTests.java | 194 ++++++++++++++++++ 4 files changed, 472 insertions(+), 1 deletion(-) create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java create mode 100644 modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvProcessor.java create mode 100644 modules/ingest-common/src/test/java/org/elasticsearch/ingest/common/CsvProcessorTests.java diff --git a/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java new file mode 100644 index 0000000000000..6c5d5dc7dea83 --- /dev/null +++ b/modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/CsvParser.java @@ -0,0 +1,172 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
package org.elasticsearch.ingest.common;

import org.elasticsearch.ingest.IngestDocument;

/**
 * Single-pass, state-machine CSV line parser used by {@link CsvProcessor}.
 * Splits one line into fields and writes each field into the given
 * {@link IngestDocument} under the corresponding entry of {@code headers}.
 * Empty fields are skipped (no value is set for their header) and parsing
 * stops as soon as all headers have been consumed.
 */
final class CsvParser {

    // Parser states:
    // START      - at the beginning of a field (nothing of it consumed yet)
    // UNQUOTED   - inside a field that did not start with a quote
    // QUOTED     - inside a quoted field, opening quote already consumed
    // QUOTED_END - just saw a quote inside a quoted field; the next character
    //              decides whether it was an escaped quote ("") or the closing one
    private enum State {
        START, UNQUOTED, QUOTED, QUOTED_END
    }

    private final char quote;
    private final char separator;
    private final boolean trim;            // when true, leading spaces/tabs of each field are skipped
    private final String[] headers;        // target field names, consumed left to right
    private final IngestDocument ingestDocument;
    // accumulates field text only when it cannot be taken as a single substring
    // of the line, i.e. when escaped quotes were encountered in the field
    private final StringBuilder builder = new StringBuilder();
    private State state = State.START;
    private String line;                   // the line currently being parsed
    private int currentHeader;             // index of the header the next field is stored under
    private int startIndex;                // start of the current field's text within line
    private int length;                    // line.length(), cached
    private int currentIndex;              // current scan position within line

    CsvParser(IngestDocument ingestDocument, char quote, char separator, boolean trim, String[] headers) {
        this.ingestDocument = ingestDocument;
        this.quote = quote;
        this.separator = separator;
        this.trim = trim;
        this.headers = headers;
    }

    /**
     * Parses {@code line} and stores the extracted fields into the document.
     * Intended to be called once per parser instance (state is not reset between calls).
     *
     * @throws IllegalArgumentException if a quoted field is left unterminated, if a quote or
     *         line break appears inside an unquoted field, or if characters follow a closing quote
     */
    void process(String line) {
        this.line = line;
        length = line.length();
        // each process* helper advances currentIndex itself; a "true" return means
        // all headers have been filled and parsing can stop early
        for (currentIndex = 0; currentIndex < length; currentIndex++) {
            switch (state) {
                case START:
                    if (processStart()) {
                        return;
                    }
                    break;
                case UNQUOTED:
                    if (processUnquoted()) {
                        return;
                    }
                    break;
                case QUOTED:
                    processQuoted();
                    break;
                case QUOTED_END:
                    if (processQuotedEnd()) {
                        return;
                    }
                    break;
            }
        }

        //we've reached end of string, we need to handle last field
        switch (state) {
            case UNQUOTED:
                setField(length);
                break;
            case QUOTED_END:
                // length - 1 excludes the closing quote from the field text
                setField(length - 1);
                break;
            case QUOTED:
                throw new IllegalArgumentException("Unmatched quote");
        }
    }

    // Consumes the start of a field: optional leading whitespace (when trim is on)
    // and empty fields (consecutive separators, skipped without setting a value),
    // then decides between QUOTED and UNQUOTED for the field body.
    // Returns true when the line or the headers are exhausted.
    private boolean processStart() {
        for (; currentIndex < length; currentIndex++) {
            char c = line.charAt(currentIndex);
            if (c == quote) {
                state = State.QUOTED;
                builder.setLength(0);
                startIndex++;  // field text starts after the opening quote
                return false;
            } else if (c == separator) {
                startIndex++;  // empty field: skip it, advance to the next header
                if (nextHeader()) {
                    return true;
                }
            } else if (trim && (c == ' ' || c == '\t')) {
                startIndex++;  // leading whitespace is not part of the field
            } else {
                state = State.UNQUOTED;
                builder.setLength(0);
                return false;
            }
        }
        return true;
    }

    // Scans an unquoted field up to the next separator (or end of line).
    // Returns true when all headers have been filled.
    private boolean processUnquoted() {
        for (; currentIndex < length; currentIndex++) {
            char c = line.charAt(currentIndex);
            if (c == '\n' || c == '\r' || c == quote) {
                throw new IllegalArgumentException("Illegal character inside unquoted field at " + currentIndex);
            } else if (c == separator) {
                state = State.START;
                if (setField(currentIndex)) {
                    return true;
                }
                startIndex = currentIndex + 1;
                return false;
            }
        }
        return false;
    }

    // Inside a quoted field: skip ahead to the next quote, which is either an
    // escape (doubled quote) or the field terminator - QUOTED_END decides which.
    private void processQuoted() {
        for (; currentIndex < length; currentIndex++) {
            if (line.charAt(currentIndex) == quote) {
                state = State.QUOTED_END;
                break;
            }
        }
    }

    // Looks at the character right after a quote seen inside a quoted field.
    // Another quote means an escaped quote (""), a separator ends the field,
    // anything else is malformed input. Returns true when headers are exhausted.
    private boolean processQuotedEnd() {
        char c = line.charAt(currentIndex);
        if (c == quote) {
            // escaped quote: flush the text before it plus a single quote, then
            // continue the same quoted field right after the second quote
            builder.append(line, startIndex, currentIndex - 1).append(quote);
            startIndex = currentIndex + 1;
            state = State.QUOTED;
        } else if (c == separator) {
            // currentIndex - 1 excludes the closing quote from the field text
            if (setField(currentIndex - 1)) {
                return true;
            }
            startIndex = currentIndex + 1;
            state = State.START;
        } else {
            throw new IllegalArgumentException("Characters after quoted field at " + currentIndex);
        }
        return false;
    }

    // Stores line[startIndex, endIndex) (plus any escaped-quote text buffered in
    // builder) under the current header. Returns true when headers are exhausted.
    private boolean setField(int endIndex) {
        if (builder.length() == 0) {
            // fast path: the field contained no escaped quotes
            ingestDocument.setFieldValue(headers[currentHeader], line.substring(startIndex, endIndex));
        } else {
            builder.append(line, startIndex, endIndex);
            ingestDocument.setFieldValue(headers[currentHeader], builder.toString());
        }
        return nextHeader();
    }

    // Advances to the next header; true means there are no more headers left.
    private boolean nextHeader() {
        currentHeader++;
        return currentHeader == headers.length;
    }
}
package org.elasticsearch.ingest.common;

import org.elasticsearch.ingest.AbstractProcessor;
import org.elasticsearch.ingest.ConfigurationUtils;
import org.elasticsearch.ingest.IngestDocument;

import java.util.List;
import java.util.Map;

/**
 * A processor that breaks a line from a CSV file into separate fields.
 * If there are more fields requested than present in the CSV line, the extra
 * fields will not be present in the document after processing. In the same way
 * this processor will skip any field that is empty in the CSV line.
 *
 * By default it uses rules according to RFC 4180,
 * but it can be tweaked with the following parameters:
 *
 * quote: set custom quote character (defaults to ")
 * separator: set custom separator (defaults to ,)
 * trim: trim leading whitespaces in each field (allows also whitespaces before quoted fields, defaults to false)
 */
public class CsvProcessor extends AbstractProcessor {

    public static final String TYPE = "csv";

    private final String field;
    private final String[] headers;
    private final boolean trim;
    private final char quote;
    private final char separator;
    private final boolean ignoreMissing;

    public CsvProcessor(String tag, String field, String[] headers, boolean trim, char separator, char quote, boolean ignoreMissing) {
        super(tag);
        this.field = field;
        this.headers = headers;
        this.trim = trim;
        this.quote = quote;
        this.separator = separator;
        this.ignoreMissing = ignoreMissing;
    }

    @Override
    public IngestDocument execute(IngestDocument ingestDocument) {
        // no target fields requested - nothing to extract
        if (headers.length == 0) {
            return ingestDocument;
        }

        String line = ingestDocument.getFieldValue(field, String.class, ignoreMissing);
        // BUGFIX: the two null branches were inverted - the processor silently
        // returned when ignoreMissing was false and threw when it was true.
        if (line == null && ignoreMissing) {
            // field is absent/null and we were told to ignore that
            return ingestDocument;
        } else if (line == null) {
            throw new IllegalArgumentException("field [" + field + "] is null, cannot process it.");
        }
        new CsvParser(ingestDocument, quote, separator, trim, headers).process(line);
        return ingestDocument;
    }

    @Override
    public String getType() {
        return TYPE;
    }

    public static final class Factory implements org.elasticsearch.ingest.Processor.Factory {
        @Override
        public CsvProcessor create(Map<String, org.elasticsearch.ingest.Processor.Factory> registry, String processorTag,
                                   Map<String, Object> config) {
            String field = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "field");
            String quote = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "quote", "\"");
            if (quote.length() != 1) {
                throw new IllegalArgumentException("quote has to be single character like \" or '");
            }
            String separator = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "separator", ",");
            if (separator.length() != 1) {
                // BUGFIX: message used to show quote examples (" or ') which is misleading for the separator
                throw new IllegalArgumentException("separator has to be single character like , or ;");
            }
            boolean trim = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trim", false);
            boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
            List<String> targetFields = ConfigurationUtils.readOptionalList(TYPE, processorTag, config, "target_fields");
            return new CsvProcessor(processorTag, field, targetFields == null ? new String[0] : targetFields.toArray(String[]::new),
                trim, separator.charAt(0), quote.charAt(0), ignoreMissing);
        }
    }
}
package org.elasticsearch.ingest.common;

import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.ingest.IngestDocument;
import org.elasticsearch.ingest.RandomDocumentPicks;
import org.elasticsearch.test.ESTestCase;
import org.junit.Before;

import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * Tests for {@link CsvProcessor}. The suite is parameterized over the quote
 * character: single quote, double quote, and no quoting at all (empty string).
 * The separator is randomized per test from {@link #SEPARATORS}.
 */
public class CsvProcessorTests extends ESTestCase {

    private static final Character[] SEPARATORS = new Character[]{',', ';', '|', '.'};
    private final String quote;
    private char separator;

    public CsvProcessorTests(@Name("quote") String quote) {
        this.quote = quote;
    }

    @ParametersFactory
    public static Iterable<Object[]> parameters() {
        return Arrays.asList(new Object[]{"'"}, new Object[]{"\""}, new Object[]{""});
    }

    @Before
    public void setup() {
        separator = randomFrom(SEPARATORS);
    }

    // as many CSV fields as headers: every header must be populated
    public void testExactNumberOfFields() throws Exception {
        int numItems = randomIntBetween(2, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
        }
        String[] headers = items.keySet().toArray(new String[numItems]);
        String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers, csv);

        items.forEach((key, value) -> assertEquals(value, ingestDocument.getFieldValue(key, String.class)));
    }

    // fewer CSV fields than headers: the surplus headers must stay absent
    public void testLessFieldsThanHeaders() throws Exception {
        int numItems = randomIntBetween(4, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
        }
        String[] headers = items.keySet().toArray(new String[numItems]);
        String csv = items.values().stream().map(v -> quote + v + quote).limit(3).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers, csv);

        items.keySet().stream().skip(3).forEach(key -> assertFalse(ingestDocument.hasField(key)));
        items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class)));
    }

    // more CSV fields than headers: trailing fields are silently dropped
    public void testLessHeadersThanFields() throws Exception {
        int numItems = randomIntBetween(5, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
        }
        String[] headers = items.keySet().stream().limit(3).toArray(String[]::new);
        String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers, csv);

        items.entrySet().stream().limit(3).forEach(e -> assertEquals(e.getValue(), ingestDocument.getFieldValue(e.getKey(), String.class)));
    }

    public void testSingleField() throws Exception {
        String[] headers = new String[]{randomAlphaOfLengthBetween(5, 10)};
        String value = randomAlphaOfLengthBetween(5, 10);
        String csv = quote + value + quote;

        IngestDocument ingestDocument = processDocument(headers, csv);

        assertEquals(value, ingestDocument.getFieldValue(headers[0], String.class));
    }

    // doubled quote inside a quoted field must be unescaped to a single quote
    public void testEscapedQuote() throws Exception {
        int numItems = randomIntBetween(2, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10),
                randomAlphaOfLengthBetween(5, 10) + quote + quote + randomAlphaOfLengthBetween(5, 10) + quote + quote);
        }
        String[] headers = items.keySet().toArray(new String[numItems]);
        String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers, csv);

        items.forEach((key, value) -> assertEquals(value.replace(quote + quote, quote), ingestDocument.getFieldValue(key, String.class)));
    }

    // separators and line breaks are allowed verbatim inside quoted fields
    public void testQuotedStrings() throws Exception {
        assumeFalse("quote needed", quote.isEmpty());
        int numItems = randomIntBetween(2, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10),
                separator + randomAlphaOfLengthBetween(5, 10) + separator + "\n\r" + randomAlphaOfLengthBetween(5, 10));
        }
        String[] headers = items.keySet().toArray(new String[numItems]);
        String csv = items.values().stream().map(v -> quote + v + quote).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers, csv);

        items.forEach((key, value) -> assertEquals(value.replace(quote + quote, quote), ingestDocument.getFieldValue(key,
            String.class)));
    }

    // empty fields (consecutive separators) leave their headers unset
    public void testEmptyFields() throws Exception {
        int numItems = randomIntBetween(5, 10);
        Map<String, String> items = new LinkedHashMap<>();
        for (int i = 0; i < numItems; i++) {
            items.put(randomAlphaOfLengthBetween(5, 10), randomAlphaOfLengthBetween(5, 10));
        }
        String[] headers = items.keySet().toArray(new String[numItems]);
        String csv =
            items.values().stream().map(v -> quote + v + quote).limit(numItems - 1).skip(3).collect(Collectors.joining(separator + ""));

        IngestDocument ingestDocument = processDocument(headers,
            "" + separator + "" + separator + "" + separator + csv + separator + separator
                + "abc");

        items.keySet().stream().limit(3).forEach(key -> assertFalse(ingestDocument.hasField(key)));
        items.entrySet().stream().limit(numItems - 1).skip(3).forEach(e -> assertEquals(e.getValue(),
            ingestDocument.getFieldValue(e.getKey(), String.class)));
        items.keySet().stream().skip(numItems - 1).forEach(key -> assertFalse(ingestDocument.hasField(key)));
    }

    // malformed input: stray/unmatched quotes and raw line breaks must be rejected
    public void testWrongStrings() throws Exception {
        assumeTrue("single run only", quote.isEmpty());
        expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\"abc"));
        expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "\"abc\"asd"));
        expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "\"abcasd"));
        expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\nabc"));
        expectThrows(IllegalArgumentException.class, () -> processDocument(new String[]{"a"}, "abc\rabc"));
    }

    // no target fields configured: the document must pass through unchanged
    public void testEmptyHeaders() throws Exception {
        assumeTrue("single run only", quote.isEmpty());
        IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random());
        String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, "abc,abc");
        HashMap<String, Object> metadata = new HashMap<>(ingestDocument.getSourceAndMetadata());

        CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, new String[0], false, ',', '"', false);

        processor.execute(ingestDocument);

        assertEquals(metadata, ingestDocument.getSourceAndMetadata());
    }

    // helper: puts csv into a random field of a random document and runs the processor on it
    private IngestDocument processDocument(String[] headers, String csv) throws Exception {
        IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random());

        String fieldName = RandomDocumentPicks.addRandomField(random(), ingestDocument, csv);
        char quoteChar = quote.isEmpty() ? '"' : quote.charAt(0);
        CsvProcessor processor = new CsvProcessor(randomAlphaOfLength(5), fieldName, headers, true, separator, quoteChar, false);

        processor.execute(ingestDocument);

        return ingestDocument;
    }
}