From fe9abbc486002d416d1c297f3f72bf0d0102dc9a Mon Sep 17 00:00:00 2001 From: James Gowdy Date: Tue, 14 Apr 2020 18:58:22 +0100 Subject: [PATCH] [ML] Improving parsing of large uploaded files (#62970) * [ML] Improving parsing of large uploaded files * small clean up * increasing max to 1GB * adding comments Co-authored-by: Elastic Machine --- .../common/constants/file_datavisualizer.ts | 4 +- .../ml/common/types/file_datavisualizer.ts | 4 +- .../file_datavisualizer_view.js | 28 +++++------ .../components/import_view/import_view.js | 4 +- .../import_view/importer/importer.ts | 48 ++++++++++++++++--- .../import_view/importer/message_importer.ts | 34 ++++++------- .../import_view/importer/ndjson_importer.ts | 35 ++++++++++---- .../file_based/components/utils/index.ts | 1 - .../file_based/components/utils/utils.ts | 22 ++++----- 9 files changed, 113 insertions(+), 67 deletions(-) diff --git a/x-pack/plugins/ml/common/constants/file_datavisualizer.ts b/x-pack/plugins/ml/common/constants/file_datavisualizer.ts index 81d51bfa2581..675247af2db9 100644 --- a/x-pack/plugins/ml/common/constants/file_datavisualizer.ts +++ b/x-pack/plugins/ml/common/constants/file_datavisualizer.ts @@ -4,8 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ -export const MAX_BYTES = 104857600; -export const ABSOLUTE_MAX_BYTES = MAX_BYTES * 5; +export const MAX_BYTES = 104857600; // 100MB +export const ABSOLUTE_MAX_BYTES = 1073741274; // 1GB export const FILE_SIZE_DISPLAY_FORMAT = '0,0.[0] b'; // Value to use in the Elasticsearch index mapping meta data to identify the diff --git a/x-pack/plugins/ml/common/types/file_datavisualizer.ts b/x-pack/plugins/ml/common/types/file_datavisualizer.ts index f771547b9781..c997a4e24f86 100644 --- a/x-pack/plugins/ml/common/types/file_datavisualizer.ts +++ b/x-pack/plugins/ml/common/types/file_datavisualizer.ts @@ -67,13 +67,15 @@ export interface ImportResponse { export interface ImportFailure { item: number; reason: string; - doc: Doc; + doc: ImportDoc; } export interface Doc { message: string; } +export type ImportDoc = Doc | string; + export interface Settings { pipeline?: string; index: string; diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/file_datavisualizer_view/file_datavisualizer_view.js b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/file_datavisualizer_view/file_datavisualizer_view.js index d1b615a878b2..c73ab4b9e11c 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/file_datavisualizer_view/file_datavisualizer_view.js +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/file_datavisualizer_view/file_datavisualizer_view.js @@ -24,14 +24,11 @@ import { readFile, createUrlOverrides, processResults, - reduceData, hasImportPermission, } from '../utils'; import { MODE } from './constants'; -const UPLOAD_SIZE_MB = 5; - export class FileDataVisualizerView extends Component { constructor(props) { super(props); @@ -40,6 +37,7 @@ export class FileDataVisualizerView extends Component { files: {}, fileName: '', fileContents: '', + data: [], fileSize: 0, fileTooLarge: false, fileCouldNotBeRead: false, @@ -79,6 +77,7 @@ export class FileDataVisualizerView extends Component { loaded: false, fileName: '', fileContents: '', + data: [], fileSize: 0, fileTooLarge: false, fileCouldNotBeRead: false, @@ -97,15 +96,15 @@ export class FileDataVisualizerView extends Component { async loadFile(file) { if (file.size <= this.maxFileUploadBytes) { try { - const fileContents = await readFile(file); - const data = fileContents.data; + const { data, fileContents } = await readFile(file); this.setState({ - fileContents: data, + data, + fileContents, fileName: file.name, fileSize: file.size, }); - await this.loadSettings(data); + await this.analyzeFile(fileContents); } catch (error) { this.setState({ loaded: false, @@ -124,14 +123,9 @@ export class FileDataVisualizerView extends Component { } } - async loadSettings(data, overrides, isRetry = false) { + async analyzeFile(fileContents, overrides, isRetry = false) { try { - // reduce the amount of data being sent to the endpoint - // 5MB should be enough to contain 1000 lines - const lessData = reduceData(data, UPLOAD_SIZE_MB); - console.log('overrides', overrides); - const { analyzeFile } = ml.fileDatavisualizer; - const resp = await analyzeFile(lessData, overrides); + const resp = await ml.fileDatavisualizer.analyzeFile(fileContents, overrides); const serverSettings = processResults(resp); const serverOverrides = resp.overrides; @@ -198,7 +192,7 @@ export class FileDataVisualizerView extends Component { loading: true, loaded: false, }); - this.loadSettings(data, this.previousOverrides, true); + this.analyzeFile(fileContents, this.previousOverrides, true); } } } @@ -240,7 +234,7 @@ export class FileDataVisualizerView extends Component { }, () => { const formattedOverrides = createUrlOverrides(overrides, this.originalSettings); - this.loadSettings(this.state.fileContents, formattedOverrides); + this.analyzeFile(this.state.fileContents, formattedOverrides); } ); }; @@ -261,6 +255,7 @@ export class FileDataVisualizerView extends Component { results, explanation, fileContents, + data, fileName, fileSize, fileTooLarge, @@ -339,6 +334,7 @@ export class FileDataVisualizerView extends Component { results={results} fileName={fileName} fileContents={fileContents} + data={data} indexPatterns={this.props.indexPatterns} kibanaConfig={this.props.kibanaConfig} showBottomBar={this.showBottomBar} diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/import_view.js b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/import_view.js index 4c9579bfd4b4..2bf7bbeb641d 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/import_view.js +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/import_view.js @@ -94,7 +94,7 @@ export class ImportView extends Component { // TODO - sort this function out. it's a mess async import() { - const { fileContents, results, indexPatterns, kibanaConfig, showBottomBar } = this.props; + const { data, results, indexPatterns, kibanaConfig, showBottomBar } = this.props; const { format } = results; let { timeFieldName } = this.state; @@ -217,7 +217,7 @@ export class ImportView extends Component { if (success) { const importer = importerFactory(format, results, indexCreationSettings); if (importer !== undefined) { - const readResp = importer.read(fileContents, this.setReadProgress); + const readResp = importer.read(data, this.setReadProgress); success = readResp.success; this.setState({ readStatus: success ? IMPORT_STATUS.COMPLETE : IMPORT_STATUS.FAILED, diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/importer.ts b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/importer.ts index c97f1c147c45..718587ad15ad 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/importer.ts +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/importer.ts @@ -9,7 +9,7 @@ import moment from 'moment'; import { i18n } from '@kbn/i18n'; import { ml } from '../../../../../services/ml_api_service'; import { - Doc, + ImportDoc, ImportFailure, ImportResponse, Mappings, @@ -20,6 +20,7 @@ import { const CHUNK_SIZE = 5000; const MAX_CHUNK_CHAR_COUNT = 1000000; const IMPORT_RETRIES = 5; +const STRING_CHUNKS_MB = 100; export interface ImportConfig { settings: Settings; @@ -34,12 +35,19 @@ export interface ImportResults { error?: any; } -export class Importer { +export interface CreateDocsResponse { + success: boolean; + remainder: number; + docs: ImportDoc[]; + error?: any; +} + +export abstract class Importer { private _settings: Settings; private _mappings: Mappings; private _pipeline: IngestPipeline; - protected _docArray: Doc[] = []; + protected _docArray: ImportDoc[] = []; constructor({ settings, mappings, pipeline }: ImportConfig) { this._settings = settings; @@ -47,7 +55,33 @@ export class Importer { this._pipeline = pipeline; } - async initializeImport(index: string) { + public read(data: ArrayBuffer) { + const decoder = new TextDecoder(); + const size = STRING_CHUNKS_MB * Math.pow(2, 20); + + // chop the data up into 100MB chunks for processing. + // if the chop produces a partial line at the end, a character "remainder" count + // is returned which is used to roll the next chunk back that many chars so + // it is included in the next chunk. + const parts = Math.ceil(data.byteLength / size); + let remainder = 0; + for (let i = 0; i < parts; i++) { + const byteArray = decoder.decode(data.slice(i * size - remainder, (i + 1) * size)); + const { success, docs, remainder: tempRemainder } = this._createDocs(byteArray); + if (success) { + this._docArray = this._docArray.concat(docs); + remainder = tempRemainder; + } else { + return { success: false }; + } + } + + return { success: true }; + } + + protected abstract _createDocs(t: string): CreateDocsResponse; + + public async initializeImport(index: string) { const settings = this._settings; const mappings = this._mappings; const pipeline = this._pipeline; @@ -75,7 +109,7 @@ export class Importer { return createIndexResp; } - async import( + public async import( id: string, index: string, pipelineId: string, @@ -201,8 +235,8 @@ function updatePipelineTimezone(ingestPipeline: IngestPipeline) { } } -function createDocumentChunks(docArray: Doc[]) { - const chunks: Doc[][] = []; +function createDocumentChunks(docArray: ImportDoc[]) { + const chunks: ImportDoc[][] = []; // chop docArray into 5000 doc chunks const tempChunks = chunk(docArray, CHUNK_SIZE); diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/message_importer.ts b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/message_importer.ts index 7ccc5a8d673f..65be24d9e7be 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/message_importer.ts +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/message_importer.ts @@ -4,7 +4,7 @@ * you may not use this file except in compliance with the Elastic License. */ -import { Importer, ImportConfig } from './importer'; +import { Importer, ImportConfig, CreateDocsResponse } from './importer'; import { Doc, FindFileStructureResponse, @@ -33,54 +33,54 @@ export class MessageImporter extends Importer { // multiline_start_pattern regex // if it does, it is a legitimate end of line and can be pushed into the list, // if not, it must be a newline char inside a field value, so keep looking. - read(text: string) { + protected _createDocs(text: string): CreateDocsResponse { + let remainder = 0; try { - const data: Doc[] = []; + const docs: Doc[] = []; let message = ''; let line = ''; for (let i = 0; i < text.length; i++) { const char = text[i]; if (char === '\n') { - message = this.processLine(data, message, line); + message = this._processLine(docs, message, line); line = ''; } else { line += char; } } - // the last line may have been missing a newline ending - if (line !== '') { - message = this.processLine(data, message, line); - } + remainder = line.length; - // add the last message to the list if not already done + // // add the last message to the list if not already done if (message !== '') { - this.addMessage(data, message); + this._addMessage(docs, message); } // remove first line if it is blank - if (data[0] && data[0].message === '') { - data.shift(); + if (docs[0] && docs[0].message === '') { + docs.shift(); } - this._docArray = data; - return { success: true, + docs, + remainder, }; } catch (error) { return { success: false, + docs: [], + remainder, error, }; } } - processLine(data: Doc[], message: string, line: string) { + private _processLine(data: Doc[], message: string, line: string) { if (this._excludeLinesRegex === null || line.match(this._excludeLinesRegex) === null) { if (this._multilineStartRegex === null || line.match(this._multilineStartRegex) !== null) { - this.addMessage(data, message); + this._addMessage(data, message); message = ''; } else if (data.length === 0) { // discard everything before the first line that is considered the first line of a message @@ -95,7 +95,7 @@ export class MessageImporter extends Importer { return message; } - addMessage(data: Doc[], message: string) { + private _addMessage(data: Doc[], message: string) { // if the message ended \r\n (Windows line endings) // then omit the \r as well as the \n for consistency message = message.replace(/\r$/, ''); diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/ndjson_importer.ts b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/ndjson_importer.ts index 7f5f37abc524..17c9de8ef455 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/ndjson_importer.ts +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/import_view/importer/ndjson_importer.ts @@ -4,7 +4,7 @@ * you may not use this file except in compliance with the Elastic License. */ -import { Importer, ImportConfig } from './importer'; +import { Importer, ImportConfig, CreateDocsResponse } from './importer'; import { FindFileStructureResponse } from '../../../../../../../common/types/file_datavisualizer'; export class NdjsonImporter extends Importer { @@ -12,27 +12,42 @@ export class NdjsonImporter extends Importer { super(settings); } - read(json: string) { + protected _createDocs(json: string): CreateDocsResponse { + let remainder = 0; try { const splitJson = json.split(/}\s*\n/); + const incompleteLastLine = json.match(/}\s*\n?$/) === null; - const ndjson: any[] = []; - for (let i = 0; i < splitJson.length; i++) { - if (splitJson[i] !== '') { - // note the extra } at the end of the line, adding back - // the one that was eaten in the split - ndjson.push(`${splitJson[i]}}`); + const docs: string[] = []; + if (splitJson.length) { + for (let i = 0; i < splitJson.length - 1; i++) { + if (splitJson[i] !== '') { + // note the extra } at the end of the line, adding back + // the one that was eaten in the split + docs.push(`${splitJson[i]}}`); + } } - } - this._docArray = ndjson; + const lastDoc = splitJson[splitJson.length - 1]; + if (lastDoc) { + if (incompleteLastLine === true) { + remainder = lastDoc.length; + } else { + docs.push(`${lastDoc}}`); + } + } + } return { success: true, + docs, + remainder, }; } catch (error) { return { success: false, + docs: [], + remainder, error, }; } diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/index.ts b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/index.ts index 0f0036a7c461..492a797f7a2f 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/index.ts +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/index.ts @@ -9,7 +9,6 @@ export { hasImportPermission, processResults, readFile, - reduceData, getMaxBytes, getMaxBytesFormatted, } from './utils'; diff --git a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/utils.ts b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/utils.ts index 0d2016b71ed8..ecef01aae051 100644 --- a/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/utils.ts +++ b/x-pack/plugins/ml/public/application/datavisualizer/file_based/components/utils/utils.ts @@ -15,6 +15,7 @@ import { import { getMlConfig } from '../../../../util/dependency_cache'; const DEFAULT_LINES_TO_SAMPLE = 1000; +const UPLOAD_SIZE_MB = 5; const overrideDefaults = { timestampFormat: undefined, @@ -34,15 +35,22 @@ export function readFile(file: File) { return new Promise((resolve, reject) => { if (file && file.size) { const reader = new FileReader(); - reader.readAsText(file); + reader.readAsArrayBuffer(file); reader.onload = (() => { return () => { + const decoder = new TextDecoder(); const data = reader.result; - if (data === '') { + if (data === null || typeof data === 'string') { + return reject(); + } + const size = UPLOAD_SIZE_MB * Math.pow(2, 20); + const fileContents = decoder.decode(data.slice(0, size)); + + if (fileContents === '') { reject(); } else { - resolve({ data }); + resolve({ fileContents, data }); } }; })(); @@ -52,14 +60,6 @@ export function readFile(file: File) { }); } -export function reduceData(data: string, mb: number) { - // assuming ascii characters in the file where 1 char is 1 byte - // TODO - change this when other non UTF-8 formats are - // supported for the read data - const size = mb * Math.pow(2, 20); - return data.length >= size ? data.slice(0, size) : data; -} - export function getMaxBytes() { const maxBytes = getMlConfig().file_data_visualizer.max_file_size_bytes; return maxBytes < ABSOLUTE_MAX_BYTES ? maxBytes : ABSOLUTE_MAX_BYTES;