From 39cf0f71659d0052ca78b6f7d252dcdeb303f2a0 Mon Sep 17 00:00:00 2001 From: Travis Patterson Date: Thu, 15 Aug 2024 13:21:49 -0600 Subject: [PATCH] fix: Revert "fix: Include whitespaces in extracted tokens (#13738)" (#13902) --- pkg/pattern/tokenization/tokenization.go | 19 ++-- pkg/pattern/tokenization/tokenization_test.go | 91 +++++++++---------- 2 files changed, 56 insertions(+), 54 deletions(-) diff --git a/pkg/pattern/tokenization/tokenization.go b/pkg/pattern/tokenization/tokenization.go index 52b239d049c7f..d4918ccb8e048 100644 --- a/pkg/pattern/tokenization/tokenization.go +++ b/pkg/pattern/tokenization/tokenization.go @@ -1,6 +1,7 @@ package tokenization import ( + "bytes" "unsafe" ) @@ -28,7 +29,7 @@ type tokenizer struct { tokens []string } -func (t *tokenizer) countOrSaveToken(endTokenPos int) { +func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) { if t.tokens != nil { // Intentionally written like this and not with append(), so this can // panic if we ever exceed the preallocated slice size, since that means @@ -36,7 +37,7 @@ func (t *tokenizer) countOrSaveToken(endTokenPos int) { t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos] } t.tokenCount++ - t.tpos = endTokenPos + t.tpos = endTokenPos + skip } func (t *tokenizer) handleNextToken() bool { @@ -54,7 +55,7 @@ func (t *tokenizer) handleNextToken() bool { // outside of a quoted string. case escaped: if curQuotePos < 0 && delimiters[c] { - t.countOrSaveToken(p + 1) + t.countOrSaveToken(p, 1) return true } else { escaped = false @@ -88,7 +89,7 @@ func (t *tokenizer) handleNextToken() bool { // If we encounter a delimiter outside of a quote, count or save the // token and skip the delimiter. case delimiters[c]: - t.countOrSaveToken(p + 1) + t.countOrSaveToken(p, 1) return true // Handle likely JSON object keys that have been serialized without @@ -107,11 +108,11 @@ func (t *tokenizer) handleNextToken() bool { // wasn't a delimiter right before the comma. case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen: if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] { - t.countOrSaveToken(p + 1) + t.countOrSaveToken(p+1, 0) return true } if c == ',' && t.line[p+1] == '"' { - t.countOrSaveToken(p) + t.countOrSaveToken(p, 0) return true } } @@ -125,12 +126,12 @@ func (t *tokenizer) handleNextToken() bool { // unterminated quote and the quote itself as a single token, and continue // fairly normally from there. if curQuotePos > 0 { - t.countOrSaveToken(curQuotePos + 1) + t.countOrSaveToken(curQuotePos+1, 0) return true } if t.tpos < len(t.line) { - t.countOrSaveToken(len(t.line)) + t.countOrSaveToken(len(t.line), 0) return true } @@ -192,6 +193,8 @@ func (t *tokenizer) tokenize() []string { } func PreprocessAndTokenize(content []byte) []string { + content = bytes.TrimSpace(content) + t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens return t.tokenize() diff --git a/pkg/pattern/tokenization/tokenization_test.go b/pkg/pattern/tokenization/tokenization_test.go index 1461e14d35c77..dde2814a2fe91 100644 --- a/pkg/pattern/tokenization/tokenization_test.go +++ b/pkg/pattern/tokenization/tokenization_test.go @@ -21,54 +21,53 @@ var tokenizationCornerTestCases = []tokenizationTestCase{ }, { " foo ", - // TOFIX: maybe this is not exactly what we want? should we try to combine empty space into a single token? 
- []string{" ", "foo ", " "}, + []string{"foo"}, }, { "foo bar baz", - []string{"foo ", "bar ", "baz"}, + []string{"foo", "bar", "baz"}, }, { "\nfoo\t bar baz\r\n", - // TOFIX: same as above - should we combine empty space into a single token? - []string{"\n", "foo\t", " ", " ", "bar ", "baz\r", "\n"}, + // TOFIX: remove empty tokens? + []string{"foo", "", "", "bar", "baz"}, }, { "ends single char C", - []string{"ends ", "single ", "char ", "C"}, + []string{"ends", "single", "char", "C"}, }, { "0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Z Y X W V U T S R Q P O N M L K J I H G F E D C B A z y x w v u t s r q p o n m l k j i h g f e d c b a 9 8 7 6 5 4 3 2 1 0", // The tail end of strings longer than maxTokens is returned as a single token - []string{" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "a ", "b ", "c ", "d ", "e ", "f ", "g ", "h ", "i ", "j ", "k ", "l ", "m ", "n ", "o ", "p ", "q ", "r ", "s ", "t ", "u ", "v ", "w ", "x ", "y ", "z ", "A ", "B ", "C ", "D ", "E ", "F ", "G ", "H ", "I ", "J ", "K ", "L ", "M ", "N ", "O ", "P ", "Q ", "R ", "S ", "T ", "U ", "V ", "W ", "X ", "Y ", "Z ", "Z ", "Y ", "X ", "W ", "V ", "U ", "T ", "S ", "R ", "Q ", "P ", "O ", "N ", "M ", "L ", "K ", "J ", "I ", "H ", "G ", "F ", "E ", "D ", "C ", "B ", "A ", "z ", "y ", "x ", "w ", "v ", "u ", "t ", "s ", "r ", "q ", "p ", "<...>"}, + []string{"", "", "", "", "", "", "", "", "", "", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "H", "G", "F", "E", "D", "C", "B", "A", "z", "y", "x", "w", "v", "u", "t", "s", "r", "q", "p", "<...>"}, }, { `a "quoted string"`, - []string{"a ", `"quoted string"`}, + []string{"a", `"quoted string"`}, }, { `a "quoted string with \" escapes!"`, - []string{"a ", `"quoted string with \" escapes!"`}, + []string{"a", `"quoted string with \" escapes!"`}, }, { `a 'singly quoted string"'`, - []string{"a ", `'singly quoted string"'`}, + []string{"a", `'singly quoted string"'`}, }, { `a 'singly quoted string" \''`, - []string{"a ", `'singly quoted string" \''`}, + []string{"a", `'singly quoted string" \''`}, }, { `a 'singly quoted string" \\\''`, - []string{"a ", `'singly quoted string" \\\''`}, + []string{"a", `'singly quoted string" \\\''`}, }, { `a'twisted excappe\\' "with an unterminated quote" 'at_the_end`, - []string{`a'twisted excappe\\' `, `"with an unterminated quote" `, `'`, "at_the_end"}, + []string{`a'twisted excappe\\'`, `"with an unterminated quote"`, `'`, "at_the_end"}, }, { `a "quoted string 'inception'"!` + "`woot`'much`wow'", - []string{"a ", `"quoted string 'inception'"!` + "`woot`'much`wow'"}, + []string{"a", `"quoted string 'inception'"!` + "`woot`'much`wow'"}, }, { `unterminated"quote`, @@ -76,11 +75,11 @@ var tokenizationCornerTestCases = []tokenizationTestCase{ }, { "`mix`" + ` "and" 'match'`, - []string{"`mix` ", `"and" `, `'match'`}, + []string{"`mix`", `"and"`, `'match'`}, }, { "`mix`" + ` "and" 'match'`, - []string{"`mix` ", `"and" `, `'match'`}, + []string{"`mix`", `"and"`, `'match'`}, }, { `{"json":"uninterrupted \"logline\"","foo":"bar"}`, @@ -105,60 +104,60 @@ var tokenizationCornerTestCases = []tokenizationTestCase{ // We deliberately do treat "escaped" whitespaces 
outside of quotes as // delimiters, i.e. whitespaces outside of strings cannot be escaped. `weird\ escape`, - []string{`weird\ `, `escape`}, + []string{`weird\`, `escape`}, }, { "-3.14-foo 0.0.0.0/24-0.0.0.1-255.255.255.255-256.255.255.255 1337-ber 0.12-ber n0tnumb3er 12faux -123.0.1.123 -123 -1231.11 333. 123.456. 123.45-", - []string{"-foo ", "/---... ", "-ber ", "-ber ", "n0tnumb3er ", "12faux ", "- ", " ", " ", ". ", ".. ", "-"}, + []string{"-foo", "/---...", "-ber", "-ber", "n0tnumb3er", "12faux", "-", "", "", ".", "..", "-"}, }, { "2022-12-31 12:12:31 3022-12-31 12:12:31-Jul 1 00:21:28", - []string{" ", "-- ", "::-"}, + []string{"", "--", "::-"}, }, { "2022/12/01 12:12:31 - 2022/13/32 12:12:31", - []string{" ", "- ", "// ", "::"}, + []string{"", "-", "//", "::"}, }, { "UUIDS: 123e4567-e89b-12d3-a456-426614174000, 550E8400-E29B-41D4-A716-446655440000, -00000000-0000-0000-0000-000000000000, 12345678-dead-beef-1337-000000000000 {c6ad1a63-10b5-460e-ab2c-05c13604539d} ''''", - []string{"UUIDS: ", ", ", ", ", "-, ", " ", "{} ", "''<>''"}, + []string{"UUIDS:", ",", ",", "-,", "", "{}", "''<>''"}, }, // Mixed case UUID and hex strings are ignored, to limit false positives { "Not UUIDS: 123e4567-E89B-12d3-a456-426614174000, 1234567-dead-beef-1337-00000000000a", - []string{"Not ", "UUIDS: ", "123e4567-E89B-12d3-a456-, ", "-dead-beef--"}, + []string{"Not", "UUIDS:", "123e4567-E89B-12d3-a456-,", "-dead-beef--"}, }, { "Hexes: 0x0123456789 0xabcdef0123 deadbeef1337-ABCDEF0123456?0123456789ab:FFFFAAAAFFFF Curses: 0x012345678 dEaDbeef1337 abcdefabcde ABCDEFABCDE 0xASDFASDFASDF abcdef0123456NOT", - []string{"Hexes: ", " ", " ", "-?: ", "Curses: ", "0x012345678 ", "dEaDbeef1337 ", "abcdefabcde ", "ABCDEFABCDE ", "0xASDFASDFASDF ", "abcdef0123456NOT"}, + []string{"Hexes:", "", "", "-?:", "Curses:", "0x012345678", "dEaDbeef1337", "abcdefabcde", "ABCDEFABCDE", "0xASDFASDFASDF", "abcdef0123456NOT"}, }, { "30546354_3313121680 0_123_456_789 foo_123", - []string{"_ ", "___ ", "foo_"}, + []string{"_", "___", "foo_"}, }, { `3.31ms/1h2m|-12h2m6.1s 31m "165m2.1s(6h0m12.05us)" -451325.31µs 6m23μs 123h21m3.4124561s/0s/-0.0123ms`, - []string{"/| ", " ", `"()" `, " ", " ", "//"}, + []string{"/|", "", `"()"`, "", "", "//"}, }, { // Invalid duration values "3.31.1ms 3h121m3.4124561s 1h0.12s 100usa 0.12msa", - []string{". ", "3h121m3. ", "1h0. 
", "100usa ", "0.12msa"}, + []string{".", "3h121m3.", "1h0.", "100usa", "0.12msa"}, }, { // We only consider integers to be valid bytesizes in bytes (0.2B doesn't make sense) "2Mib 0.12KB-5GB 3.12kb 123Gbps 124mbit:512Tbit 5 B;124.1 KB/3b - 2b or 2 BeNot 13.37 b 3 b", []string{ - " ", "- ", " ", " ", ": ", - ";/ ", "- ", " ", "or ", " ", "BeNot ", " ", "b ", ""}, + "", "-", "", "", ":", + ";/", "-", "", "or", "", "BeNot", "", "b", ""}, }, { `status=123 status_code:500 status 200 status="-1" status_code:"404" httpStatus=200`, - []string{"status=123 ", "status_code:500 ", "status ", "200 ", `status="-1" `, `status_code:"404" `, "httpStatus=200"}, + []string{"status=123", "status_code:500", "status", "200", `status="-1"`, `status_code:"404"`, "httpStatus=200"}, }, { `status_code_foo=123 status_code:500.1 status 2023-09-06T00:59:59.98 status:"404KiB"`, - []string{"status_code_foo= ", "status_code: ", "status ", " ", `status:""`}, + []string{"status_code_foo=", "status_code:", "status", "", `status:""`}, }, } @@ -169,27 +168,27 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{ `level=info ts=2023-09-06T00:59:59.982171323Z caller=metrics.go:160 component=frontend org_id=29 traceID=4b93729ff3efabd0 latency=fast query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " query_hash=1280418884 query_type=limited range_type=range length=20s start_delta=2h54m30.690801022s end_delta=2h54m10.690801238s step=1s duration=13.926955ms status=200 limit=1000 returned_lines=0 throughput=16MB total_bytes=219kB total_bytes_non_indexed_labels=2.1kB lines_per_second=14935 total_lines=208 post_filter_lines=208 total_entries=41 store_chunks_download_time=1.592805ms queue_time=127µs splits=0 shards=0 chunk_refs_fetch_time=3.599883ms cache_chunk_req=1 cache_chunk_hit=1 cache_chunk_bytes_stored=0 cache_chunk_bytes_fetched=480079 cache_chunk_download_time=1.307396ms cache_index_req=0 cache_index_hit=0 cache_index_download_time=0s cache_stats_results_req=1 cache_stats_results_hit=1 cache_stats_results_download_time=361.913µs cache_result_req=0 cache_result_hit=0 cache_result_download_time=0s token_id=gcom-1234`, []string{ - "level=info ", "ts= ", "caller=metrics.go: ", "component=frontend ", "org_id= ", "traceID= ", "latency=fast ", `query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} " `, "query_hash= ", "query_type=limited ", "range_type=range ", "length= ", "start_delta= ", "end_delta= ", "step= ", "duration= ", "status=200 ", "limit= ", "returned_lines= ", "throughput= ", "total_bytes= ", "total_bytes_non_indexed_labels= ", "lines_per_second= ", "total_lines= ", "post_filter_lines= ", "total_entries= ", "store_chunks_download_time= ", "queue_time= ", "splits= ", "shards= ", "chunk_refs_fetch_time= ", "cache_chunk_req= ", "cache_chunk_hit= ", "cache_chunk_bytes_stored= ", "cache_chunk_bytes_fetched= ", "cache_chunk_download_time= ", "cache_index_req= ", "cache_index_hit= ", "cache_index_download_time= ", "cache_stats_results_req= ", "cache_stats_results_hit= ", "cache_stats_results_download_time= ", "cache_result_req= ", "cache_result_hit= ", "cache_result_download_time= ", "token_id=gcom-", + "level=info", "ts=", "caller=metrics.go:", "component=frontend", "org_id=", "traceID=", "latency=fast", `query="{stream=\"stdout\",pod=\"loki-canary-nl54q\"} "`, "query_hash=", "query_type=limited", "range_type=range", "length=", "start_delta=", "end_delta=", "step=", "duration=", "status=200", "limit=", "returned_lines=", "throughput=", "total_bytes=", "total_bytes_non_indexed_labels=", "lines_per_second=", 
"total_lines=", "post_filter_lines=", "total_entries=", "store_chunks_download_time=", "queue_time=", "splits=", "shards=", "chunk_refs_fetch_time=", "cache_chunk_req=", "cache_chunk_hit=", "cache_chunk_bytes_stored=", "cache_chunk_bytes_fetched=", "cache_chunk_download_time=", "cache_index_req=", "cache_index_hit=", "cache_index_download_time=", "cache_stats_results_req=", "cache_stats_results_hit=", "cache_stats_results_download_time=", "cache_result_req=", "cache_result_hit=", "cache_result_download_time=", "token_id=gcom-", }, }, // logfmt from loki, with string multi-word messages { `level=debug ts=2023-09-06T00:59:59.98214402Z caller=shard_resolver.go:114 bytes=205kB chunks=2 streams=2 entries=200 msg="queried index" type=single matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" duration=9.498885ms from=2023-09-06T00:48:53.138Z through=2023-09-06T00:49:43.138Z length=50s`, []string{ - "level=debug ", "ts= ", "caller=shard_resolver.go: ", "bytes= ", "chunks= ", "streams= ", "entries= ", `msg="queried index" `, "type=single ", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}" `, "duration= ", "from= ", "through= ", "length=", + "level=debug", "ts=", "caller=shard_resolver.go:", "bytes=", "chunks=", "streams=", "entries=", `msg="queried index"`, "type=single", `matchers="{stream=\"stdout\", pod=\"loki-canary-v75j4\"}"`, "duration=", "from=", "through=", "length=", }, }, // tricky loki distributor message: { `level=debug ts=2024-07-12T12:25:06.175464934Z caller=push.go:146 org_id=29 traceID=7af4f918eab1c80f msg="push request parsed" path=/loki/api/v1/push contentType=application/x-protobuf contentEncoding= bodySize="8.8 kB" streams=11 entries=43 streamLabelsSize="3.4 kB" entriesSize="19 kB" structuredMetadataSize="71 B" totalSize="22 kB" mostRecentLagMs=167 adaptiveLogsDroppedLines=10 adaptiveLogsDroppedSize=4965 adaptiveLogsMatchedLines=37`, []string{ - "level=debug ", "ts= ", "caller=push.go: ", "org_id= ", "traceID= ", `msg="push request parsed" `, "path=/loki/api/v1/push ", "contentType=application/x-protobuf ", "contentEncoding= ", `bodySize="" `, "streams= ", "entries= ", `streamLabelsSize="" `, `entriesSize="" `, `structuredMetadataSize="" `, `totalSize="" `, "mostRecentLagMs= ", "adaptiveLogsDroppedLines= ", "adaptiveLogsDroppedSize= ", "adaptiveLogsMatchedLines=", + "level=debug", "ts=", "caller=push.go:", "org_id=", "traceID=", `msg="push request parsed"`, "path=/loki/api/v1/push", "contentType=application/x-protobuf", "contentEncoding=", `bodySize=""`, "streams=", "entries=", `streamLabelsSize=""`, `entriesSize=""`, `structuredMetadataSize=""`, `totalSize=""`, "mostRecentLagMs=", "adaptiveLogsDroppedLines=", "adaptiveLogsDroppedSize=", "adaptiveLogsMatchedLines=", }, }, // random JSON logs { `{"timestamp": "2022-12-23T12:34:56Z", "level": "debug", "message": "Server starting", "server_id": "abcdefghij", "start_time": "2022-12-23T12:30:00Z"}`, - []string{`{"timestamp": `, `"", `, `"level": `, `"debug", `, `"message": `, `"Server starting", `, `"server_id": `, `"abcdefghij", `, `"start_time": `, `""}`}, + []string{`{"timestamp":`, `"",`, `"level":`, `"debug",`, `"message":`, `"Server starting",`, `"server_id":`, `"abcdefghij",`, `"start_time":`, `""}`}, }, { // JSON logs without spaces between elements, like how JavaScript's JSON.Stringify() produces: @@ -204,7 +203,7 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{ // - timestamp is not correctly detected // - empty "" token? 
`03-17 16:13:40.345 1702 14638 D PowerManagerService: release:lock=166121161, flg=0x0, tag="RILJ_ACK_WL", name=com.android.phone", ws=null, uid=1001, pid=2626`, - []string{"- ", ":: ", " ", " ", " ", "D ", "PowerManagerService: ", "release:lock=, ", "flg=0x0, ", `tag="RILJ_ACK_WL", `, `name=com.android.phone"`, ", ", "ws=null, ", "uid=, ", "pid="}, + []string{"-", "::", "", "", "", "D", "PowerManagerService:", "release:lock=,", "flg=0x0,", `tag="RILJ_ACK_WL",`, `name=com.android.phone"`, ",", "ws=null,", "uid=,", "pid="}, }, { // TOFIX: @@ -212,52 +211,52 @@ var tokenizationRealisticTestCases = []tokenizationTestCase{ // - empty "" tokens `03-17 16:13:47.518 1702 8671 D ActivityManager: Skipping, withExcluded: false, tr.intent:Intent { act=android.intent.action.VIEW dat=file:///storage/emulated/0/Tencent/QQfile_recv/b.apk typ=application/vnd.android.package-archive flg=0x10800000 cmp=com.android.packageinstaller/.PackageInstallerActivity (has extras) }`, []string{ - "- ", ":: ", " ", " ", " ", " ", "D ", "ActivityManager: ", "Skipping, ", "withExcluded: ", "false, ", "tr.intent:Intent ", "{ ", "act=android.intent.action.VIEW ", "dat=file:///storage/emulated//Tencent/QQfile_recv/b.apk ", "typ=application/vnd.android.package-archive ", "flg=0x10800000 ", "cmp=com.android.packageinstaller/.PackageInstallerActivity ", "(has ", "extras) ", "}", + "-", "::", "", "", "", "", "D", "ActivityManager:", "Skipping,", "withExcluded:", "false,", "tr.intent:Intent", "{", "act=android.intent.action.VIEW", "dat=file:///storage/emulated//Tencent/QQfile_recv/b.apk", "typ=application/vnd.android.package-archive", "flg=0x10800000", "cmp=com.android.packageinstaller/.PackageInstallerActivity", "(has", "extras)", "}", }, }, // Apache logs from https://github.com/logpai/logparser/blob/main/data/loghub_2k/Apache/Apache_2k.log { `[Mon Dec 05 13:16:27 2005] [notice] jk2_init() Found child 5877 in scoreboard slot 9`, - []string{"[] ", "[notice] ", "jk2_init() ", "Found ", "child ", " ", "in ", "scoreboard ", "slot ", ""}, + []string{"[]", "[notice]", "jk2_init()", "Found", "child", "", "in", "scoreboard", "slot", ""}, }, { `[Mon Dec 05 19:14:11 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties`, - []string{"[] ", "[notice] ", "workerEnv.init() ", "ok ", "/etc/httpd/conf/workers2.properties"}, + []string{"[]", "[notice]", "workerEnv.init()", "ok", "/etc/httpd/conf/workers2.properties"}, }, // nginx logs by running `docker run -p 80:80 -v $(pwd):/usr/share/nginx/html nginx` locally { `2024/03/27 14:31:42 [error] 29#29: *1 directory index of "/usr/share/nginx/html/" is forbidden, client: 172.17.0.1, server: localhost, request: "GET / HTTP/1.1", host: "127.0.0.1"`, - []string{" ", "[error] ", "#: ", "* ", "directory ", "index ", "of ", "\"/usr/share/nginx/html/\" ", "is ", "forbidden, ", "client: ", ", ", "server: ", "localhost, ", "request: ", `"GET / HTTP/", `, "host: ", "\"\""}, + []string{"", "[error]", "#:", "*", "directory", "index", "of", "\"/usr/share/nginx/html/\"", "is", "forbidden,", "client:", ",", "server:", "localhost,", "request:", `"GET / HTTP/",`, "host:", "\"\""}, }, { // TOFIX: // - probably not all numbers should be replaced with , e.g. 
for "*1", "(2:", "HTTP/1.1" it's definitely a worse UX `2024/03/27 14:34:37 [error] 29#29: *1 open() "/usr/share/nginx/html/test url with spaces" failed (2: No such file or directory), client: 172.17.0.1, server: localhost, request: "GET /test%20url%20with%20spaces HTTP/1.1", host: "127.0.0.1" 172.17.0.1 - - [31/Mar/2024:14:34:37 +0000] "GET /test%20url%20with%20spaces HTTP/1.1" 404 153 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0" "-"`, - []string{" ", "[error] ", "#: ", "* ", "open() ", `"/usr/share/nginx/html/test url with spaces" `, "failed ", "(: ", "No ", "such ", "file ", "or ", "directory), ", "client: ", ", ", "server: ", "localhost, ", "request: ", `"GET /test%20url%20with%20spaces HTTP/", `, "host: ", "\"\"\n", "\t", "\t", " ", "- ", "- ", "[] ", `"GET /test%20url%20with%20spaces HTTP/" `, " ", " ", "\"-\" ", `"Mozilla/ (X11; Linux x86_; rv:) Gecko/ Firefox/" `, `"-"`}, + []string{"", "[error]", "#:", "*", "open()", `"/usr/share/nginx/html/test url with spaces"`, "failed", "(:", "No", "such", "file", "or", "directory),", "client:", ",", "server:", "localhost,", "request:", `"GET /test%20url%20with%20spaces HTTP/",`, "host:", "\"\"", "", "", "", "-", "-", "[]", `"GET /test%20url%20with%20spaces HTTP/"`, "", "", "\"-\"", `"Mozilla/ (X11; Linux x86_; rv:) Gecko/ Firefox/"`, `"-"`}, }, // Linux systemd (journalctl) logs { `Mar 27 11:52:21 hostname systemd-networkd[2043]: enp6s0: LLDP Rx: Invoking callback for 'refreshed' event.`, - []string{" ", "hostname ", "systemd-networkd[]: ", "enp6s0: ", "LLDP ", "Rx: ", "Invoking ", "callback ", "for ", "'refreshed' ", "event."}, + []string{"", "hostname", "systemd-networkd[]:", "enp6s0:", "LLDP", "Rx:", "Invoking", "callback", "for", "'refreshed'", "event."}, }, { `Feb 29 23:00:14 nixos dbus-daemon[11432]: [system] Activating via systemd: service name='org.opensuse.Snapper' unit='snapperd.service' requested by ':1.324' (uid=0 pid=22089 comm="/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe" label="kernel")`, - []string{" ", "nixos ", "dbus-daemon[]: ", "[system] ", "Activating ", "via ", "systemd: ", "service ", "name='org.opensuse.Snapper' ", "unit='snapperd.service' ", "requested ", "by ", "':' ", "(uid= ", "pid= ", "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\" ", "label=\"kernel\")"}, + []string{"", "nixos", "dbus-daemon[]:", "[system]", "Activating", "via", "systemd:", "service", "name='org.opensuse.Snapper'", "unit='snapperd.service'", "requested", "by", "':'", "(uid=", "pid=", "comm=\"/nix/store/7rgimysvkczzyiaq4fkfymyjad4vbd9c-snappe\"", "label=\"kernel\")"}, }, // Random slack logs: { `Apr-10 23:43:46.807 [API-Q] (T02S4RCS0) c37dfd20-1712781826.804 conversations.suggestions is ACTIVE`, - []string{" ", "[API-Q] ", "(T02S4RCS0) ", "c37dfd20- ", "conversations.suggestions ", "is ", "ACTIVE"}, + []string{"", "[API-Q]", "(T02S4RCS0)", "c37dfd20-", "conversations.suggestions", "is", "ACTIVE"}, }, { `Apr-11 00:01:57.743 [DEVICE-PERMISSIONS-MA] Permissions saved to local storage: {"permissions":{"microphone":"granted","camera":"prompt","screen":"prompt"},"timestamp":1712782917742}`, - []string{" ", "[DEVICE-PERMISSIONS-MA] ", "Permissions ", "saved ", "to ", "local ", "storage: ", `{"permissions":`, `{"microphone":`, `"granted"`, `,"camera":`, `"prompt"`, `,"screen":`, `"prompt"}`, `,"timestamp":`, `}`}, + []string{"", "[DEVICE-PERMISSIONS-MA]", "Permissions", "saved", "to", "local", "storage:", `{"permissions":`, `{"microphone":`, `"granted"`, `,"camera":`, `"prompt"`, `,"screen":`, 
`"prompt"}`, `,"timestamp":`, `}`}, }, // Another weird log from loki: { `ts=2023-09-06T00:59:59.900879737Z caller=spanlogger.go:86 user=29 level=debug msg="querying ingester" params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=2023-09-05 23:20:28.030285153 +0000 UTC, end=2023-09-05 23:20:48.030285153 +0000 UTC, limit=1000, shards="`, - []string{"ts= ", "caller=spanlogger.go: ", "user= ", "level=debug ", `msg="querying ingester" `, `params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=, end=, limit=, shards="`}, + []string{"ts=", "caller=spanlogger.go:", "user=", "level=debug", `msg="querying ingester"`, `params="selector={stream=\"stdout\", pod=\"loki-canary-t98wq\"}, direction=BACKWARD, start=, end=, limit=, shards="`}, }, // {``, []string{}}, } @@ -365,7 +364,7 @@ func TestTokenizationMemcpy(t *testing.T) { func TestTokenizationPlayground(t *testing.T) { tc := tokenizationTestCase{ "foo 121113.21231 bar 123.0.1.123 -123 -1231.11", - []string{"foo ", " ", "bar ", " ", " ", ""}, + []string{"foo", "", "bar", "", "", ""}, } result := PreprocessAndTokenize([]byte(tc.line)) assert.Equal(