From e37a7c1ebab5925297a93f952fff35642cfec56c Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 17:44:26 -0500 Subject: [PATCH 1/8] Add mimetype processor --- NOTICE.txt | 4 +- go.mod | 2 +- go.sum | 1 + libbeat/processors/mime/config.go | 42 ++++++ libbeat/processors/mime/mime.go | 126 ++++++++++++++++++ libbeat/processors/mime/mime_test.go | 134 ++++++++++++++++++++ packetbeat/_meta/config/processors.yml.tmpl | 6 + packetbeat/packetbeat.yml | 6 + x-pack/packetbeat/packetbeat.yml | 6 + 9 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 libbeat/processors/mime/config.go create mode 100644 libbeat/processors/mime/mime.go create mode 100644 libbeat/processors/mime/mime_test.go diff --git a/NOTICE.txt b/NOTICE.txt index 04001e312bb..0106cf4bc51 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -10142,11 +10142,11 @@ Contents of probable licence file $GOMODCACHE/github.com/gorhill/cronexpr@v0.0.0 -------------------------------------------------------------------------------- Dependency : github.com/h2non/filetype -Version: v1.0.12 +Version: v1.1.1-0.20201130172452-f60988ab73d5 Licence type (autodetected): MIT -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.0.12/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/h2non/filetype@v1.1.1-0.20201130172452-f60988ab73d5/LICENSE: The MIT License diff --git a/go.mod b/go.mod index 83c39ae0a20..f29e3b16382 100644 --- a/go.mod +++ b/go.mod @@ -97,7 +97,7 @@ require ( github.com/gorilla/mux v1.7.2 // indirect github.com/gorilla/websocket v1.4.1 // indirect github.com/grpc-ecosystem/grpc-gateway v1.13.0 // indirect - github.com/h2non/filetype v1.0.12 + github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 github.com/hashicorp/go-multierror v1.1.0 github.com/hashicorp/go-retryablehttp v0.6.6 github.com/hashicorp/golang-lru v0.5.2-0.20190520140433-59383c442f7d // indirect 
diff --git a/go.sum b/go.sum index fb1b0b64c0d..53f6409e56a 100644 --- a/go.sum +++ b/go.sum @@ -418,6 +418,7 @@ github.com/grpc-ecosystem/grpc-gateway v1.13.0 h1:sBDQoHXrOlfPobnKw69FIKa1wg9qsL github.com/grpc-ecosystem/grpc-gateway v1.13.0/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c= github.com/h2non/filetype v1.0.12 h1:yHCsIe0y2cvbDARtJhGBTD2ecvqMSTvlIcph9En/Zao= github.com/h2non/filetype v1.0.12/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= +github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= diff --git a/libbeat/processors/mime/config.go b/libbeat/processors/mime/config.go new file mode 100644 index 00000000000..c2c0a3f68ea --- /dev/null +++ b/libbeat/processors/mime/config.go @@ -0,0 +1,42 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package mime + +type config struct { + From string `config:"from"` + To string `config:"to"` +} + +const ( + defaultFrom = "http.request.body.content" + defaultTo = "http.request.mime_type" +) + +func (c config) FromOrDefault() string { + if c.From == "" { + return defaultFrom + } + return c.From +} + +func (c config) ToOrDefault() string { + if c.To == "" { + return defaultTo + } + return c.To +} diff --git a/libbeat/processors/mime/mime.go b/libbeat/processors/mime/mime.go new file mode 100644 index 00000000000..72e387c098d --- /dev/null +++ b/libbeat/processors/mime/mime.go @@ -0,0 +1,126 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package mime + +import ( + "encoding/json" + "encoding/xml" + "net/http" + "strings" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/logp" + "github.com/elastic/beats/v7/libbeat/processors" + "github.com/h2non/filetype" + "github.com/pkg/errors" +) + +const ( + processorName = "mime" + // size for mime detection, office file + // detection requires ~8kb to detect properly + headerSize = 8192 +) + +func init() { + processors.RegisterPlugin(processorName, New) +} + +type mimeType struct { + from string + to string + log *logp.Logger +} + +// New constructs a new mime processor. +func New(cfg *common.Config) (processors.Processor, error) { + var config config + if err := cfg.Unpack(&config); err != nil { + return nil, errors.Wrapf(err, "fail to unpack the %v configuration", processorName) + } + + log := logp.NewLogger(processorName) + + return &mimeType{ + from: config.FromOrDefault(), + to: config.ToOrDefault(), + log: log, + }, nil +} + +func (p *mimeType) Run(event *beat.Event) (*beat.Event, error) { + valI, err := event.GetValue(p.from) + if err != nil { + // doesn't have the required from value to analyze + return event, nil + } + val, _ := valI.(string) + if val == "" { + // wrong type or not set + return event, nil + } + data := []byte(val) + mimeType := p.analyze(data) + if mimeType != "" { + event.Fields.DeepUpdate(common.MapStr{ + p.to: mimeType, + }) + } + return event, nil +} + +func (p *mimeType) analyze(data []byte) string { + header := data + if len(data) > headerSize { + header = data[:headerSize] + } + kind, err := filetype.Match(header) + if err == nil && kind != filetype.Unknown { + // we have a known filetype, return + return kind.MIME.Value + } + // if the above fails, try and sniff with http sniffing + netType := http.DetectContentType(header) + if netType == "application/octet-stream" { + return "" + } + // try and parse any sort of text as json or xml + if 
strings.HasPrefix(netType, "text/plain") { + if detected := p.detectEncodedText(data); detected != "" { + return detected + } + } + return netType +} + +func (p *mimeType) detectEncodedText(data []byte) string { + // figure out how to optimize this so we don't have to try and parse the whole payload + // every time + if json.Valid(data) { + return "application/json" + } + if xml.Unmarshal(data, new(interface{})) == nil { + return "text/xml" + } + return "" +} + +func (p *mimeType) String() string { + return processorName +} diff --git a/libbeat/processors/mime/mime_test.go b/libbeat/processors/mime/mime_test.go new file mode 100644 index 00000000000..c2dbfb8c214 --- /dev/null +++ b/libbeat/processors/mime/mime_test.go @@ -0,0 +1,134 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package mime + +import ( + "encoding/hex" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" +) + +func TestMimeType(t *testing.T) { + tests := []struct { + name string + expectedType string + body string + }{ + { + name: "html", + expectedType: "text/html; charset=utf-8", + body: "Test", + }, + { + name: "pe", + expectedType: "application/vnd.microsoft.portable-executable", + body: convertToData(t, "4d5a90000300000004000000ffff"), + }, + { + name: "elf", + expectedType: "application/x-executable", + body: convertToData(t, "7f454c460101010000000000000000000300030001000000f0dc01003400000080a318000000000034002000080028001e001d0001"), + }, + { + name: "macho", + expectedType: "application/x-mach-binary", + body: convertToData(t, "cffaedfe0700000103000000020000001000000058050000850020000000000019000000480000005f5f504147455a45524f"), + }, + { + name: "json", + expectedType: "application/json", + body: "{}", + }, + { + name: "xml", + expectedType: "text/xml", + body: "", + }, + { + name: "text", + expectedType: "text/plain; charset=utf-8", + body: "Hello world!", + }, + { + name: "png", + expectedType: "image/png", + body: convertToData(t, "89504e470d0a1a0a0000000d494844520000025800000258080200000031040f8b0000000467414d410000b18f0bfc610500"), + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "http.request.body.content": test.body, + }, + } + p, err := New(common.MustNewConfigFrom(map[string]interface{}{})) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + enriched, err := observed.Fields.GetValue("http.request.mime_type") + require.NoError(t, err) + require.Equal(t, test.expectedType, enriched) + }) + } +} + +func TestMimeTypeFromTo(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "foo.bar.baz": "hello world!", + }, + } + p, err := 
New(common.MustNewConfigFrom(map[string]interface{}{ + "from": "foo.bar.baz", + "to": "bar.baz.zoiks", + })) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + enriched, err := observed.Fields.GetValue("bar.baz.zoiks") + require.NoError(t, err) + require.Equal(t, "text/plain; charset=utf-8", enriched) +} + +func TestMimeTypeTestNoMatch(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "http.request.body.content": string([]byte{0, 0}), + }, + } + p, err := New(common.MustNewConfigFrom(map[string]interface{}{})) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + hasKey, _ := observed.Fields.HasKey("http.request.mime_type") + require.False(t, hasKey) +} + +func convertToData(t *testing.T, sample string) string { + t.Helper() + decoded, err := hex.DecodeString(sample) + if err != nil { + t.Fatal(err) + } + return string(decoded) +} diff --git a/packetbeat/_meta/config/processors.yml.tmpl b/packetbeat/_meta/config/processors.yml.tmpl index d2cadbe46b1..ef2122a6115 100644 --- a/packetbeat/_meta/config/processors.yml.tmpl +++ b/packetbeat/_meta/config/processors.yml.tmpl @@ -10,3 +10,9 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - mime: + from: http.request.body.content + to: http.request.mime_type + - mime: + from: http.response.body.content + to: http.response.mime_type diff --git a/packetbeat/packetbeat.yml b/packetbeat/packetbeat.yml index f7e19b268b8..833cedf0cf5 100644 --- a/packetbeat/packetbeat.yml +++ b/packetbeat/packetbeat.yml @@ -214,6 +214,12 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - mime: + from: http.request.body.content + to: http.request.mime_type + - mime: + from: http.response.body.content + to: http.response.mime_type # ================================== Logging =================================== diff --git a/x-pack/packetbeat/packetbeat.yml b/x-pack/packetbeat/packetbeat.yml 
index f7e19b268b8..833cedf0cf5 100644 --- a/x-pack/packetbeat/packetbeat.yml +++ b/x-pack/packetbeat/packetbeat.yml @@ -214,6 +214,12 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ + - mime: + from: http.request.body.content + to: http.request.mime_type + - mime: + from: http.response.body.content + to: http.response.mime_type # ================================== Logging =================================== From 01bc6a581c9e82c670a792cd4e916b6149db25f3 Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 18:04:15 -0500 Subject: [PATCH 2/8] Add mimetype detection for packetbeat --- libbeat/cmd/instance/imports_common.go | 1 + packetbeat/tests/system/config/packetbeat.yml.j2 | 11 ++++++++++- packetbeat/tests/system/test_0063_http_body.py | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libbeat/cmd/instance/imports_common.go b/libbeat/cmd/instance/imports_common.go index a2b2569d61c..2b6caf6e4d4 100644 --- a/libbeat/cmd/instance/imports_common.go +++ b/libbeat/cmd/instance/imports_common.go @@ -34,6 +34,7 @@ import ( _ "github.com/elastic/beats/v7/libbeat/processors/dns" _ "github.com/elastic/beats/v7/libbeat/processors/extract_array" _ "github.com/elastic/beats/v7/libbeat/processors/fingerprint" + _ "github.com/elastic/beats/v7/libbeat/processors/mime" _ "github.com/elastic/beats/v7/libbeat/processors/registered_domain" _ "github.com/elastic/beats/v7/libbeat/processors/translate_sid" _ "github.com/elastic/beats/v7/libbeat/processors/urldecode" diff --git a/packetbeat/tests/system/config/packetbeat.yml.j2 b/packetbeat/tests/system/config/packetbeat.yml.j2 index 7b253d8ec2c..d5f1e722e02 100644 --- a/packetbeat/tests/system/config/packetbeat.yml.j2 +++ b/packetbeat/tests/system/config/packetbeat.yml.j2 @@ -194,7 +194,16 @@ tags: [ packetbeat.shutdown_timeout: {{ shutdown_timeout|default('400ms') }} -{%- if processors %} +{%- if include_mime %} +processors: + - mime: + from: 
"http.request.body.content" + to: "http.request.mime_type" + - mime: + from: "http.response.body.content" + to: "http.response.mime_type" + +{%- elif processors %} #================================ Filters ===================================== diff --git a/packetbeat/tests/system/test_0063_http_body.py b/packetbeat/tests/system/test_0063_http_body.py index 349624e32d9..17cecb62b7d 100644 --- a/packetbeat/tests/system/test_0063_http_body.py +++ b/packetbeat/tests/system/test_0063_http_body.py @@ -43,6 +43,7 @@ def test_include_body_for_both_request_response(self): """ self.render_config_template( http_include_body_for=["x-www-form-urlencoded", "text/html"], + include_mime=True ) self.run_packetbeat(pcap="http_post.pcap", debug_selectors=["http", "httpdetailed"]) @@ -58,6 +59,8 @@ def test_include_body_for_both_request_response(self): assert len(o["http.request.body.content"]) > 0 assert len(o["http.response.body.content"]) > 0 + assert o["http.request.mime_type"] == "text/plain; charset=utf-8" + assert o["http.response.mime_type"] == "text/html; charset=utf-8" assert "request" not in o assert "response" not in o From 237fbbe7826cc61be24ad65b6d85ac1058e661b4 Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 18:12:47 -0500 Subject: [PATCH 3/8] Update changelog --- CHANGELOG.next.asciidoc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index b884fa81e7c..9702c1d0ab7 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -536,6 +536,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - Add support for ephemeral containers in kubernetes autodiscover and `add_kubernetes_metadata`. {pull}22389[22389] {pull}22439[22439] - Added support for wildcard fields and keyword fallback in beats setup commands. 
{pull}22521[22521] - Fix polling node when it is not ready and monitor by hostname {pull}22666[22666] +- Added "mime" processor for detecting mime types {pull}22940[22940] *Auditbeat* @@ -865,6 +866,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - Add support for overriding the published index on a per-protocol/flow basis. {pull}22134[22134] - Change build process for x-pack distribution {pull}21979[21979] - Tuned the internal queue size to reduce the chances of events being dropped. {pull}22650[22650] +- Add support for "http.request.mime_type" and "http.response.mime_type". {pull}22940[22940] *Functionbeat* From 9c7cf1d9263d057976eb1a42d3de1030381218be Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 19:08:47 -0500 Subject: [PATCH 4/8] Rev go.sum --- go.sum | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go.sum b/go.sum index 53f6409e56a..f2fac60f077 100644 --- a/go.sum +++ b/go.sum @@ -416,8 +416,7 @@ github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/grpc-gateway v1.13.0 h1:sBDQoHXrOlfPobnKw69FIKa1wg9qsLLvvQ/Y19WtFgI= github.com/grpc-ecosystem/grpc-gateway v1.13.0/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c= -github.com/h2non/filetype v1.0.12 h1:yHCsIe0y2cvbDARtJhGBTD2ecvqMSTvlIcph9En/Zao= -github.com/h2non/filetype v1.0.12/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= +github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5 h1:xI88renBpIJws9OfEQq4Dng10OppnY5u9bTok/GDFEI= github.com/h2non/filetype v1.1.1-0.20201130172452-f60988ab73d5/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= github.com/hashicorp/errwrap v0.0.0-20141028054710-7554cd9344ce/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= 
From c37861fd50fe3926c88bcb606a096be89293529d Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 21:45:30 -0500 Subject: [PATCH 5/8] Refactor for reusability and rename to detect_mime_type --- libbeat/cmd/instance/imports_common.go | 1 - .../{processors/mime/mime.go => mime/byte.go} | 71 ++---------------- libbeat/{processors => }/mime/mime_test.go | 48 +----------- .../mime/config.go => mime/string.go} | 26 +------ .../processors/actions/detect_mime_type.go | 74 +++++++++++++++++++ .../actions/detect_mime_type_test.go | 62 ++++++++++++++++ packetbeat/_meta/config/processors.yml.tmpl | 12 +-- packetbeat/packetbeat.yml | 12 +-- .../tests/system/config/packetbeat.yml.j2 | 12 +-- x-pack/packetbeat/packetbeat.yml | 12 +-- 10 files changed, 173 insertions(+), 157 deletions(-) rename libbeat/{processors/mime/mime.go => mime/byte.go} (55%) rename libbeat/{processors => }/mime/mime_test.go (64%) rename libbeat/{processors/mime/config.go => mime/string.go} (68%) create mode 100644 libbeat/processors/actions/detect_mime_type.go create mode 100644 libbeat/processors/actions/detect_mime_type_test.go diff --git a/libbeat/cmd/instance/imports_common.go b/libbeat/cmd/instance/imports_common.go index 2b6caf6e4d4..a2b2569d61c 100644 --- a/libbeat/cmd/instance/imports_common.go +++ b/libbeat/cmd/instance/imports_common.go @@ -34,7 +34,6 @@ import ( _ "github.com/elastic/beats/v7/libbeat/processors/dns" _ "github.com/elastic/beats/v7/libbeat/processors/extract_array" _ "github.com/elastic/beats/v7/libbeat/processors/fingerprint" - _ "github.com/elastic/beats/v7/libbeat/processors/mime" _ "github.com/elastic/beats/v7/libbeat/processors/registered_domain" _ "github.com/elastic/beats/v7/libbeat/processors/translate_sid" _ "github.com/elastic/beats/v7/libbeat/processors/urldecode" diff --git a/libbeat/processors/mime/mime.go b/libbeat/mime/byte.go similarity index 55% rename from libbeat/processors/mime/mime.go rename to libbeat/mime/byte.go index 72e387c098d..59d653e43fc 
100644 --- a/libbeat/processors/mime/mime.go +++ b/libbeat/mime/byte.go @@ -23,69 +23,18 @@ import ( "net/http" "strings" - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common" - "github.com/elastic/beats/v7/libbeat/logp" - "github.com/elastic/beats/v7/libbeat/processors" "github.com/h2non/filetype" - "github.com/pkg/errors" ) const ( - processorName = "mime" // size for mime detection, office file // detection requires ~8kb to detect properly headerSize = 8192 ) -func init() { - processors.RegisterPlugin(processorName, New) -} - -type mimeType struct { - from string - to string - log *logp.Logger -} - -// New constructs a new mime processor. -func New(cfg *common.Config) (processors.Processor, error) { - var config config - if err := cfg.Unpack(&config); err != nil { - return nil, errors.Wrapf(err, "fail to unpack the %v configuration", processorName) - } - - log := logp.NewLogger(processorName) - - return &mimeType{ - from: config.FromOrDefault(), - to: config.ToOrDefault(), - log: log, - }, nil -} - -func (p *mimeType) Run(event *beat.Event) (*beat.Event, error) { - valI, err := event.GetValue(p.from) - if err != nil { - // doesn't have the required from value to analyze - return event, nil - } - val, _ := valI.(string) - if val == "" { - // wrong type or not set - return event, nil - } - data := []byte(val) - mimeType := p.analyze(data) - if mimeType != "" { - event.Fields.DeepUpdate(common.MapStr{ - p.to: mimeType, - }) - } - return event, nil -} - -func (p *mimeType) analyze(data []byte) string { +// DetectBytes tries to detect a mime-type based off +// of a chunk of bytes passed into the function +func DetectBytes(data []byte) string { header := data if len(data) > headerSize { header = data[:headerSize] @@ -97,19 +46,19 @@ func (p *mimeType) analyze(data []byte) string { } // if the above fails, try and sniff with http sniffing netType := http.DetectContentType(header) - if netType == "application/octet-stream" { - 
return "" - } // try and parse any sort of text as json or xml if strings.HasPrefix(netType, "text/plain") { - if detected := p.detectEncodedText(data); detected != "" { + if detected := detectEncodedText(data); detected != "" { return detected } } + if netType == "application/octet-stream" { + return "" + } return netType } -func (p *mimeType) detectEncodedText(data []byte) string { +func detectEncodedText(data []byte) string { // figure out how to optimize this so we don't have to try and parse the whole payload // every time if json.Valid(data) { @@ -120,7 +69,3 @@ func (p *mimeType) detectEncodedText(data []byte) string { } return "" } - -func (p *mimeType) String() string { - return processorName -} diff --git a/libbeat/processors/mime/mime_test.go b/libbeat/mime/mime_test.go similarity index 64% rename from libbeat/processors/mime/mime_test.go rename to libbeat/mime/mime_test.go index c2dbfb8c214..e4742fb9cfc 100644 --- a/libbeat/processors/mime/mime_test.go +++ b/libbeat/mime/mime_test.go @@ -22,9 +22,6 @@ import ( "testing" "github.com/stretchr/testify/require" - - "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/common" ) func TestMimeType(t *testing.T) { @@ -76,54 +73,11 @@ func TestMimeType(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - evt := beat.Event{ - Fields: common.MapStr{ - "http.request.body.content": test.body, - }, - } - p, err := New(common.MustNewConfigFrom(map[string]interface{}{})) - require.NoError(t, err) - observed, err := p.Run(&evt) - require.NoError(t, err) - enriched, err := observed.Fields.GetValue("http.request.mime_type") - require.NoError(t, err) - require.Equal(t, test.expectedType, enriched) + require.Equal(t, test.expectedType, Detect(test.body)) }) } } -func TestMimeTypeFromTo(t *testing.T) { - evt := beat.Event{ - Fields: common.MapStr{ - "foo.bar.baz": "hello world!", - }, - } - p, err := New(common.MustNewConfigFrom(map[string]interface{}{ - "from": 
"foo.bar.baz", - "to": "bar.baz.zoiks", - })) - require.NoError(t, err) - observed, err := p.Run(&evt) - require.NoError(t, err) - enriched, err := observed.Fields.GetValue("bar.baz.zoiks") - require.NoError(t, err) - require.Equal(t, "text/plain; charset=utf-8", enriched) -} - -func TestMimeTypeTestNoMatch(t *testing.T) { - evt := beat.Event{ - Fields: common.MapStr{ - "http.request.body.content": string([]byte{0, 0}), - }, - } - p, err := New(common.MustNewConfigFrom(map[string]interface{}{})) - require.NoError(t, err) - observed, err := p.Run(&evt) - require.NoError(t, err) - hasKey, _ := observed.Fields.HasKey("http.request.mime_type") - require.False(t, hasKey) -} - func convertToData(t *testing.T, sample string) string { t.Helper() decoded, err := hex.DecodeString(sample) diff --git a/libbeat/processors/mime/config.go b/libbeat/mime/string.go similarity index 68% rename from libbeat/processors/mime/config.go rename to libbeat/mime/string.go index c2c0a3f68ea..40f231e8abc 100644 --- a/libbeat/processors/mime/config.go +++ b/libbeat/mime/string.go @@ -17,26 +17,8 @@ package mime -type config struct { - From string `config:"from"` - To string `config:"to"` -} - -const ( - defaultFrom = "http.request.body.content" - defaultTo = "http.request.mime_type" -) - -func (c config) FromOrDefault() string { - if c.From == "" { - return defaultFrom - } - return c.From -} - -func (c config) ToOrDefault() string { - if c.To == "" { - return defaultTo - } - return c.To +// Detect tries to detect a mime-type based off +// of a byte string passed into the function +func Detect(data string) string { + return DetectBytes([]byte(data)) } diff --git a/libbeat/processors/actions/detect_mime_type.go b/libbeat/processors/actions/detect_mime_type.go new file mode 100644 index 00000000000..12e66f497fc --- /dev/null +++ b/libbeat/processors/actions/detect_mime_type.go @@ -0,0 +1,74 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package actions + +import ( + "fmt" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" + "github.com/elastic/beats/v7/libbeat/mime" + "github.com/elastic/beats/v7/libbeat/processors" + "github.com/elastic/beats/v7/libbeat/processors/checks" + "github.com/pkg/errors" +) + +func init() { + processors.RegisterPlugin("detect_mime_type", + checks.ConfigChecked(NewDetectMimeType, + checks.RequireFields("field", "target"), + checks.AllowedFields("field", "target"))) +} + +type mimeTypeProcessor struct { + Field string `config:"field"` + Target string `config:"target"` +} + +// NewDetectMimeType constructs a new mime processor. 
+func NewDetectMimeType(cfg *common.Config) (processors.Processor, error) { + mimeType := &mimeTypeProcessor{} + if err := cfg.Unpack(mimeType); err != nil { + return nil, errors.Wrapf(err, "fail to unpack the detect_mime_type configuration") + } + + return mimeType, nil +} + +func (m *mimeTypeProcessor) Run(event *beat.Event) (*beat.Event, error) { + valI, err := event.GetValue(m.Field) + if err != nil { + // doesn't have the required field value to analyze + return event, nil + } + val, _ := valI.(string) + if val == "" { + // wrong type or not set + return event, nil + } + if mimeType := mime.Detect(val); mimeType != "" { + event.Fields.DeepUpdate(common.MapStr{ + m.Target: mimeType, + }) + } + return event, nil +} + +func (m *mimeTypeProcessor) String() string { + return fmt.Sprintf("detect_mime_type=%+v->%+v", m.Field, m.Target) +} diff --git a/libbeat/processors/actions/detect_mime_type_test.go b/libbeat/processors/actions/detect_mime_type_test.go new file mode 100644 index 00000000000..51de6c9062f --- /dev/null +++ b/libbeat/processors/actions/detect_mime_type_test.go @@ -0,0 +1,62 @@ +// Licensed to Elasticsearch B.V. under one or more contributor +// license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright +// ownership. Elasticsearch B.V. licenses this file to you under +// the Apache License, Version 2.0 (the "License"); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package actions + +import ( + "testing" + + "github.com/stretchr/testify/require" + + "github.com/elastic/beats/v7/libbeat/beat" + "github.com/elastic/beats/v7/libbeat/common" +) + +func TestMimeTypeFromTo(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "foo.bar.baz": "hello world!", + }, + } + p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{ + "field": "foo.bar.baz", + "target": "bar.baz.zoiks", + })) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + enriched, err := observed.Fields.GetValue("bar.baz.zoiks") + require.NoError(t, err) + require.Equal(t, "text/plain; charset=utf-8", enriched) +} + +func TestMimeTypeTestNoMatch(t *testing.T) { + evt := beat.Event{ + Fields: common.MapStr{ + "foo.bar.baz": string([]byte{0, 0}), + }, + } + p, err := NewDetectMimeType(common.MustNewConfigFrom(map[string]interface{}{ + "field": "foo.bar.baz", + "target": "bar.baz.zoiks", + })) + require.NoError(t, err) + observed, err := p.Run(&evt) + require.NoError(t, err) + hasKey, _ := observed.Fields.HasKey("bar.baz.zoiks") + require.False(t, hasKey) +} diff --git a/packetbeat/_meta/config/processors.yml.tmpl b/packetbeat/_meta/config/processors.yml.tmpl index ef2122a6115..17b1ca2b540 100644 --- a/packetbeat/_meta/config/processors.yml.tmpl +++ b/packetbeat/_meta/config/processors.yml.tmpl @@ -10,9 +10,9 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ - - mime: - from: http.request.body.content - to: http.request.mime_type - - mime: - from: http.response.body.content - to: http.response.mime_type + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type diff --git a/packetbeat/packetbeat.yml b/packetbeat/packetbeat.yml index 833cedf0cf5..102ba0fb045 100644 --- a/packetbeat/packetbeat.yml +++ b/packetbeat/packetbeat.yml @@ -214,12 +214,12 @@ 
processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ - - mime: - from: http.request.body.content - to: http.request.mime_type - - mime: - from: http.response.body.content - to: http.response.mime_type + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type # ================================== Logging =================================== diff --git a/packetbeat/tests/system/config/packetbeat.yml.j2 b/packetbeat/tests/system/config/packetbeat.yml.j2 index d5f1e722e02..cee36a769d9 100644 --- a/packetbeat/tests/system/config/packetbeat.yml.j2 +++ b/packetbeat/tests/system/config/packetbeat.yml.j2 @@ -196,12 +196,12 @@ packetbeat.shutdown_timeout: {{ shutdown_timeout|default('400ms') }} {%- if include_mime %} processors: - - mime: - from: "http.request.body.content" - to: "http.request.mime_type" - - mime: - from: "http.response.body.content" - to: "http.response.mime_type" + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type {%- elif processors %} diff --git a/x-pack/packetbeat/packetbeat.yml b/x-pack/packetbeat/packetbeat.yml index 833cedf0cf5..102ba0fb045 100644 --- a/x-pack/packetbeat/packetbeat.yml +++ b/x-pack/packetbeat/packetbeat.yml @@ -214,12 +214,12 @@ processors: - add_host_metadata: ~ - add_cloud_metadata: ~ - add_docker_metadata: ~ - - mime: - from: http.request.body.content - to: http.request.mime_type - - mime: - from: http.response.body.content - to: http.response.mime_type + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type + - detect_mime_type: + field: http.response.body.content + target: http.response.mime_type # ================================== Logging =================================== From 
56efaadd3eb926fedf7fec8c4c58c5b84a71e4f5 Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 21:51:51 -0500 Subject: [PATCH 6/8] reformat imports --- libbeat/processors/actions/detect_mime_type.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libbeat/processors/actions/detect_mime_type.go b/libbeat/processors/actions/detect_mime_type.go index 12e66f497fc..f53794bc1ff 100644 --- a/libbeat/processors/actions/detect_mime_type.go +++ b/libbeat/processors/actions/detect_mime_type.go @@ -20,12 +20,13 @@ package actions import ( "fmt" + "github.com/pkg/errors" + "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/common" "github.com/elastic/beats/v7/libbeat/mime" "github.com/elastic/beats/v7/libbeat/processors" "github.com/elastic/beats/v7/libbeat/processors/checks" - "github.com/pkg/errors" ) func init() { From 56f4abc7360c36b88321d65b5a9ffdfebb2f7a87 Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Fri, 4 Dec 2020 23:20:03 -0500 Subject: [PATCH 7/8] update docs --- libbeat/docs/processors-list.asciidoc | 6 +++++ .../actions/docs/detect_mime_type.asciidoc | 23 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 libbeat/processors/actions/docs/detect_mime_type.asciidoc diff --git a/libbeat/docs/processors-list.asciidoc b/libbeat/docs/processors-list.asciidoc index 5dd95e2e3d5..89e78ca24ad 100644 --- a/libbeat/docs/processors-list.asciidoc +++ b/libbeat/docs/processors-list.asciidoc @@ -62,6 +62,9 @@ endif::[] ifndef::no_decompress_gzip_field_processor[] * <<decompress-gzip-field,`decompress_gzip_field`>> endif::[] +ifndef::no_detect_mime_type_processor[] +* <<detect-mime-type,`detect_mime_type`>> +endif::[] ifndef::no_dissect_processor[] * <<dissect,`dissect`>> endif::[] @@ -168,6 +171,9 @@ endif::[] ifndef::no_decompress_gzip_field_processor[] include::{libbeat-processors-dir}/actions/docs/decompress_gzip_field.asciidoc[] endif::[] +ifndef::no_detect_mime_type_processor[] +include::{libbeat-processors-dir}/actions/docs/detect_mime_type.asciidoc[] +endif::[] 
ifndef::no_dissect_processor[] include::{libbeat-processors-dir}/dissect/docs/dissect.asciidoc[] endif::[] diff --git a/libbeat/processors/actions/docs/detect_mime_type.asciidoc b/libbeat/processors/actions/docs/detect_mime_type.asciidoc new file mode 100644 index 00000000000..c93c6f882e9 --- /dev/null +++ b/libbeat/processors/actions/docs/detect_mime_type.asciidoc @@ -0,0 +1,23 @@ +[[detect-mime-type]] +=== Detect mime type + +++++ +<titleabbrev>detect_mime_type</titleabbrev> +++++ + +The `detect_mime_type` processor attempts to detect a mime type for a field that +contains a given stream of bytes. The `field` key contains the field used as +the data source and the `target` key contains the field to populate with the detected type. + +[source,yaml] +------- +processors: + - detect_mime_type: + field: http.request.body.content + target: http.request.mime_type +------- + +In the example above: + - http.request.body.content is used as the source and http.request.mime_type is set to the detected mime type + +See <<conditions>> for a list of supported conditions. 
From 23cc8d9dd3eee69245adf2adb8c7d5a0ba10d28c Mon Sep 17 00:00:00 2001 From: Andrew Stucki Date: Mon, 7 Dec 2020 14:04:50 -0500 Subject: [PATCH 8/8] Update maxHeaderSize name and add comment on the fallback behavior --- libbeat/mime/byte.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libbeat/mime/byte.go b/libbeat/mime/byte.go index 59d653e43fc..c8be7def361 100644 --- a/libbeat/mime/byte.go +++ b/libbeat/mime/byte.go @@ -29,15 +29,15 @@ import ( const ( // size for mime detection, office file // detection requires ~8kb to detect properly - headerSize = 8192 + maxHeaderSize = 8192 ) // DetectBytes tries to detect a mime-type based off // of a chunk of bytes passed into the function func DetectBytes(data []byte) string { header := data - if len(data) > headerSize { - header = data[:headerSize] + if len(data) > maxHeaderSize { + header = data[:maxHeaderSize] } kind, err := filetype.Match(header) if err == nil && kind != filetype.Unknown { @@ -52,6 +52,11 @@ func DetectBytes(data []byte) string { return detected } } + // The fallback for http.DetectContentType is "application/octet-stream" + // meaning that if we see it, we were unable to determine the type and + // we just know we're dealing with a chunk of some sort of bytes. Rather + // than reporting the fallback, we'll just say we were unable to detect + // the type. if netType == "application/octet-stream" { return "" }