From bff14f39513d7624c04f0e8f0173099ac4d14699 Mon Sep 17 00:00:00 2001 From: tballison Date: Mon, 30 Jan 2023 11:20:06 -0500 Subject: [PATCH] TIKA-3962 - set rfc822 parser to no recurse --- .../apache/tika/parser/mail/RFC822Parser.java | 1 + .../tika/parser/mail/RFC822ParserTest.java | 14 ++++++++-- .../test-documents/testGroupWiseEml.eml | 28 +++++++++---------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java index 335a630638..3717b13b55 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java @@ -92,6 +92,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, config.isStrictParsing(), extractAllAlternatives); parser.setContentHandler(mch); parser.setContentDecoding(true); + parser.setNoRecurse(); xhtml.startDocument(); TikaInputStream tstream = TikaInputStream.get(stream); try { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index 943e7c5db9..f558a7ffe7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -558,10 +558,20 @@ public void testSimpleBodyInlined() throws Exception { @Test public void testGroupwise() throws Exception { List metadataList = getRecursiveMetadata("testGroupWiseEml.eml"); - assertEquals(2, metadataList.size()); - assertContains("ssssss", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals(3, metadataList.size()); + assertContains("test<", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertContains("test2", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(), metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals("/test.eml", + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + + assertContains("ssssss", metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(), + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + assertEquals("/Neues Textdokument.txt", + metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml index a6d2398e1c..845527164b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml @@ -35,20 +35,20 @@ Content-Type: message/rfc822 Content-Transfer-Encoding: base64 Content-Disposition: attachment; filename="test.eml" -TWltZS1WZXJzaW9uOiAxLjANClgtTWFpbGVyOiBHcm91cFdpc2UgMjAxMg0KU3ViamVjdDogdGVz -dA0KRGF0ZTogVGh1LCAyNyBKdW4gMjAxMyAxMzoyNzoxMiArMDIwMA0KTWVzc2FnZS1JRDogPDUx -Q0MzREIwMDIwMDAwMDAwMDAwMDAwM0AkJCQ+DQpGcm9tOiAiTm92ZWxsIEdyb3VwV2lzZSIgPCQk -JC4kJCQuJCQkPg0KQ29udGVudC1UeXBlOiBtdWx0aXBhcnQvYWx0ZXJuYXRpdmU7IGJvdW5kYXJ5 -PSJfX19fTFBITVhMWk1YT01STEZLU0VKQ1dfX19fIg0KDQoNCi0tX19fX0xQSE1YTFpNWE9NUkxG -S1NFSkNXX19fXw0KQ29udGVudC1UeXBlOiB0ZXh0L3BsYWluOyBjaGFyc2V0PXV0Zi04DQpDb250 -ZW50LVRyYW5zZmVyLUVuY29kaW5nOiBiYXNlNjQNCkNvbnRlbnQtRGlzcG9zaXRpb246IGlubGlu -ZQ0KDQpkR1Z6ZEE9PQ0KLS1fX19fTFBITVhMWk1YT01STEZLU0VKQ1dfX19fDQpDb250ZW50LVR5 -cGU6IHRleHQvaHRtbDsgY2hhcnNldD11dGYtOA0KQ29udGVudC1UcmFuc2Zlci1FbmNvZGluZzog -cXVvdGVkLXByaW50YWJsZQ0KDQo8SFRNTD48SEVBRD4NCjxNRVRBIGNvbnRlbnQ9M0QidGV4dC9o -dG1sOyBjaGFyc2V0PTNEdXRmLTgiIGh0dHAtZXF1aXY9M0RDb250ZW50LVR5cGU+DQo8TUVUQSBu -YW1lPTNER0VORVJBVE9SIGNvbnRlbnQ9M0QiTVNIVE1MIDguMDAuNzYwMS4xNzY5OSI+PC9IRUFE -Pg0KPEJPRFkgc3R5bGU9M0QiTUFSR0lOOiA0cHggNHB4IDFweDsgRk9OVDogMTBwdCBTZWdvZSBV -SSI+dGVzdDwvQk9EWT48L0hUTUw+DQotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19fX18tLQ0K +TWltZS1WZXJzaW9uOiAxLjAKWC1NYWlsZXI6IEdyb3VwV2lzZSAyMDEyClN1YmplY3Q6IHRlc3Qy +CkRhdGU6IFRodSwgMjcgSnVuIDIwMTMgMTM6Mjc6MTIgKzAyMDAKTWVzc2FnZS1JRDogPDUxQ0Mz +REIwMDIwMDAwMDAwMDAwMDAwM0AkJCQ+CkZyb206ICJOb3ZlbGwgR3JvdXBXaXNlIiA8JCQkLiQk +JC4kJCQ+CkNvbnRlbnQtVHlwZTogbXVsdGlwYXJ0L2FsdGVybmF0aXZlOyBib3VuZGFyeT0iX19f +X0xQSE1YTFpNWE9NUkxGS1NFSkNXX19fXyIKCgotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19f +X18KQ29udGVudC1UeXBlOiB0ZXh0L3BsYWluOyBjaGFyc2V0PXV0Zi04CkNvbnRlbnQtVHJhbnNm +ZXItRW5jb2Rpbmc6IGJhc2U2NApDb250ZW50LURpc3Bvc2l0aW9uOiBpbmxpbmUKCmRHVnpkREk9 +Ci0tX19fX0xQSE1YTFpNWE9NUkxGS1NFSkNXX19fXwpDb250ZW50LVR5cGU6IHRleHQvaHRtbDsg +Y2hhcnNldD11dGYtOApDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nOiBxdW90ZWQtcHJpbnRhYmxl +Cgo8SFRNTD48SEVBRD4KPE1FVEEgY29udGVudD0zRCJ0ZXh0L2h0bWw7IGNoYXJzZXQ9M0R1dGYt +OCIgaHR0cC1lcXVpdj0zRENvbnRlbnQtVHlwZT4KPE1FVEEgbmFtZT0zREdFTkVSQVRPUiBjb250 +ZW50PTNEIk1TSFRNTCA4LjAwLjc2MDEuMTc2OTkiPjwvSEVBRD4KPEJPRFkgc3R5bGU9M0QiTUFS +R0lOOiA0cHggNHB4IDFweDsgRk9OVDogMTBwdCBTZWdvZSBVSSI+dGVzdDI8L0JPRFk+PC9IVE1M +PgotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19fX18tLQo= --____LPHMXLZMXOMRLFKSEJCW____ Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: quoted-printable