From 20a8f47a2ff8658805f746d6845a015f14621907 Mon Sep 17 00:00:00 2001 From: Benny Bottema Date: Tue, 22 Oct 2019 09:59:23 +0200 Subject: [PATCH] #1 (bbottema/outlook-message-parser/issues/17): Restored support for UTF-8's legacy name (cp)65001 --- .../rtftohtml/impl/util/CharsetHelper.java | 10 +- .../impl/RTF2HTMLConverterClassicTest.java | 14 +- .../RTF2HTMLConverterJEditorPaneTest.java | 8 + .../RTF2HTMLConverterRFCCompliantTest.java | 8 + .../test-messages/input/unicode-test.rtf | 129 ++++ .../output/classic/unicode-test.html | 670 ++++++++++++++++++ .../output/rfcompliant/unicode-test.html | 670 ++++++++++++++++++ .../output/swing/unicode-test.html | 670 ++++++++++++++++++ 8 files changed, 2174 insertions(+), 5 deletions(-) create mode 100644 src/test/resources/test-messages/input/unicode-test.rtf create mode 100644 src/test/resources/test-messages/output/classic/unicode-test.html create mode 100644 src/test/resources/test-messages/output/rfcompliant/unicode-test.html create mode 100644 src/test/resources/test-messages/output/swing/unicode-test.html diff --git a/src/main/java/org/bbottema/rtftohtml/impl/util/CharsetHelper.java b/src/main/java/org/bbottema/rtftohtml/impl/util/CharsetHelper.java index 51b8fed..c369393 100644 --- a/src/main/java/org/bbottema/rtftohtml/impl/util/CharsetHelper.java +++ b/src/main/java/org/bbottema/rtftohtml/impl/util/CharsetHelper.java @@ -3,12 +3,18 @@ import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; +import static java.nio.charset.StandardCharsets.UTF_8; + public class CharsetHelper { private static String[] CHARSET_PREFIXES = {"", "cp", "iso-", "ibm", "x-windows-", "ms"}; - + public static final Charset WINDOWS_CHARSET = Charset.forName("CP1252"); - + public static Charset findCharset(String rtfCodePage) { + return rtfCodePage.equals("65001") || rtfCodePage.equalsIgnoreCase("cp65001") ? UTF_8 : detectCharset(rtfCodePage); + } + + private static Charset detectCharset(String rtfCodePage) { for (String prefix : CHARSET_PREFIXES) { try { return Charset.forName(prefix + rtfCodePage); diff --git a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterClassicTest.java b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterClassicTest.java index d8d715b..1c9e3fe 100644 --- a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterClassicTest.java +++ b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterClassicTest.java @@ -23,12 +23,20 @@ public void testComplexRtfConversion() { assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); } - + @Test - public void testChineseRtfConversion() { + public void testChineseRtfConversion() { String html = RTF2HTMLConverterClassic.INSTANCE.rtf2html(classpathFileToString("test-messages/input/chinese-exotic-test.rtf")); String expectedHtml = classpathFileToString("test-messages/output/classic/chinese-exotic-test.html"); - + + assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); + } + + @Test + public void testUnicodeRtfConversion() { + String html = RTF2HTMLConverterClassic.INSTANCE.rtf2html(classpathFileToString("test-messages/input/unicode-test.rtf")); + String expectedHtml = classpathFileToString("test-messages/output/classic/unicode-test.html"); + assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); } } \ No newline at end of file diff --git a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterJEditorPaneTest.java b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterJEditorPaneTest.java index a0e1a63..3c9ef07 100644 --- a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterJEditorPaneTest.java +++ b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterJEditorPaneTest.java @@ -31,4 +31,12 @@ public void testChineseRtfConversion() { assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); } + + @Test + public void testUnicodeRtfConversion() { + String html = RTF2HTMLConverterClassic.INSTANCE.rtf2html(classpathFileToString("test-messages/input/unicode-test.rtf")); + String expectedHtml = classpathFileToString("test-messages/output/swing/unicode-test.html"); + + assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); + } } \ No newline at end of file diff --git a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterRFCCompliantTest.java b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterRFCCompliantTest.java index db6f448..2f496f1 100644 --- a/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterRFCCompliantTest.java +++ b/src/test/java/org/bbottema/rtftohtml/impl/RTF2HTMLConverterRFCCompliantTest.java @@ -31,4 +31,12 @@ public void testChineseRtfConversion() { assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); } + + @Test + public void testUnicodeRtfConversion() { + String html = RTF2HTMLConverterClassic.INSTANCE.rtf2html(classpathFileToString("test-messages/input/unicode-test.rtf")); + String expectedHtml = classpathFileToString("test-messages/output/rfcompliant/unicode-test.html"); + + assertThat(normalizeText(html)).isEqualTo(normalizeText(expectedHtml)); + } } \ No newline at end of file diff --git a/src/test/resources/test-messages/input/unicode-test.rtf b/src/test/resources/test-messages/input/unicode-test.rtf new file mode 100644 index 0000000..0ce0ed7 --- /dev/null +++ b/src/test/resources/test-messages/input/unicode-test.rtf @@ -0,0 +1,129 @@ +{\rtf1\ansi\ansicpg65001\fromhtml1 \fbidis \deff0{\fonttbl + {\f0\fswiss\fcharset1 Arial;} + {\f1\fmodern Courier New;} + {\f2\fnil\fcharset2 Symbol;} + {\f3\fmodern\fcharset0 Courier New;}} + {\colortbl\red0\green0\blue0;\red5\green99\blue193;} + \uc1\pard\plain\deftab360 \f0\fs24 + {\*\htmltag19 } + {\*\htmltag34 } + {\*\htmltag161 } + {\*\htmltag161 } + {\*\htmltag161 } + {\*\htmltag161 } + {\*\htmltag241 } + {\*\htmltag161 } + {\*\htmltag161 } + {\*\htmltag241 } + {\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag241 } +{\*\htmltag41 } +{\*\htmltag50 }\htmlrtf \lang9 \htmlrtf0 +{\*\htmltag96
}\htmlrtf {\htmlrtf0 +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'a5-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'85-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'b8-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'98-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'a6-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 -/- +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang1033 \htmlrtf0 Char-\'c3\'86-Char +{\*\htmltag244 } +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag64

}\htmlrtf {\htmlrtf0 +{\*\htmltag148 }\htmlrtf {\lang9 \htmlrtf0 +{\*\htmltag244 } +{\*\htmltag84  }\htmlrtf {\f3\'a0} \htmlrtf0 +{\*\htmltag252 } +{\*\htmltag156 }\htmlrtf }\htmlrtf0 \htmlrtf\par}\htmlrtf0 + +{\*\htmltag72

} +{\*\htmltag104
}\htmlrtf }\htmlrtf0 +{\*\htmltag58 } +{\*\htmltag27 }}'> \ No newline at end of file diff --git a/src/test/resources/test-messages/output/classic/unicode-test.html b/src/test/resources/test-messages/output/classic/unicode-test.html new file mode 100644 index 0000000..52f86ed --- /dev/null +++ b/src/test/resources/test-messages/output/classic/unicode-test.html @@ -0,0 +1,670 @@ + + + + + + + + + + + + + + + + + +
+

+ -/- + + + + + +

+

+ Char-å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-æ-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Æ-Char + + + + + +

+

+ + +  + + + + +

+
+ + \ No newline at end of file diff --git a/src/test/resources/test-messages/output/rfcompliant/unicode-test.html b/src/test/resources/test-messages/output/rfcompliant/unicode-test.html new file mode 100644 index 0000000..52f86ed --- /dev/null +++ b/src/test/resources/test-messages/output/rfcompliant/unicode-test.html @@ -0,0 +1,670 @@ + + + + + + + + + + + + + + + + + +
+

+ -/- + + + + + +

+

+ Char-å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-æ-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Æ-Char + + + + + +

+

+ + +  + + + + +

+
+ + \ No newline at end of file diff --git a/src/test/resources/test-messages/output/swing/unicode-test.html b/src/test/resources/test-messages/output/swing/unicode-test.html new file mode 100644 index 0000000..52f86ed --- /dev/null +++ b/src/test/resources/test-messages/output/swing/unicode-test.html @@ -0,0 +1,670 @@ + + + + + + + + + + + + + + + + + +
+

+ -/- + + + + + +

+

+ Char-å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Å-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Ø-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-æ-Char + + + + + +

+

+ -/- + + + + + +

+

+ Char-Æ-Char + + + + + +

+

+ + +  + + + + +

+
+ + \ No newline at end of file