Add UTF-16 encoded subtitle support to SsaDecoder

Issue: #319 PiperOrigin-RevId: 527891646 (cherry picked from commit 06ac2f7)
androidx · May 15, 2023 · 841bdc6 · 841bdc6
1 parent 179e35b
commit 841bdc6
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 28 deletions.
diff --git a/RELEASENOTES.md b/RELEASENOTES.md
@@ -12,6 +12,9 @@
     *   Fix parsing of H.265 SPS in MPEG-TS files by re-using the parsing logic
         already used by RTSP and MP4 extractors
         ([#303](https://github.com/androidx/media/issues/303)).
+*   Text:
+    *   SSA: Add support for UTF-16 files if they start with a byte order mark
+        ([#319](https://github.com/androidx/media/issues/319)).
 *   Session:
     *   Fix issue where `MediaController` doesn't update its available commands
         when connected to a legacy `MediaSessionCompat` that updates its

diff --git a/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java b/libraries/common/src/main/java/androidx/media3/common/util/ParsableByteArray.java
@@ -233,11 +233,28 @@ public int peekUnsignedByte() {
     return (data[position] & 0xFF);
   }
 
-  /** Peeks at the next char. */
+  /**
+   * Peeks at the next char.
+   *
+   * <p>Equivalent to passing {@link Charsets#UTF_16} or {@link Charsets#UTF_16BE} to {@link
+   * #peekChar(Charset)}.
+   */
   public char peekChar() {
     return (char) ((data[position] & 0xFF) << 8 | (data[position + 1] & 0xFF));
   }
 
+  /**
+   * Peeks at the next char (as decoded by {@code charset}), or 0 if too few bytes remain.
+   *
+   * @throws IllegalArgumentException if charset is not supported. Only US_ASCII, UTF-8, UTF-16,
+   *     UTF-16BE, and UTF-16LE are supported.
+   */
+  public char peekChar(Charset charset) {
+    Assertions.checkArgument(
+        SUPPORTED_CHARSETS_FOR_READLINE.contains(charset), "Unsupported charset: " + charset);
+    return (char) (peekCharacterAndSize(charset) >> Short.SIZE);
+  }
+
   /** Reads the next byte as an unsigned value. */
   public int readUnsignedByte() {
     return (data[position++] & 0xFF);
@@ -649,27 +666,42 @@ private void skipLineTerminator(Charset charset) {
    * UTF-8 and two bytes for UTF-16).
    */
   private char readCharacterIfInList(Charset charset, char[] chars) {
-    char character;
-    int characterSize;
+    int characterAndSize = peekCharacterAndSize(charset);
+
+    if (characterAndSize != 0 && Chars.contains(chars, (char) (characterAndSize >> Short.SIZE))) {
+      position += characterAndSize & 0xFFFF;
+      return (char) (characterAndSize >> Short.SIZE);
+    } else {
+      return 0;
+    }
+  }
+
+  /**
+   * Peeks at the character at {@link #position} (as decoded by {@code charset}) and returns it
+   * packed into an int together with the number of bytes it occupies: the upper 16 bits hold the
+   * character and the lower 16 bits hold its size in bytes. Returns 0 if {@link #bytesLeft()}
+   * doesn't allow reading a whole character in {@code charset} or if the {@code charset} is not
+   * one of US_ASCII, UTF-8, UTF-16, UTF-16BE, or UTF-16LE.
+   *
+   * <p>Only supports characters that occupy a single code unit (i.e. one byte for UTF-8 and two
+   * bytes for UTF-16).
+   */
+  private int peekCharacterAndSize(Charset charset) {
+    char character;
+    int characterSize;
     if ((charset.equals(Charsets.UTF_8) || charset.equals(Charsets.US_ASCII)) && bytesLeft() >= 1) {
       character = Chars.checkedCast(UnsignedBytes.toInt(data[position]));
       characterSize = 1;
     } else if ((charset.equals(Charsets.UTF_16) || charset.equals(Charsets.UTF_16BE))
         && bytesLeft() >= 2) {
       character = Chars.fromBytes(data[position], data[position + 1]);
       characterSize = 2;
     } else if (charset.equals(Charsets.UTF_16LE) && bytesLeft() >= 2) {
       character = Chars.fromBytes(data[position + 1], data[position]);
       characterSize = 2;
     } else {
       return 0;
     }
-
-    if (Chars.contains(chars, character)) {
-      position += characterSize;
-      return Chars.checkedCast(character);
-    } else {
-      return 0;
-    }
+    return (character << Short.SIZE) | characterSize;
   }
 }
diff --git a/libraries/extractor/src/main/java/androidx/media3/extractor/text/ssa/SsaDecoder.java b/libraries/extractor/src/main/java/androidx/media3/extractor/text/ssa/SsaDecoder.java
@@ -37,6 +37,8 @@
 import androidx.media3.extractor.text.SimpleSubtitleDecoder;
 import androidx.media3.extractor.text.Subtitle;
 import com.google.common.base.Ascii;
+import com.google.common.base.Charsets;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.LinkedHashMap;
 import java.util.List;
@@ -98,11 +100,14 @@ public SsaDecoder(@Nullable List<byte[]> initializationData) {
 
     if (initializationData != null && !initializationData.isEmpty()) {
       haveInitializationData = true;
+      // Currently, construction with initialization data is only relevant to SSA subtitles muxed
+      // in a MKV. According to https://www.matroska.org/technical/subtitles.html, these muxed
+      // subtitles are always encoded in UTF-8.
       String formatLine = Util.fromUtf8Bytes(initializationData.get(0));
       Assertions.checkArgument(formatLine.startsWith(FORMAT_LINE_PREFIX));
       dialogueFormatFromInitializationData =
           Assertions.checkNotNull(SsaDialogueFormat.fromFormatLine(formatLine));
-      parseHeader(new ParsableByteArray(initializationData.get(1)));
+      parseHeader(new ParsableByteArray(initializationData.get(1)), Charsets.UTF_8);
     } else {
       haveInitializationData = false;
       dialogueFormatFromInitializationData = null;
@@ -115,25 +120,37 @@ protected Subtitle decode(byte[] data, int length, boolean reset) {
     List<Long> cueTimesUs = new ArrayList<>();
 
     ParsableByteArray parsableData = new ParsableByteArray(data, length);
+    Charset charset = detectUtfCharset(parsableData);
+
     if (!haveInitializationData) {
-      parseHeader(parsableData);
+      parseHeader(parsableData, charset);
     }
-    parseEventBody(parsableData, cues, cueTimesUs);
+    parseEventBody(parsableData, cues, cueTimesUs, charset);
     return new SsaSubtitle(cues, cueTimesUs);
   }
 
+  /**
+   * Determines the UTF encoding of {@code data} from its byte order mark (BOM), defaulting to
+   * UTF-8 if no BOM is found.
+   */
+  private Charset detectUtfCharset(ParsableByteArray data) {
+    @Nullable Charset charset = data.readUtfCharsetFromBom();
+    return charset != null ? charset : Charsets.UTF_8;
+  }
+
   /**
    * Parses the header of the subtitle.
    *
    * @param data A {@link ParsableByteArray} from which the header should be read.
+   * @param charset The {@code Charset} of the encoding of {@code data}.
    */
-  private void parseHeader(ParsableByteArray data) {
+  private void parseHeader(ParsableByteArray data, Charset charset) {
     @Nullable String currentLine;
-    while ((currentLine = data.readLine()) != null) {
+    while ((currentLine = data.readLine(charset)) != null) {
       if ("[Script Info]".equalsIgnoreCase(currentLine)) {
-        parseScriptInfo(data);
+        parseScriptInfo(data, charset);
       } else if ("[V4+ Styles]".equalsIgnoreCase(currentLine)) {
-        styles = parseStyles(data);
+        styles = parseStyles(data, charset);
       } else if ("[V4 Styles]".equalsIgnoreCase(currentLine)) {
         Log.i(TAG, "[V4 Styles] are not supported");
       } else if ("[Events]".equalsIgnoreCase(currentLine)) {
@@ -151,11 +168,12 @@ private void parseHeader(ParsableByteArray data) {
    *
    * @param data A {@link ParsableByteArray} with {@link ParsableByteArray#getPosition() position}
    *     set to the beginning of the first line after {@code [Script Info]}.
+   * @param charset The {@code Charset} of the encoding of {@code data}.
    */
-  private void parseScriptInfo(ParsableByteArray data) {
+  private void parseScriptInfo(ParsableByteArray data, Charset charset) {
     @Nullable String currentLine;
-    while ((currentLine = data.readLine()) != null
-        && (data.bytesLeft() == 0 || data.peekUnsignedByte() != '[')) {
+    while ((currentLine = data.readLine(charset)) != null
+        && (data.bytesLeft() == 0 || data.peekChar(charset) != '[')) {
       String[] infoNameAndValue = currentLine.split(":");
       if (infoNameAndValue.length != 2) {
         continue;
@@ -187,13 +205,14 @@ private void parseScriptInfo(ParsableByteArray data) {
    *
    * @param data A {@link ParsableByteArray} with {@link ParsableByteArray#getPosition()} pointing
    *     at the beginning of the first line after {@code [V4+ Styles]}.
+   * @param charset The {@code Charset} of the encoding of {@code data}.
    */
-  private static Map<String, SsaStyle> parseStyles(ParsableByteArray data) {
+  private static Map<String, SsaStyle> parseStyles(ParsableByteArray data, Charset charset) {
     Map<String, SsaStyle> styles = new LinkedHashMap<>();
     @Nullable SsaStyle.Format formatInfo = null;
     @Nullable String currentLine;
-    while ((currentLine = data.readLine()) != null
-        && (data.bytesLeft() == 0 || data.peekUnsignedByte() != '[')) {
+    while ((currentLine = data.readLine(charset)) != null
+        && (data.bytesLeft() == 0 || data.peekChar(charset) != '[')) {
       if (currentLine.startsWith(FORMAT_LINE_PREFIX)) {
         formatInfo = SsaStyle.Format.fromFormatLine(currentLine);
       } else if (currentLine.startsWith(STYLE_LINE_PREFIX)) {
@@ -216,12 +235,14 @@ private static Map<String, SsaStyle> parseStyles(ParsableByteArray data) {
    * @param data A {@link ParsableByteArray} from which the body should be read.
    * @param cues A list to which parsed cues will be added.
    * @param cueTimesUs A sorted list to which parsed cue timestamps will be added.
+   * @param charset The {@code Charset} of the encoding of {@code data}.
    */
-  private void parseEventBody(ParsableByteArray data, List<List<Cue>> cues, List<Long> cueTimesUs) {
+  private void parseEventBody(
+      ParsableByteArray data, List<List<Cue>> cues, List<Long> cueTimesUs, Charset charset) {
     @Nullable
     SsaDialogueFormat format = haveInitializationData ? dialogueFormatFromInitializationData : null;
     @Nullable String currentLine;
-    while ((currentLine = data.readLine()) != null) {
+    while ((currentLine = data.readLine(charset)) != null) {
       if (currentLine.startsWith(FORMAT_LINE_PREFIX)) {
         format = SsaDialogueFormat.fromFormatLine(currentLine);
       } else if (currentLine.startsWith(DIALOGUE_LINE_PREFIX)) {

diff --git a/libraries/extractor/src/test/java/androidx/media3/extractor/text/ssa/SsaDecoderTest.java b/libraries/extractor/src/test/java/androidx/media3/extractor/text/ssa/SsaDecoderTest.java
@@ -30,6 +30,7 @@
 import com.google.common.collect.Iterables;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Objects;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
@@ -43,6 +44,8 @@ public final class SsaDecoderTest {
   private static final String TYPICAL_HEADER_ONLY = "media/ssa/typical_header";
   private static final String TYPICAL_DIALOGUE_ONLY = "media/ssa/typical_dialogue";
   private static final String TYPICAL_FORMAT_ONLY = "media/ssa/typical_format";
+  private static final String TYPICAL_UTF16LE = "media/ssa/typical_utf16le";
+  private static final String TYPICAL_UTF16BE = "media/ssa/typical_utf16be";
   private static final String OVERLAPPING_TIMECODES = "media/ssa/overlapping_timecodes";
   private static final String POSITIONS = "media/ssa/positioning";
   private static final String INVALID_TIMECODES = "media/ssa/invalid_timecodes";
@@ -130,6 +133,58 @@ public void decodeTypicalWithInitializationData() throws IOException {
     assertTypicalCue3(subtitle, 4);
   }
 
+  @Test
+  public void decodeTypicalUtf16le() throws IOException {
+    SsaDecoder decoder = new SsaDecoder();
+    byte[] bytes =
+        TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16LE);
+    Subtitle subtitle = decoder.decode(bytes, bytes.length, false);
+
+    assertThat(subtitle.getEventTimeCount()).isEqualTo(6);
+    // Check position, line, anchors & alignment are set from Alignment Style (2 - bottom-center).
+    Cue firstCue = subtitle.getCues(subtitle.getEventTime(0)).get(0);
+    assertWithMessage("Cue.textAlignment")
+        .that(firstCue.textAlignment)
+        .isEqualTo(Layout.Alignment.ALIGN_CENTER);
+    assertWithMessage("Cue.positionAnchor")
+        .that(firstCue.positionAnchor)
+        .isEqualTo(Cue.ANCHOR_TYPE_MIDDLE);
+    assertThat(firstCue.position).isEqualTo(0.5f);
+    assertThat(firstCue.lineAnchor).isEqualTo(Cue.ANCHOR_TYPE_END);
+    assertThat(firstCue.lineType).isEqualTo(Cue.LINE_TYPE_FRACTION);
+    assertThat(firstCue.line).isEqualTo(0.95f);
+
+    assertTypicalCue1(subtitle, 0);
+    assertTypicalCue2(subtitle, 2);
+    assertTypicalCue3(subtitle, 4);
+  }
+
+  @Test
+  public void decodeTypicalUtf16be() throws IOException {
+    SsaDecoder decoder = new SsaDecoder();
+    byte[] bytes =
+        TestUtil.getByteArray(ApplicationProvider.getApplicationContext(), TYPICAL_UTF16BE);
+    Subtitle subtitle = decoder.decode(bytes, bytes.length, false);
+
+    assertThat(subtitle.getEventTimeCount()).isEqualTo(6);
+    // Check position, line, anchors & alignment are set from Alignment Style (2 - bottom-center).
+    Cue firstCue = subtitle.getCues(subtitle.getEventTime(0)).get(0);
+    assertWithMessage("Cue.textAlignment")
+        .that(firstCue.textAlignment)
+        .isEqualTo(Layout.Alignment.ALIGN_CENTER);
+    assertWithMessage("Cue.positionAnchor")
+        .that(firstCue.positionAnchor)
+        .isEqualTo(Cue.ANCHOR_TYPE_MIDDLE);
+    assertThat(firstCue.position).isEqualTo(0.5f);
+    assertThat(firstCue.lineAnchor).isEqualTo(Cue.ANCHOR_TYPE_END);
+    assertThat(firstCue.lineType).isEqualTo(Cue.LINE_TYPE_FRACTION);
+    assertThat(firstCue.line).isEqualTo(0.95f);
+
+    assertTypicalCue1(subtitle, 0);
+    assertTypicalCue2(subtitle, 2);
+    assertTypicalCue3(subtitle, 4);
+  }
+
   @Test
   public void decodeOverlappingTimecodes() throws IOException {
     SsaDecoder decoder = new SsaDecoder();
@@ -438,6 +493,10 @@ private static void assertTypicalCue1(Subtitle subtitle, int eventIndex) {
     assertThat(subtitle.getEventTime(eventIndex)).isEqualTo(0);
     assertThat(subtitle.getCues(subtitle.getEventTime(eventIndex)).get(0).text.toString())
         .isEqualTo("This is the first subtitle.");
+    assertThat(
+            Objects.requireNonNull(
+                subtitle.getCues(subtitle.getEventTime(eventIndex)).get(0).textAlignment))
+        .isEqualTo(Layout.Alignment.ALIGN_CENTER);
     assertThat(subtitle.getEventTime(eventIndex + 1)).isEqualTo(1230000);
   }
 

diff --git a/libraries/test_data/src/test/assets/media/ssa/typical_utf16be b/libraries/test_data/src/test/assets/media/ssa/typical_utf16be
diff --git a/libraries/test_data/src/test/assets/media/ssa/typical_utf16le b/libraries/test_data/src/test/assets/media/ssa/typical_utf16le