Fix concurrency problem in Whisper integration

The SpeechToTextService interface ensures that a unique file is passed to transcription engines, which they can safely use to store output files. Unfortunately, most Whisper implementations work slightly differently, and you cannot specify a single output file. That is probably why the Whisper engine throws the safe file name overboard and just uses the parent directory to store files. This has the side effect that if `mediapackage-a/presenter.mp4` and `mediapackage-b/presenter.mp4` are processed concurrently, both write to the same file and an incorrect output file will end up being attached to one of the media packages. This patch fixes that problem by creating a unique directory instead and passing that to the engines to operate in. This also fixes the problem that most Whisper implementations create many more output files which would never have been deleted before, slowly filling up the workspace.
lkiesow · Jan 7, 2024 · d0a7677 · d0a7677
1 parent cb63d82
commit d0a7677
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 56 deletions.
diff --git a/...ch-to-text-api/src/main/java/org/opencastproject/speechtotext/api/SpeechToTextEngine.java b/...ch-to-text-api/src/main/java/org/opencastproject/speechtotext/api/SpeechToTextEngine.java
@@ -22,11 +22,28 @@
 package org.opencastproject.speechtotext.api;
 
 import java.io.File;
-import java.util.Map;
 
 /** Interface for speech-to-text implementations. */
 public interface SpeechToTextEngine {
 
+  class Result {
+    private final String language;
+    private final File subtitleFile;
+
+    public Result(String language, File subtitleFile) {
+      this.language = language;
+      this.subtitleFile = subtitleFile;
+    }
+
+    public String getLanguage() {
+      return language;
+    }
+
+    public File getSubtitleFile() {
+      return subtitleFile;
+    }
+  }
+
   /**
    * Returns the name of the implemented engine.
    *
@@ -38,13 +55,13 @@ public interface SpeechToTextEngine {
    * Generates the subtitles file.
    *
    * @param mediaFile          The media package containing the audio track.
-   * @param preparedOutputFile The prepared output file where the subtitle's data should be saved.
+   * @param workingDirectory   A unique working directory to safely operate in.
    * @param language           The language of the audio track.
    * @param translate          If the subtitles should be translated into english
-   * @return HashMap which contains the language code and the subtitles file path.
+   * @return Result containing the language code and the subtitles file path.
    * @throws SpeechToTextEngineException Thrown when an error occurs at the process.
    */
-  Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOutputFile, String language,
+  Result generateSubtitlesFile(File mediaFile, File workingDirectory, String language,
           Boolean translate) throws SpeechToTextEngineException;
 
 }
diff --git a/...ext-impl/src/main/java/org/opencastproject/speechtotext/impl/SpeechToTextServiceImpl.java b/...ext-impl/src/main/java/org/opencastproject/speechtotext/impl/SpeechToTextServiceImpl.java
@@ -45,13 +45,12 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
 import java.io.FileInputStream;
 import java.net.URI;
+import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.Dictionary;
 import java.util.List;
-import java.util.Map;
 
 /** Creates a subtitles file for a video. */
 @Component(
@@ -146,34 +145,26 @@ protected String process(Job job) throws Exception {
       translate = false;
     }
     URI subtitleFilesURI;
-    File subtitlesFile = null;
-    String vttFileName = String.format("%s%d_%s.%s", TMP_PREFIX,
-            job.getId(), FilenameUtils.getBaseName(mediaFile.getPath()), "vtt");
+    var name = String.format("job-%d", job.getId());
+    var jobDir  = Path.of(workspace.rootDirectory(), "collection", COLLECTION, name).toFile();
 
     try {
       // prepare the output file
-      subtitlesFile = new File(String.format("%s/collection/%s/%s",
-              workspace.rootDirectory(), COLLECTION, vttFileName));
-      subtitlesFile.deleteOnExit();
-      FileUtils.forceMkdirParent(subtitlesFile);
-      Map<String,Object> subOutput = speechToTextEngine.generateSubtitlesFile(
-              workspace.get(mediaFile), subtitlesFile, language, translate);
-
-      subtitlesFile = (File) subOutput.get("subFile");
-      language = (String) subOutput.get("language");
+      jobDir.mkdirs();
+      SpeechToTextEngine.Result result = speechToTextEngine.generateSubtitlesFile(
+              workspace.get(mediaFile), jobDir, language, translate);
+      language = result.getLanguage();
 
       // we need to call the "putInCollection" method to get
       // a URI, that can be used in the following processes
-      try (FileInputStream subtitlesFileIS = new FileInputStream(subtitlesFile)) {
-        subtitleFilesURI = workspace.putInCollection(COLLECTION,
-                vttFileName.replaceFirst(TMP_PREFIX, ""), subtitlesFileIS);
+      final var outputName = String.format("%d-%s.vtt", job.getId(), FilenameUtils.getBaseName(mediaFile.getPath()));
+      try (FileInputStream in = new FileInputStream(result.getSubtitleFile())) {
+        subtitleFilesURI = workspace.putInCollection(COLLECTION, outputName, in);
       }
     } catch (Exception e) {
       throw new SpeechToTextServiceException("Error while generating subtitle from " + mediaFile, e);
     } finally {
-      if (subtitlesFile != null && subtitlesFile.exists()) {
-        FileUtils.deleteQuietly(subtitlesFile);
-      }
+      FileUtils.deleteQuietly(jobDir);
     }
     return subtitleFilesURI.toString() + "," + language;
   }

diff --git a/...h-to-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/VoskEngine.java b/...h-to-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/VoskEngine.java
@@ -25,6 +25,7 @@
 import org.opencastproject.speechtotext.api.SpeechToTextEngineException;
 import org.opencastproject.util.IoSupport;
 
+import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.osgi.service.component.ComponentContext;
@@ -37,9 +38,7 @@
 import java.io.File;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 /** Vosk implementation of the Speech-to-text engine interface. */
 @Component(
@@ -98,7 +97,7 @@ public void activate(ComponentContext cc) {
    * @see org.opencastproject.speechtotext.api.SpeechToTextEngine#generateSubtitlesFile(File, File, String, Boolean)
    */
   @Override
-  public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOutputFile,
+  public Result generateSubtitlesFile(File mediaFile, File workingDirectory,
       String language, Boolean translate)
           throws SpeechToTextEngineException {
 
@@ -107,11 +106,11 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOu
       language = voskLanguage;
     }
 
-
+    var output = new File(workingDirectory, FilenameUtils.getBaseName(mediaFile.getAbsolutePath()));
     final List<String> command = Arrays.asList(
             voskExecutable,
             "-i", mediaFile.getAbsolutePath(),
-            "-o", preparedOutputFile.getAbsolutePath(),
+            "-o", output.getAbsolutePath(),
             "-l", language);
     logger.info("Executing Vosk's transcription command: {}", command);
 
@@ -131,21 +130,17 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOu
         throw new SpeechToTextEngineException(
                 String.format("Vosk exited abnormally with status %d (command: %s)%s", exitCode, command, error));
       }
-      if (!preparedOutputFile.isFile()) {
+      if (!output.isFile()) {
         throw new SpeechToTextEngineException("Vosk produced no output");
       }
-      logger.info("Subtitles file generated successfully: {}", preparedOutputFile);
+      logger.info("Subtitles file generated successfully: {}", output);
     } catch (Exception e) {
       logger.debug("Transcription failed closing Vosk transcription process for: {}", mediaFile);
       throw new SpeechToTextEngineException(e);
     } finally {
       IoSupport.closeQuietly(process);
     }
-    Map<String,Object> returnValues = new HashMap<>();
-    returnValues.put("subFile",preparedOutputFile);
-    returnValues.put("language",language);
-
-    return returnValues; // List containing the output File and language parameter
+    return new Result(language, output);
   }
 
 }
diff --git a/...o-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/WhisperEngine.java b/...o-text-impl/src/main/java/org/opencastproject/speechtotext/impl/engine/WhisperEngine.java
@@ -26,6 +26,7 @@
 import org.opencastproject.util.OsgiUtil;
 import org.opencastproject.util.data.Option;
 
+import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
@@ -42,9 +43,7 @@
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -151,14 +150,14 @@ public void activate(ComponentContext cc) {
    */
 
   @Override
-  public Map<String, Object> generateSubtitlesFile(File mediaFile,
-      File preparedOutputFile, String language, Boolean translate)
+  public Result generateSubtitlesFile(File mediaFile,
+      File workingDirectory, String language, Boolean translate)
           throws SpeechToTextEngineException {
 
     String[] baseCommands = { whisperExecutable,
     mediaFile.getAbsolutePath(),
         "--model", whisperModel,
-        "--output_dir", preparedOutputFile.getParent()};
+        "--output_dir", workingDirectory.getAbsolutePath()};
 
     List<String> transcriptionCommand = new ArrayList<>(Arrays.asList(baseCommands));
 
@@ -192,7 +191,7 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
     logger.info("Executing Whisper's transcription command: {}", transcriptionCommand);
 
     Process transcriptonProcess = null;
-    String mediaFileNameWithoutExtension;
+    File output;
 
     try {
       ProcessBuilder processBuilder = new ProcessBuilder(transcriptionCommand);
@@ -216,15 +215,14 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
       }
 
       // Renaming output whisper filename to the expected output filename
-      String mediaFileName = mediaFile.getName();
-      mediaFileNameWithoutExtension = mediaFileName.lastIndexOf('.') != -1
-          ? mediaFileName.substring(0, mediaFileName.lastIndexOf('.')) : mediaFileName;
-      preparedOutputFile = new File((preparedOutputFile.getParent() + "/" + mediaFileNameWithoutExtension + ".vtt"));
+      String outputFileName = FilenameUtils.getBaseName(mediaFile.getAbsolutePath()) + ".vtt";
+      output = new File(workingDirectory, outputFileName);
+      logger.debug("Whisper output file {}", output);
 
-      if (!preparedOutputFile.isFile()) {
+      if (!output.isFile()) {
         throw new SpeechToTextEngineException("Whisper produced no output");
       }
-      logger.info("Subtitles file generated successfully: {}", preparedOutputFile);
+      logger.info("Subtitles file generated successfully: {}", output);
     } catch (Exception e) {
       logger.debug("Transcription failed closing Whisper transcription process for: {}", mediaFile);
       throw new SpeechToTextEngineException(e);
@@ -239,10 +237,10 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
 
     // Detect language if not set
     if (language.isBlank()) {
-      JSONParser jsonParser = new JSONParser();
+      var jsonFile = FilenameUtils.removeExtension(output.getAbsolutePath()) + ".json";
+      var jsonParser = new JSONParser();
       try {
-        FileReader reader = new FileReader((preparedOutputFile.getParent() + "/"
-            + mediaFileNameWithoutExtension + ".json"));
+        FileReader reader = new FileReader(jsonFile);
         Object obj = jsonParser.parse(reader);
         JSONObject jsonObject = (JSONObject) obj;
         language = (String) jsonObject.get("language");
@@ -253,11 +251,7 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
       }
     }
 
-    Map<String,Object> returnValues = new HashMap<>();
-    returnValues.put("subFile",preparedOutputFile);
-    returnValues.put("language",language);
-
-    return returnValues; // Subtitles data
+    return new Result(language, output);
   }
 
   /**