Skip to content

Commit

Permalink
Fix concurrency problem in Whisper integration
Browse files Browse the repository at this point in the history
The SpeechToTextService interface ensures that a unique file is passed
to transcription engines which these can safely use to store output
files. Unfortunately, most Whisper implementations work slightly
differently and you cannot specify a single output file. That is probably
why the Whisper engine throws the safe file name overboard and just uses
the parent directory to store files.

This has the side-effect that if `mediapackage-a/presenter.mp4` and
`mediapackage-b/presenter.mp4` are processed concurrently, both write to
the same file and an incorrect output file will end up being attached to
one of the media packages.

This patch fixes that problem by creating a unique directory instead
and passing that to the engines to operate in.

This also fixes the problem that most Whisper implementations create
many more output files, which would have never been deleted before,
slowly filling up the workspace.
  • Loading branch information
lkiesow committed Jan 7, 2024
1 parent cb63d82 commit d0a7677
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,28 @@
package org.opencastproject.speechtotext.api;

import java.io.File;
import java.util.Map;

/** Interface for speech-to-text implementations. */
public interface SpeechToTextEngine {

/**
 * Immutable holder pairing a language code with a subtitles file, used as the
 * return value of {@code generateSubtitlesFile}.
 */
class Result {
  private final File subtitles;
  private final String languageCode;

  /**
   * Creates a new result.
   *
   * @param language The language code to report for the subtitles.
   * @param subtitleFile The subtitles file.
   */
  public Result(String language, File subtitleFile) {
    this.subtitles = subtitleFile;
    this.languageCode = language;
  }

  /**
   * Returns the subtitles file.
   *
   * @return The file handed to the constructor.
   */
  public File getSubtitleFile() {
    return subtitles;
  }

  /**
   * Returns the language code.
   *
   * @return The language handed to the constructor.
   */
  public String getLanguage() {
    return languageCode;
  }
}

/**
* Returns the name of the implemented engine.
*
Expand All @@ -38,13 +55,13 @@ public interface SpeechToTextEngine {
* Generates the subtitles file.
*
* @param mediaFile The media package containing the audio track.
* @param preparedOutputFile The prepared output file where the subtitle's data should be saved.
* @param workingDirectory A unique working directory to safely operate in.
* @param language The language of the audio track.
* @param translate If the subtitles should be translated into English
* @return HashMap which contains the language code and the subtitles file path.
* @return Result containing the language code and the subtitles file path.
* @throws SpeechToTextEngineException Thrown when an error occurs at the process.
*/
Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOutputFile, String language,
Result generateSubtitlesFile(File mediaFile, File workingDirectory, String language,
Boolean translate) throws SpeechToTextEngineException;

}
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,12 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.net.URI;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.List;
import java.util.Map;

/** Creates a subtitles file for a video. */
@Component(
Expand Down Expand Up @@ -146,34 +145,26 @@ protected String process(Job job) throws Exception {
translate = false;
}
URI subtitleFilesURI;
File subtitlesFile = null;
String vttFileName = String.format("%s%d_%s.%s", TMP_PREFIX,
job.getId(), FilenameUtils.getBaseName(mediaFile.getPath()), "vtt");
var name = String.format("job-%d", job.getId());
var jobDir = Path.of(workspace.rootDirectory(), "collection", COLLECTION, name).toFile();

try {
// prepare the output file
subtitlesFile = new File(String.format("%s/collection/%s/%s",
workspace.rootDirectory(), COLLECTION, vttFileName));
subtitlesFile.deleteOnExit();
FileUtils.forceMkdirParent(subtitlesFile);
Map<String,Object> subOutput = speechToTextEngine.generateSubtitlesFile(
workspace.get(mediaFile), subtitlesFile, language, translate);

subtitlesFile = (File) subOutput.get("subFile");
language = (String) subOutput.get("language");
jobDir.mkdirs();
SpeechToTextEngine.Result result = speechToTextEngine.generateSubtitlesFile(
workspace.get(mediaFile), jobDir, language, translate);
language = result.getLanguage();

// we need to call the "putInCollection" method to get
// a URI, that can be used in the following processes
try (FileInputStream subtitlesFileIS = new FileInputStream(subtitlesFile)) {
subtitleFilesURI = workspace.putInCollection(COLLECTION,
vttFileName.replaceFirst(TMP_PREFIX, ""), subtitlesFileIS);
final var outputName = String.format("%d-%s.vtt", job.getId(), FilenameUtils.getBaseName(mediaFile.getPath()));
try (FileInputStream in = new FileInputStream(result.getSubtitleFile())) {
subtitleFilesURI = workspace.putInCollection(COLLECTION, outputName, in);
}
} catch (Exception e) {
throw new SpeechToTextServiceException("Error while generating subtitle from " + mediaFile, e);
} finally {
if (subtitlesFile != null && subtitlesFile.exists()) {
FileUtils.deleteQuietly(subtitlesFile);
}
FileUtils.deleteQuietly(jobDir);
}
return subtitleFilesURI.toString() + "," + language;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.opencastproject.speechtotext.api.SpeechToTextEngineException;
import org.opencastproject.util.IoSupport;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.osgi.service.component.ComponentContext;
Expand All @@ -37,9 +38,7 @@
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Vosk implementation of the Speech-to-text engine interface. */
@Component(
Expand Down Expand Up @@ -98,7 +97,7 @@ public void activate(ComponentContext cc) {
* @see org.opencastproject.speechtotext.api.SpeechToTextEngine#generateSubtitlesFile(File, File, String, Boolean)
*/
@Override
public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOutputFile,
public Result generateSubtitlesFile(File mediaFile, File workingDirectory,
String language, Boolean translate)
throws SpeechToTextEngineException {

Expand All @@ -107,11 +106,11 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOu
language = voskLanguage;
}


var output = new File(workingDirectory, FilenameUtils.getBaseName(mediaFile.getAbsolutePath()));
final List<String> command = Arrays.asList(
voskExecutable,
"-i", mediaFile.getAbsolutePath(),
"-o", preparedOutputFile.getAbsolutePath(),
"-o", output.getAbsolutePath(),
"-l", language);
logger.info("Executing Vosk's transcription command: {}", command);

Expand All @@ -131,21 +130,17 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile, File preparedOu
throw new SpeechToTextEngineException(
String.format("Vosk exited abnormally with status %d (command: %s)%s", exitCode, command, error));
}
if (!preparedOutputFile.isFile()) {
if (!output.isFile()) {
throw new SpeechToTextEngineException("Vosk produced no output");
}
logger.info("Subtitles file generated successfully: {}", preparedOutputFile);
logger.info("Subtitles file generated successfully: {}", output);
} catch (Exception e) {
logger.debug("Transcription failed closing Vosk transcription process for: {}", mediaFile);
throw new SpeechToTextEngineException(e);
} finally {
IoSupport.closeQuietly(process);
}
Map<String,Object> returnValues = new HashMap<>();
returnValues.put("subFile",preparedOutputFile);
returnValues.put("language",language);

return returnValues; // List containing the output File and language parameter
return new Result(language, output);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.opencastproject.util.OsgiUtil;
import org.opencastproject.util.data.Option;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
Expand All @@ -42,9 +43,7 @@
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -151,14 +150,14 @@ public void activate(ComponentContext cc) {
*/

@Override
public Map<String, Object> generateSubtitlesFile(File mediaFile,
File preparedOutputFile, String language, Boolean translate)
public Result generateSubtitlesFile(File mediaFile,
File workingDirectory, String language, Boolean translate)
throws SpeechToTextEngineException {

String[] baseCommands = { whisperExecutable,
mediaFile.getAbsolutePath(),
"--model", whisperModel,
"--output_dir", preparedOutputFile.getParent()};
"--output_dir", workingDirectory.getAbsolutePath()};

List<String> transcriptionCommand = new ArrayList<>(Arrays.asList(baseCommands));

Expand Down Expand Up @@ -192,7 +191,7 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
logger.info("Executing Whisper's transcription command: {}", transcriptionCommand);

Process transcriptonProcess = null;
String mediaFileNameWithoutExtension;
File output;

try {
ProcessBuilder processBuilder = new ProcessBuilder(transcriptionCommand);
Expand All @@ -216,15 +215,14 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
}

// Renaming output whisper filename to the expected output filename
String mediaFileName = mediaFile.getName();
mediaFileNameWithoutExtension = mediaFileName.lastIndexOf('.') != -1
? mediaFileName.substring(0, mediaFileName.lastIndexOf('.')) : mediaFileName;
preparedOutputFile = new File((preparedOutputFile.getParent() + "/" + mediaFileNameWithoutExtension + ".vtt"));
String outputFileName = FilenameUtils.getBaseName(mediaFile.getAbsolutePath()) + ".vtt";
output = new File(workingDirectory, outputFileName);
logger.debug("Whisper output file {}", output);

if (!preparedOutputFile.isFile()) {
if (!output.isFile()) {
throw new SpeechToTextEngineException("Whisper produced no output");
}
logger.info("Subtitles file generated successfully: {}", preparedOutputFile);
logger.info("Subtitles file generated successfully: {}", output);
} catch (Exception e) {
logger.debug("Transcription failed closing Whisper transcription process for: {}", mediaFile);
throw new SpeechToTextEngineException(e);
Expand All @@ -239,10 +237,10 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,

// Detect language if not set
if (language.isBlank()) {
JSONParser jsonParser = new JSONParser();
var jsonFile = FilenameUtils.removeExtension(output.getAbsolutePath()) + ".json";
var jsonParser = new JSONParser();
try {
FileReader reader = new FileReader((preparedOutputFile.getParent() + "/"
+ mediaFileNameWithoutExtension + ".json"));
FileReader reader = new FileReader(jsonFile);
Object obj = jsonParser.parse(reader);
JSONObject jsonObject = (JSONObject) obj;
language = (String) jsonObject.get("language");
Expand All @@ -253,11 +251,7 @@ public Map<String, Object> generateSubtitlesFile(File mediaFile,
}
}

Map<String,Object> returnValues = new HashMap<>();
returnValues.put("subFile",preparedOutputFile);
returnValues.put("language",language);

return returnValues; // Subtitles data
return new Result(language, output);
}

/**
Expand Down

0 comments on commit d0a7677

Please sign in to comment.