Skip to content

Commit

Permalink
created new script for converting bilingual captions to monolingual c…
Browse files Browse the repository at this point in the history
…aption (#399)
  • Loading branch information
tyisme614 authored Dec 8, 2022
1 parent 46186ee commit b983b66
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 1 deletion.
15 changes: 14 additions & 1 deletion subtitles/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,17 @@ Some languages like Simplified Chinese have a different YouTube language code (`
python utils/generate_subtitles.py --language zh-CN --youtube_language_code zh-Hans
```

Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files.
Once you have the `.srt` files you can manually fix any translation errors and then open a pull request with the new files.

# How to convert bilingual subtitle to monolingual subtitle

# Logic

The English caption line is conventionally placed on the last line of each subtitle block in srt files, so removing the last line of each subtitle block turns a bilingual subtitle into a monolingual one.

# Usage
> python3 convert_bilingual_monolingual.py -i \<input_file\> -o \<output_file\>
**Example**
*For instance, if the input file is named "test.cn.en.srt" and you want the output file to be named "output_test.cn.srt":*
> python3 convert_bilingual_monolingual.py -i test.cn.en.srt -o output_test.cn.srt
61 changes: 61 additions & 0 deletions utils/convert_bilingual_monolingual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/python3
import getopt
import re
import sys

# Matches an SRT cue timing line such as "00:00:01,000 --> 00:00:04,000".
# The pattern is anchored at the start of the line only; anything following
# the second timestamp is not checked.
PATTERN_TIMESTAMP = re.compile('^[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9] --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9][0-9][0-9]')
# Matches a run of one or more decimal digits.
# NOTE(review): not referenced by main() or process() in this script.
PATTERN_NUM = re.compile('\\d+')


def main(argv):
    """Parse the command-line options and run the conversion.

    Recognised options are ``-h`` (print usage), ``-i``/``--ifile`` (input
    srt file) and ``-o``/``--ofile`` (output srt file).

    Args:
        argv: Command-line arguments without the program name,
            e.g. ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 0 after printing the usage for ``-h``, and
            with status 2 when an option is invalid or when the input or
            output file is not specified.
    """
    usage = 'Usage: convert_bilingual_monolingual.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''
    try:
        opts, _args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        # Report the usage under this script's real name.
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            # Asking for help is not an error, so exit successfully.
            sys.exit(0)
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Both files are required; signal failure to the calling shell when
    # either is missing instead of returning with a success status.
    if not inputfile:
        print('no input file is specified.\nUsage: convert_bilingual_monolingual.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    if not outputfile:
        print('no output file is specified.\nUsage: convert_bilingual_monolingual.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    process(inputfile, outputfile)


def process(input_file, output):
    """Convert a bilingual srt caption file into a monolingual one.

    Each subtitle block is expected to consist of an index line, a timing
    line, and one or more text lines, with blocks separated by blank lines.
    The index line, the timing line and the first text line after the timing
    line are copied to the output; any further text lines in the block are
    dropped. Blank separator lines are copied unchanged.

    Both files are handled as UTF-8. The output file is overwritten if it
    already exists.

    Args:
        input_file: Path of the bilingual srt file to read.
        output: Path of the monolingual srt file to write.
    """
    # Read everything before opening the output so that using the same path
    # for input and output does not truncate the input before it is read.
    with open(input_file, encoding='utf-8') as source:
        lines = source.readlines()

    # Number of lines already seen in the current block:
    # 0 = expecting the index line, 1 = index seen, 2 = timing line seen.
    line_count = 0
    # 'w' (not 'a') so that rerunning the conversion replaces earlier output
    # instead of appending a second copy to it.
    with open(output, 'w', encoding='utf-8') as target:
        for line in lines:
            if line == '\n':
                # A blank line always ends the current block. Checking it
                # first keeps consecutive blank lines from being mistaken
                # for the next block's index line.
                line_count = 0
                target.write(line)
            elif line_count == 0:
                line_count += 1
                target.write(line)
            elif PATTERN_TIMESTAMP.match(line):
                line_count += 1
                target.write(line)
            else:
                # Only the text line directly after the timing line is kept.
                if line_count == 2:
                    target.write(line)
                line_count += 1
    print('conversion completed!')


# Script entry point: forward the command-line arguments, minus the program
# name, to main() when the file is executed directly.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)

0 comments on commit b983b66

Please sign in to comment.