Skip to content

Commit

Permalink
OCTO-10961-honor-offset-commands
Browse files Browse the repository at this point in the history
  • Loading branch information
OlteanuRares committed Mar 27, 2024
1 parent c7d1cc1 commit 19fbf72
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 88 deletions.
1 change: 0 additions & 1 deletion pycaption/scc/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from itertools import product
from collections import defaultdict

COMMANDS = {
'9420': '',
Expand Down
68 changes: 44 additions & 24 deletions pycaption/scc/specialized_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
MICROSECONDS_PER_CODEWORD, INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION
)
from .translator import not_italics_commands

PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")

Expand Down Expand Up @@ -254,7 +255,7 @@ def create_and_store(self, node_buffer, start, end=0):
layout_info = _get_layout_from_tuple(instruction.position)
caption.nodes.append(
CaptionNode.create_text(
instruction.get_text(), layout_info=layout_info),
instruction.text, layout_info=layout_info),
)
caption.layout_info = layout_info

Expand Down Expand Up @@ -312,40 +313,59 @@ def add_chars(self, *chars):
node = _InstructionNode(position=current_position)
self._collection.append(node)

# handle a simple line break
# if a tab offset command was received, prepend that many spaces
if self._position_tracer._spaces_to_add:
node = _InstructionNode.create_text(
current_position,
" " * self._position_tracer._spaces_to_add
)
self._collection.append(node)
self._position_tracer._spaces_to_add = 0
node.add_chars(*chars)

def interpret_command(self, command):
"""Creates instruction node for the command
:type command: str
"""
self._update_positioning(command)
text = COMMANDS.get(command, '')
current_position = self._position_tracer.get_current_position()

# if a command that sets the text style to something other than italics
# arrives while an italics tag is still open, close that tag to reset the
# text style; do this first so the tag is closed on the current line

has_open_italics_tag = False
for node in self._collection[::-1]:
if node.is_italics_node():
if node.sets_italics_on():
has_open_italics_tag = True
break
if command in not_italics_commands and has_open_italics_tag:
self._collection.append(
_InstructionNode.create_italics_style(
self._position_tracer.get_current_position(),
turn_on=False
)
)

# add break instruction node
if self._position_tracer.is_linebreak_required():
# must insert a line break here
self._collection.append(_InstructionNode.create_break(
position=current_position))
node = _InstructionNode.create_text(current_position)
self._collection.append(node)
self._position_tracer.acknowledge_linebreak_consumed()

# handle completely new positioning
elif self._position_tracer.is_repositioning_required():
# add repositioning instruction node
if self._position_tracer.is_repositioning_required():
self._collection.append(
_InstructionNode.create_repositioning_command(
current_position
)
)
node = _InstructionNode.create_text(current_position)
self._collection.append(node)
self._position_tracer.acknowledge_position_changed()

node.add_chars(*chars)

def interpret_command(self, command):
"""Given a command determines whether to turn italics on or off,
or to set the positioning
This is mostly used to convert from the legacy-style commands
:type command: str
"""
self._update_positioning(command)

text = COMMANDS.get(command, '')

# add italic open/closed italics nodes
if 'italic' in text:
if 'end' not in text:
self._collection.append(
Expand All @@ -370,6 +390,7 @@ def _update_positioning(self, command):
prev_positioning = self._position_tracer.default
positioning = (prev_positioning[0],
prev_positioning[1] + tab_offset)
self._position_tracer._spaces_to_add = tab_offset
else:
first, second = command[:2], command[2:]

Expand Down Expand Up @@ -498,7 +519,6 @@ def add_chars(self, *args):
"""
if self.text is None:
self.text = ''

self.text += ''.join(args)

def is_text_node(self):
Expand Down
18 changes: 4 additions & 14 deletions pycaption/scc/state_machines.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self, positioning=None):
# this attribute is used to store it and determine by comparison if the
# next positioning is actually a Tab Offset
self._last_column = None
self._spaces_to_add = 0

def update_positioning(self, positioning):
"""Being notified of a position change, updates the internal state,
Expand All @@ -35,26 +36,15 @@ def update_positioning(self, positioning):
return

row, col = current
if self._break_required:
col = self._last_column
new_row, new_col = positioning
is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3

# One line below will be treated as line break, not repositioning
if new_row == row + 1:
self._positions.append((new_row, col))
self._positions.append((new_row, new_col))
self._break_required = True
self._last_column = new_col
# Tab offsets after line breaks will be ignored to avoid repositioning
elif self._break_required and is_tab_offset:
return
self._spaces_to_add = new_col
else:
# Reset the "current" position altogether.
self._positions = [positioning]
# Tab offsets are not interpreted as repositioning, but adjustments
# to the previous PAC command
if not is_tab_offset:
self._repositioning_required = True
self._repositioning_required = True

def get_current_position(self):
"""Returns the current usable position
Expand Down
5 changes: 5 additions & 0 deletions pycaption/scc/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,11 @@
"94a2": "Alarm On"
}

# Subset of COMMAND_LABELS whose label does not contain the substring
# "italic" (case-sensitive match). Presumably every label is a str --
# TODO confirm against the COMMAND_LABELS definition.
not_italics_commands = {
    code: label
    for code, label in COMMAND_LABELS.items()
    if "italic" not in label
}


def translate_scc(scc_content, brackets='[]'):
"""
Expand Down
20 changes: 10 additions & 10 deletions tests/fixtures/dfxp.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,8 +939,8 @@ def sample_dfxp_from_scc_output():
</p>
<p begin="00:00:03.136" end="00:00:09.709" region="r3" style="default">
ghgh<br/>
ijij<br/>
klkl
ijij<br/>
klkl
</p>
<p begin="00:00:09.709" end="00:00:11.711" region="r4" style="default">
mnmn
Expand All @@ -953,8 +953,8 @@ def sample_dfxp_from_scc_output():
</p>
<p begin="00:00:11.711" end="00:00:20.086" region="r7" style="default">
stst<br/>
uvuv<br/>
wxwx
uvuv<br/>
wxwx
</p>
<p begin="00:00:20.086" end="00:00:22.088" region="r8" style="default">
yzyz
Expand All @@ -967,8 +967,8 @@ def sample_dfxp_from_scc_output():
</p>
<p begin="00:00:22.088" end="00:00:26.088" region="r11" style="default">
4545<br/>
6767<br/>
8989
6767<br/>
8989
</p>
</div>
</body>
Expand Down Expand Up @@ -1004,15 +1004,15 @@ def sample_dfxp_with_properly_closing_spans_output():
bbbb
</p>
<p begin="00:01:35.633" end="00:01:40.833" region="r2" style="default">
<span tts:fontStyle="italic" region="r2">cccc<br/>
bbaa</span>
<span tts:fontStyle="italic" region="r2">cccc</span><br/>
bbaa
</p>
<p begin="00:01:55.766" end="00:01:59.500" region="r0" style="default">
aa
</p>
<p begin="00:01:55.766" end="00:01:59.500" region="r3" style="default">
<span tts:fontStyle="italic" region="r3">bb<br/>
cc</span>
<span tts:fontStyle="italic" region="r3">bb</span><br/>
cc
</p>
<p begin="00:01:59.500" end="00:02:03.500" region="r3" style="default">
abcd
Expand Down
5 changes: 4 additions & 1 deletion tests/fixtures/webvtt.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,10 @@ def sample_webvtt_from_scc_properly_writes_newlines_output():
21:30.000 --> 21:34.000 align:left position:20% line:83% size:70%
aa
bb
21:30.000 --> 21:34.000 align:left position:35% line:89% size:55%
bb
"""


Expand Down
86 changes: 49 additions & 37 deletions tests/test_scc.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,14 +94,29 @@ def test_tab_offset(self, sample_scc_tab_offset):
captions = SCCReader().read(sample_scc_tab_offset)

# SCC generates only origin, and we always expect it.
# expected_positioning = [
# ((37.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
# ((17.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
# ((12.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
# ((27.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
# ((30.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
# ((35.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
# ((17.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT))
# ]

expected_positioning = [
((37.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
((12.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((17.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((12.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
((32.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((27.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
((15.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((30.0, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((35.0, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
((17.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT))
((22.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT)),
((17.5, UnitEnum.PERCENT), (83.0, UnitEnum.PERCENT)),
((22.5, UnitEnum.PERCENT), (89.0, UnitEnum.PERCENT))
]

actual_positioning = [
Expand Down Expand Up @@ -135,7 +150,8 @@ def switches_italics(node):
# will most likely change
assert switches_italics(nodes[0]) is True
assert switches_italics(nodes[2]) is False
assert nodes[1].content == 'abababab'
# we're respecting offsets and we have a 97a2 = 2 spaces
assert nodes[1].content == ' abababab'

def test_default_positioning_when_no_positioning_is_specified(
self, sample_no_positioning_at_all_scc):
Expand Down Expand Up @@ -200,17 +216,20 @@ def test_skip_extended_characters_ascii_duplicate(
caption_set = SCCReader().read(sample_scc_with_extended_characters)
captions = caption_set.get_captions('en-US')
assert captions[0].nodes[0].content == 'MÄRTHA:'
expected_result = ['JUNIOR: ¡Yum!', None, 'Ya me siento mucho mejor.']
# 97a1 means one space before Junior
# 9723 means three spaces before Ya me ...
expected_result = [' JUNIOR: ¡Yum!', None, ' Ya me siento mucho mejor.']
content = [node.content for node in captions[1].nodes]
assert all(result in expected_result for result in content)

def test_skip_duplicate_tab_offset(self, sample_scc_duplicate_tab_offset):
# 97a1 space before [Radio
expected_lines = [
'[Radio reporter]',
'The I-10 Santa Monica Freeway',
'westbound is jammed,',
'due to a three-car accident',
'blocking lanes 1 and 2',
' [Radio reporter]',
' The I-10 Santa Monica Freeway',
' westbound is jammed,',
' due to a three-car accident',
' blocking lanes 1 and 2'
]

caption_set = SCCReader().read(sample_scc_duplicate_tab_offset)
Expand All @@ -220,7 +239,6 @@ def test_skip_duplicate_tab_offset(self, sample_scc_duplicate_tab_offset):
for node in cap_.nodes
if node.type_ == CaptionNode.TEXT
]

assert expected_lines == actual_lines

def test_skip_duplicate_special_characters(
Expand Down Expand Up @@ -248,9 +266,10 @@ def test_line_too_long(self, sample_scc_with_line_too_long):
with pytest.raises(CaptionLineLengthError) as exc_info:
SCCReader().read(sample_scc_with_line_too_long)

# 9723 adds 3 spaces before "was Cal...", so the length will be 81 + 3 = 84
assert exc_info.value.args[0].startswith(
"32 character limit for caption cue in scc file.")
assert ("was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l Denison, a friend - Length 81"
assert (" was Cal l l l l l l l l l l l l l l l l l l l l l l l l l l l l Denison, a friend - Length 84"
in exc_info.value.args[0].split("\n"))


Expand Down Expand Up @@ -294,21 +313,20 @@ def test_multiple_formats(self, sample_scc_multiple_formats):
# Test for captions that contain both pop on and paint on formats to
# ensure the paint on lines are not repeated
expected_text_lines = [
"(Client's Voice)",
'Remember that degree',
'you got in taxation?',
'(Danny)',
"Of course you don't",
"because you didn't!",
"Your job isn't doing hard",
'work...',
"...it's making them do hard",
'work...',
'...and getting paid for it.',
'(VO)',
'Snap and sort your expenses to',
'save over $4,600 at tax time.',
'QUICKBOOKS. BACKING YOU.',
" (Client's Voice)",
' Remember that degree',
' you got in taxation?',
' (Danny)',
" Of course you don't",
" because you didn't!",
" Your job isn't doing hard",
' work...',
" ...it's making them do hard",
' work...',
' ...and getting paid for it.',
' (VO)', ' Snap and sort your expenses to',
' save over $4,600 at tax time.',
'QUICKBOOKS. BACKING YOU.'
]

captions = SCCReader().read(sample_scc_multiple_formats) \
Expand All @@ -319,7 +337,6 @@ def test_multiple_formats(self, sample_scc_multiple_formats):
for node in caption.nodes
if node.type_ == CaptionNode.TEXT
]

assert expected_text_lines == text_lines

def test_freeze_semicolon_spec_time(self, sample_scc_roll_up_ru2):
Expand Down Expand Up @@ -407,7 +424,7 @@ def test_italics_commands_are_formatted_properly(self):
node_creator.add_chars('c')
node_creator.interpret_command('91ae') # italics ON

node_creator.interpret_command('9270') # row 4 col 0
node_creator.interpret_command('9270') # row 4 col 0 resets italics
node_creator.add_chars('d')

node_creator.interpret_command('15d0') # row 5 col 0 - creates BR
Expand All @@ -434,17 +451,12 @@ def test_italics_commands_are_formatted_properly(self):
assert result[8].is_text_node()

assert result[9].requires_repositioning()
assert result[10].is_italics_node()
assert result[10].sets_italics_on()

assert result[11].is_text_node()
assert result[12].is_explicit_break()
assert result[13].is_text_node()
assert result[14].is_explicit_break()
assert result[15].is_text_node()
assert result[10].is_text_node()

assert result[16].is_italics_node()
assert result[16].sets_italics_off()
assert result[11].is_explicit_break()
assert result[12].is_text_node()
assert result[13].is_explicit_break()
assert result[14].is_text_node()


class CaptionDummy:
Expand Down
Loading

0 comments on commit 19fbf72

Please sign in to comment.