Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix Python Bigtable dataloss bug] Stop unsetting timestamps of -1 #28624

Merged
merged 9 commits into from
Sep 23, 2023
7 changes: 3 additions & 4 deletions sdks/python/apache_beam/io/gcp/bigtableio.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,11 +252,10 @@ def process(self, direct_row):
"type": b'SetCell',
"family_name": mutation.set_cell.family_name.encode('utf-8'),
"column_qualifier": mutation.set_cell.column_qualifier,
"value": mutation.set_cell.value
"value": mutation.set_cell.value,
"timestamp_micros": struct.pack(
'>q', mutation.set_cell.timestamp_micros)
}
micros = mutation.set_cell.timestamp_micros
if micros > -1:
mutation_dict['timestamp_micros'] = struct.pack('>q', micros)
elif mutation.__contains__("delete_from_column"):
mutation_dict = {
"type": b'DeleteFromColumn',
Expand Down
35 changes: 35 additions & 0 deletions sdks/python/apache_beam/io/gcp/bigtableio_it_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ def test_set_mutation(self):
row1_col2_cell = Cell(b'val1-2', 200_000_000)
row2_col1_cell = Cell(b'val2-1', 100_000_000)
row2_col2_cell = Cell(b'val2-2', 200_000_000)
# When setting this cell, we won't set a timestamp. We expect the timestamp
# to default to -1, and Bigtable will set it to system time at insertion.
row2_col1_no_timestamp = Cell(b'val2-2-notimestamp', time.time())
row2_col1_neg1_timestamp = Cell(b'val2-2-neg1-timestamp', time.time())
# rows sent to write transform
row1.set_cell(
'col_fam', b'col-1', row1_col1_cell.value, row1_col1_cell.timestamp)
Expand All @@ -232,6 +236,10 @@ def test_set_mutation(self):
'col_fam', b'col-1', row2_col1_cell.value, row2_col1_cell.timestamp)
row2.set_cell(
'col_fam', b'col-2', row2_col2_cell.value, row2_col2_cell.timestamp)
# don't set a timestamp here. it should default to -1
row2.set_cell('col_fam', b'col-no-timestamp', row2_col1_no_timestamp.value)
ahmedabu98 marked this conversation as resolved.
Show resolved Hide resolved
row2.set_cell(
'col_fam', b'col-neg1-timestamp', row2_col1_no_timestamp.value)

self.run_pipeline([row1, row2])

Expand All @@ -249,6 +257,33 @@ def test_set_mutation(self):
self.assertEqual(
row2_col2_cell, actual_row2.find_cells('col_fam', b'col-2')[0])

# check cell that doesn't have a timestamp set is handled properly:
self.assertEqual(
row2_col1_no_timestamp.value,
actual_row2.find_cells('col_fam', b'col-no-timestamp')[0].value)
# Bigtable sets timestamp as insertion time, which is later than the
# time.time() we set when creating this test case
cell_timestamp = actual_row2.find_cells('col_fam',
b'col-no-timestamp')[0].timestamp
self.assertTrue(
row2_col1_no_timestamp.timestamp < actual_row2.find_cells(
'col_fam', b'col-no-timestamp')[0].timestamp,
msg="Expected cell with unset timestamp to have ingestion time "
f"attached, but was {cell_timestamp}")
# check cell that has timestamp of `-1` is handled properly:
self.assertEqual(
row2_col1_neg1_timestamp.value,
actual_row2.find_cells('col_fam', b'col-neg1-timestamp')[0].value)
# Bigtable sets -1 timestamp as insertion time, which is later than the
# time.time() we set when creating this test case
cell_timestamp = actual_row2.find_cells('col_fam',
b'col-neg1-timestamp')[0].timestamp
self.assertTrue(
row2_col1_neg1_timestamp.timestamp < actual_row2.find_cells(
'col_fam', b'col-neg1-timestamp')[0].timestamp,
msg="Expected cell with `-1` timestamp to have ingestion time "
f"attached, but was {cell_timestamp}")

def test_delete_cells_mutation(self):
col_fam = self.table.column_family('col_fam')
col_fam.create()
Expand Down