Skip to content

Commit

Permalink
Auto repair incorrect collation on MySQL schema
Browse files Browse the repository at this point in the history
As we do more union queries in 2023.5.x, a mismatch between
collations on tables will cause those queries to fail with an error
that is hard for the user to figure out how to fix:

`Error executing query: (MySQLdb.OperationalError) (1271, "Illegal mix of collations for operation UNION")`

This was reported in the #beta channel and by PM from others,
so the problem is not isolated to a single user.

https://discord.com/channels/330944238910963714/427516175237382144/1100908739910963272
  • Loading branch information
bdraco committed Apr 29, 2023
1 parent b9f2b0a commit 48c93c1
Show file tree
Hide file tree
Showing 8 changed files with 195 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)

Expand All @@ -17,15 +18,19 @@

def validate_db_schema(instance: Recorder) -> set[str]:
    """Do some basic checks for common schema errors caused by manual migration.

    Combines three checks: utf8 support of ``EventData.shared_data``, column
    precision of ``Events``, and the collation of both ``Events`` and
    ``EventData``. Returns the union of the error strings reported by each
    check; an empty set means no problem was detected.
    """
    schema_errors = validate_table_schema_supports_utf8(
        instance, EventData, (EventData.shared_data,)
    ) | validate_db_schema_precision(instance, Events)
    # Mixed collations between tables break UNION queries on MySQL, so every
    # table handled by this module is checked.
    for table in (Events, EventData):
        schema_errors |= validate_table_schema_has_correct_collation(instance, table)
    return schema_errors


def correct_db_schema(
    instance: Recorder,
    schema_errors: set[str],
) -> None:
    """Correct issues detected by validate_db_schema.

    ``schema_errors`` is the set returned by ``validate_db_schema``; each
    correction helper looks up the error keys it is responsible for.
    """
    # The utf8 correction is run once per table: it is also the repair path for
    # collation errors, which are validated for both Events and EventData.
    for table in (Events, EventData):
        correct_db_schema_utf8(instance, table, schema_errors)
    correct_db_schema_precision(instance, Events, schema_errors)
52 changes: 51 additions & 1 deletion homeassistant/components/recorder/auto_repairs/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
from typing import TYPE_CHECKING

from sqlalchemy import MetaData
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm.attributes import InstrumentedAttribute
Expand Down Expand Up @@ -60,6 +61,52 @@ def validate_table_schema_supports_utf8(
return schema_errors


def validate_table_schema_has_correct_collation(
    instance: Recorder,
    table_object: type[DeclarativeBase],
) -> set[str]:
    """Verify the table has the correct collation.

    Returns the set of schema errors found for ``table_object``. The set is
    empty when the dialect is not MySQL, when no error was found, or when the
    check itself raised (the exception is logged, not propagated).
    """
    # Collation is only inspected for the MySQL dialect; nothing to do otherwise.
    if instance.dialect_name != SupportedDialect.MYSQL:
        return set()

    found_errors: set[str] = set()
    try:
        found_errors = _validate_table_schema_has_correct_collation(
            instance, table_object
        )
    except Exception as exc:  # pylint: disable=broad-except
        # Validation is best effort: a failure here is logged and treated as
        # "no errors found" rather than raised to the caller.
        _LOGGER.exception("Error when validating DB schema: %s", exc)

    _log_schema_errors(table_object, found_errors)
    return found_errors


def _validate_table_schema_has_correct_collation(
    instance: Recorder,
    table_object: type[DeclarativeBase],
) -> set[str]:
    """Ensure the table has the correct collation to avoid union errors with mixed collations.

    Reflects the table from the live database and inspects its MySQL / MariaDB
    collation option. Returns a set containing ``<table>.utf8mb4_unicode_ci``
    when the table reports a collation other than ``utf8mb4_unicode_ci``,
    otherwise an empty set. A table that reports no collation at all is not
    flagged.
    """
    schema_errors: set[str] = set()
    # This check only inspects the schema and never needs to write, so the
    # session is opened read_only.
    with session_scope(session=instance.get_session(), read_only=True) as session:
        table = table_object.__tablename__
        metadata_obj = MetaData()
        # Reflect only the table being validated instead of every table in the
        # database; this function is called once per table.
        metadata_obj.reflect(bind=session.connection(), only=[table])
        dialect_kwargs = metadata_obj.tables[table].dialect_kwargs
        # The option key depends on which dialect name reported the collation.
        collate = dialect_kwargs.get("mysql_collate") or dialect_kwargs.get(
            "mariadb_collate"
        )
        if collate and collate != "utf8mb4_unicode_ci":
            _LOGGER.debug(
                "Database %s collation is not utf8mb4_unicode_ci",
                table,
            )
            schema_errors.add(f"{table}.utf8mb4_unicode_ci")
    return schema_errors


def _validate_table_schema_supports_utf8(
instance: Recorder,
table_object: type[DeclarativeBase],
Expand Down Expand Up @@ -184,7 +231,10 @@ def correct_db_schema_utf8(
) -> None:
"""Correct utf8 issues detected by validate_db_schema."""
table_name = table_object.__tablename__
if f"{table_name}.4-byte UTF-8" in schema_errors:
if (
f"{table_name}.4-byte UTF-8" in schema_errors
or f"{table_name}.utf8mb4_unicode_ci" in schema_errors
):
from ..migration import ( # pylint: disable=import-outside-toplevel
_correct_table_character_set_and_collation,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)

Expand All @@ -26,6 +27,8 @@ def validate_db_schema(instance: Recorder) -> set[str]:
for table, columns in TABLE_UTF8_COLUMNS.items():
schema_errors |= validate_table_schema_supports_utf8(instance, table, columns)
schema_errors |= validate_db_schema_precision(instance, States)
for table in (States, StateAttributes):
schema_errors |= validate_table_schema_has_correct_collation(instance, table)
return schema_errors


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)

Expand All @@ -26,6 +27,7 @@ def validate_db_schema(instance: Recorder) -> set[str]:
)
for table in (Statistics, StatisticsShortTerm):
schema_errors |= validate_db_schema_precision(instance, table)
schema_errors |= validate_table_schema_has_correct_collation(instance, table)
if schema_errors:
_LOGGER.debug(
"Detected statistics schema errors: %s", ", ".join(sorted(schema_errors))
Expand All @@ -41,3 +43,4 @@ def correct_db_schema(
correct_db_schema_utf8(instance, StatisticsMeta, schema_errors)
for table in (Statistics, StatisticsShortTerm):
correct_db_schema_precision(instance, table, schema_errors)
correct_db_schema_utf8(instance, table, schema_errors)
29 changes: 29 additions & 0 deletions tests/components/recorder/auto_repairs/events/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,32 @@ async def test_validate_db_schema_fix_utf8_issue_event_data(
"Updating character set and collation of table event_data to utf8mb4"
in caplog.text
)


@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a reported collation error on the events table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    """
    # Pretend the recorder runs on MySQL and that validation found a bad
    # collation on the events table.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"events.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: events.utf8mb4_unicode_ci"
        in logged
    )
    assert "Updating character set and collation of table events to utf8mb4" in logged
29 changes: 29 additions & 0 deletions tests/components/recorder/auto_repairs/states/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,32 @@ async def test_validate_db_schema_fix_utf8_issue_state_attributes(
"Updating character set and collation of table state_attributes to utf8mb4"
in caplog.text
)


@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a reported collation error on the states table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    """
    # Pretend the recorder runs on MySQL and that validation found a bad
    # collation on the states table.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"states.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: states.utf8mb4_unicode_ci"
        in logged
    )
    assert "Updating character set and collation of table states to utf8mb4" in logged
30 changes: 30 additions & 0 deletions tests/components/recorder/auto_repairs/statistics/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,33 @@ async def test_validate_db_schema_fix_float_issue(
"sum DOUBLE PRECISION",
]
modify_columns_mock.assert_called_once_with(ANY, ANY, table, modification)


@pytest.mark.parametrize("enable_schema_validation", [True])
async def test_validate_db_schema_fix_collation_issue(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test that a reported collation error on the statistics table is repaired.

    Note: The test uses SQLite, the purpose is only to exercise the code.
    """
    # Pretend the recorder runs on MySQL and that validation found a bad
    # collation on the statistics table.
    dialect_patch = patch(
        "homeassistant.components.recorder.core.Recorder.dialect_name", "mysql"
    )
    collation_patch = patch(
        "homeassistant.components.recorder.auto_repairs.schema._validate_table_schema_has_correct_collation",
        return_value={"statistics.utf8mb4_unicode_ci"},
    )
    with dialect_patch, collation_patch:
        await async_setup_recorder_instance(hass)
        await async_wait_recording_done(hass)

    logged = caplog.text
    assert "Schema validation failed" not in logged
    assert (
        "Database is about to correct DB schema errors: statistics.utf8mb4_unicode_ci"
        in logged
    )
    assert (
        "Updating character set and collation of table statistics to utf8mb4"
        in logged
    )
43 changes: 43 additions & 0 deletions tests/components/recorder/auto_repairs/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
correct_db_schema_precision,
correct_db_schema_utf8,
validate_db_schema_precision,
validate_table_schema_has_correct_collation,
validate_table_schema_supports_utf8,
)
from homeassistant.components.recorder.db_schema import States
Expand Down Expand Up @@ -106,6 +107,48 @@ def _break_states_schema():
assert schema_errors == set()


async def test_validate_db_schema_fix_incorrect_collation(
    async_setup_recorder_instance: RecorderInstanceGenerator,
    hass: HomeAssistant,
    recorder_db_url: str,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """Test validating DB schema with MySQL when the collation is incorrect.

    Breaks the collation of the states table, checks that validation reports
    it, repairs it, and checks that validation then reports no errors.
    """
    if not recorder_db_url.startswith("mysql://"):
        # Report the test as skipped rather than passed when it cannot run.
        pytest.skip("This problem only happens on MySQL")
    await async_setup_recorder_instance(hass)
    await async_wait_recording_done(hass)
    instance = get_instance(hass)
    session_maker = instance.get_session

    def _break_states_schema():
        """Switch the states table to a collation the validator must reject."""
        with session_scope(session=session_maker()) as session:
            session.execute(
                text(
                    "ALTER TABLE states CHARACTER SET utf8mb3 COLLATE utf8_general_ci, "
                    "LOCK=EXCLUSIVE;"
                )
            )

    await instance.async_add_executor_job(_break_states_schema)
    schema_errors = await instance.async_add_executor_job(
        validate_table_schema_has_correct_collation, instance, States
    )
    assert schema_errors == {"states.utf8mb4_unicode_ci"}

    # Now repair the schema
    await instance.async_add_executor_job(
        correct_db_schema_utf8, instance, States, schema_errors
    )

    # Now validate the schema again
    schema_errors = await instance.async_add_executor_job(
        validate_table_schema_has_correct_collation, instance, States
    )
    assert schema_errors == set()


async def test_validate_db_schema_fix_utf8_issue_with_broken_schema_unrepairable(
async_setup_recorder_instance: RecorderInstanceGenerator,
hass: HomeAssistant,
Expand Down

0 comments on commit 48c93c1

Please sign in to comment.