Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter work summary to remove non-xml safe characters (PP-1969) #2198

Merged
merged 2 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Remove unsafe characters from summary

Revision ID: c3458e1ef9aa
Revises: 272da5f400de
Create Date: 2024-11-27 20:32:41.431147+00:00

"""

from alembic import op

# revision identifiers, used by Alembic.
revision = "c3458e1ef9aa"
down_revision = "272da5f400de"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Strip non-XML-safe characters from existing ``works.summary_text`` values.

    The application code has been updated to filter these characters out when
    a summary is set; this migration cleans up data that was stored earlier.
    Only rows whose summary contains at least one such character are updated.

    PostgreSQL POSIX regular expression reference:
    https://www.postgresql.org/docs/current/functions-matching.html#FUNCTIONS-POSIX-REGEXP
    """
    # Negated character class: matches any single character outside the
    # ranges listed. It is used twice below, once to select the affected rows
    # and once (with a trailing "+") to remove whole runs of such characters.
    unsafe_char = "[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]"
    cleanup_sql = (
        "UPDATE works SET summary_text = regexp_replace("
        f" summary_text, '{unsafe_char}+', '', 'g'"
        ") WHERE "
        f"summary_text ~ '{unsafe_char}'"
    )
    op.execute(cleanup_sql)


def downgrade() -> None:
    """Intentionally a no-op: nothing needs to be done on downgrade."""
    return None
18 changes: 15 additions & 3 deletions src/palace/manager/sqlalchemy/model/work.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

from __future__ import annotations

import re
import sys
from collections import Counter
from collections.abc import Sequence
from datetime import date, datetime
from decimal import Decimal
from functools import cache
from typing import TYPE_CHECKING, Any, cast

import opensearchpy
Expand Down Expand Up @@ -609,11 +611,21 @@ def merge_into(self, other_work):

other_work.calculate_presentation()

def set_summary(self, resource):
@classmethod
@cache
def _xml_text_sanitization_regex(cls) -> re.Pattern[str]:
    """Return the compiled pattern that matches runs of non-XML-safe characters.

    The pattern is a negated character class followed by ``+``, so each match
    is one or more consecutive characters outside the listed ranges. Because
    of ``@cache`` the pattern is compiled on the first call and the same
    object is returned on later calls with the same ``cls``.

    Character ranges taken from:
    https://stackoverflow.com/questions/8733233/filtering-out-certain-bytes-in-python
    """
    unsafe_run = (
        r"[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+"
    )
    return re.compile(unsafe_run)

def set_summary(self, resource: Resource | None) -> None:
    """Use ``resource`` as this work's summary and refresh ``summary_text``.

    When ``resource`` has a representation, ``summary_text`` becomes that
    representation's unicode content with every non-XML-safe character
    removed. Otherwise ``summary_text`` is set to the empty string. A
    summary coverage record is added in both cases.

    :param resource: The resource holding the summary, or a falsy value
        (e.g. ``None``) to clear the summary text.
    """
    self.summary = resource
    if resource and resource.representation:
        # NOTE(review): assumes ``unicode_content`` may be None (for example
        # when the content cannot be decoded) -- confirm against
        # Representation. Falling back to "" keeps ``Pattern.sub`` from
        # raising TypeError and matches the empty-summary branch below.
        raw_text = resource.representation.unicode_content or ""
        # Make sure that the summary text only contains characters that are XML compatible.
        self.summary_text = self._xml_text_sanitization_regex().sub("", raw_text)
    else:
        self.summary_text = ""
    WorkCoverageRecord.add_for(self, operation=WorkCoverageRecord.SUMMARY_OPERATION)
Expand Down
22 changes: 17 additions & 5 deletions tests/manager/sqlalchemy/model/test_edition.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,20 +403,32 @@ def test_set_summary(self, db: DatabaseTransactionFixture):
work = db.work(presentation_edition=e)
overdrive = DataSource.lookup(db.session, DataSource.OVERDRIVE)

# Set the work's summmary.
# Set the work's summary.
l1, new = pool.add_link(
Hyperlink.DESCRIPTION, None, overdrive, "text/plain", "F"
)
work.set_summary(l1.resource)

assert l1.resource == work.summary
assert "F" == work.summary_text
assert work.summary == l1.resource
assert work.summary_text == "F"

# Set the work's summary to a string that contains characters that cannot be
# represented in XML.
l2, new = pool.add_link(
Hyperlink.DESCRIPTION,
None,
overdrive,
"text/plain",
"\u0000💣ü🔥\u0001\u000C",
)
work.set_summary(l2.resource)
assert work.summary_text == "💣ü🔥"

# Remove the summary.
work.set_summary(None)

assert None == work.summary
assert "" == work.summary_text
assert work.summary is None
assert work.summary_text == ""

def test_calculate_evaluate_summary_quality_with_privileged_data_sources(
self, db: DatabaseTransactionFixture
Expand Down
Loading