Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moderation: added query match rule #1049

Merged
merged 2 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ tests =
[options.entry_points]
flask.commands =
zenodo-admin = zenodo_rdm.cli:zenodo_admin
moderation = zenodo_rdm.cli:moderation
invenio_base.blueprints =
zenodo_rdm_legacy = zenodo_rdm.legacy.views:blueprint
zenodo_rdm_support = zenodo_rdm.views:create_blueprint
Expand Down
39 changes: 39 additions & 0 deletions site/tests/moderation/test_moderation_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-

Check failure on line 1 in site/tests/moderation/test_moderation_queries.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

Black format check --- /home/runner/work/zenodo-rdm/zenodo-rdm/site/tests/moderation/test_moderation_queries.py 2024-11-08 13:33:19.707687+00:00 +++ /home/runner/work/zenodo-rdm/zenodo-rdm/site/tests/moderation/test_moderation_queries.py 2024-11-08 13:38:42.445572+00:00 @@ -10,10 +10,11 @@ from invenio_db import db from invenio_search import current_search_client from zenodo_rdm.moderation.models import ModerationQuery from zenodo_rdm.moderation.rules import match_query_rule from zenodo_rdm.api import ZenodoRDMRecord + def test_moderation_query_creation(app): """Test to create and index a ModerationQuery.""" with app.app_context(): query_string = "metadata.title:SimpleTest" @@ -34,6 +35,7 @@ query.score == score, query.active == active, ] ) -#TODO: Add test for matching query \ No newline at end of file + +# TODO: Add test for matching query

Check failure on line 1 in site/tests/moderation/test_moderation_queries.py

View workflow job for this annotation

GitHub Actions / Python (site, 3.9, postgresql14, opensearch2)

isort-check from invenio_db import db from invenio_search import current_search_client + +from zenodo_rdm.api import ZenodoRDMRecord from zenodo_rdm.moderation.models import ModerationQuery from zenodo_rdm.moderation.rules import match_query_rule -from zenodo_rdm.api import ZenodoRDMRecord + def test_moderation_query_creation(app): """Test to create and index a ModerationQuery."""
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Test ModerationQuery model class."""

from invenio_db import db
from invenio_search import current_search_client
from zenodo_rdm.moderation.models import ModerationQuery
from zenodo_rdm.moderation.rules import match_query_rule
from zenodo_rdm.api import ZenodoRDMRecord

def test_moderation_query_creation(app):
    """Create a ModerationQuery and verify the attributes it was given."""
    expected = {
        "query_string": "metadata.title:SimpleTest",
        "notes": "test query",
        "score": 5,
        "active": True,
    }

    with app.app_context():
        query = ModerationQuery.create(
            expected["query_string"],
            ZenodoRDMRecord,
            notes=expected["notes"],
            score=expected["score"],
            active=expected["active"],
        )
        db.session.commit()

        # Every value handed to ``create`` must be reflected on the model.
        matches = [getattr(query, name) == value for name, value in expected.items()]
        assert all(matches)

# TODO: Add test for matching query
33 changes: 33 additions & 0 deletions site/zenodo_rdm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import click
from flask.cli import with_appcontext
from invenio_access.permissions import system_identity
from invenio_communities.communities.records.api import Community
from invenio_db import db
from invenio_pidstore.models import PersistentIdentifier
from invenio_rdm_records.proxies import current_rdm_records_service
Expand All @@ -25,6 +26,12 @@
from invenio_requests.records.api import Request
from invenio_requests.records.models import RequestMetadata

from zenodo_rdm.api import ZenodoRDMRecord
from zenodo_rdm.moderation.percolator import (
create_percolator_index,
get_percolator_index,
)


def _get_parent(record_model):
parent_model = record_model.parent
Expand Down Expand Up @@ -246,3 +253,29 @@ def delete_record(recid):

for req in requests:
current_requests_service.indexer.delete(req)


# Root click group for the moderation sub-commands; it is registered as the
# ``moderation`` command through the ``flask.commands`` entry point (setup.cfg).
@click.group()
def moderation():
    """Moderation commands."""


@moderation.command("create-queries-index")
@click.option(
    "-r",
    "--record-cls",
    type=click.Choice(["records", "communities"], case_sensitive=False),
    default="records",
    help="Record class to base the index on (default: records).",
)
@with_appcontext
def create_index(record_cls):
    """Command to create a percolator index for moderation queries."""
    # Any choice other than "records" selects the community record class.
    if record_cls == "records":
        target_cls = ZenodoRDMRecord
    else:
        target_cls = Community

    try:
        create_percolator_index(target_cls)
        created_name = get_percolator_index(target_cls)
        message = f"Percolator index '{created_name}' created successfully."
        click.secho(message)
    except Exception as exc:
        # Report the failure on the console instead of letting it propagate.
        click.secho(f"Error creating percolator index: {exc}")
23 changes: 22 additions & 1 deletion site/zenodo_rdm/moderation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

"""Moderation config."""

from .rules import files_rule, links_rule, text_sanitization_rule, verified_user_rule
from .rules import (
files_rule,
links_rule,
match_query_rule,
text_sanitization_rule,
verified_user_rule,
)

MODERATION_SCORES = {
"spam_link": 8,
Expand Down Expand Up @@ -40,12 +46,27 @@
links_rule,
files_rule,
text_sanitization_rule,
match_query_rule,
]
"""Scoring rules for record moderation."""

MODERATION_COMMUNITY_SCORE_RULES = [
links_rule,
text_sanitization_rule,
verified_user_rule,
match_query_rule,
slint marked this conversation as resolved.
Show resolved Hide resolved
]
"""Scoring rules for community moderation."""

# Joined with the record index name ("<prefix>-<record index name>") to build
# the name of the percolator index.
MODERATION_PERCOLATOR_INDEX_PREFIX = "moderation-queries"
"""Index Prefix for percolator index."""

# Extra field mappings merged into the mapping copied from the record index
# when the percolator index is created.
MODERATION_PERCOLATOR_MAPPING = {
    "properties": {
        "query": {"type": "percolator"},  # field the percolate search runs against
        "score": {"type": "integer"},  # summed by ``match_query_rule`` for each hit
        "notes": {"type": "text"},
        "active": {"type": "boolean"},  # ``match_query_rule`` filters on active=True
    }
}
"""Properties for moderation percolator index."""
52 changes: 52 additions & 0 deletions site/zenodo_rdm/moderation/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@
import enum
from urllib.parse import urlparse

from flask import current_app
from invenio_db import db
from invenio_search import current_search_client
from sqlalchemy_utils import ChoiceType, Timestamp

from zenodo_rdm.api import ZenodoRDMRecord

from .percolator import index_percolate_query


class LinkDomainStatus(enum.Enum):
"""Link domain status."""
Expand Down Expand Up @@ -73,3 +79,49 @@ def lookup_domain(cls, url):
.limit(1)
.scalar()
)


class ModerationQuery(db.Model):
    """Database model holding the stored moderation queries."""

    __tablename__ = "moderation_queries"

    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    """Auto-incremented primary key of the moderation query."""

    score = db.Column(db.Integer, default=0)
    """Score attached to the query."""

    query_string = db.Column(db.Text, nullable=False)
    """Query string holding the filter criteria."""

    notes = db.Column(db.Text, nullable=True)
    """Optional free-text notes about the moderation query."""

    active = db.Column(db.Boolean, default=True)
    """Whether the moderation query is currently active."""

    @classmethod
    def create(
        cls, query_string, record_cls=ZenodoRDMRecord, notes=None, score=0, active=True
    ):
        """Add a moderation query to the session and index it for percolation.

        The new row is only added to ``db.session``; no commit is issued here.
        The same values are then handed to ``index_percolate_query`` for the
        percolator index that belongs to ``record_cls``.
        """
        column_values = {
            "query_string": query_string,
            "notes": notes,
            "score": score,
            "active": active,
        }
        moderation_query = cls(**column_values)
        db.session.add(moderation_query)

        index_percolate_query(record_cls, query_string, active, score, notes)

        return moderation_query

    @classmethod
    def get(cls, query_id=None):
        """Return the query with ``query_id`` (or None), or all queries without an ID."""
        if query_id is None:
            return cls.query.all()
        return cls.query.filter_by(id=query_id).one_or_none()

    def __repr__(self):
        """Return a short debug representation of the moderation query."""
        return (
            f"<ModerationQuery id={self.id}, score={self.score}, active={self.active}>"
        )
90 changes: 90 additions & 0 deletions site/zenodo_rdm/moderation/percolator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2017-2024 CERN.
# Copyright (C) 2022 Graz University of Technology.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Percolator."""


from flask import current_app
from invenio_search import current_search_client
from invenio_search.utils import build_alias_name, build_index_name


def get_percolator_index(record_cls):
    """Return the percolator index alias name for ``record_cls``.

    The name is ``<MODERATION_PERCOLATOR_INDEX_PREFIX>-<record index name>``
    passed through ``build_alias_name`` for the current application.
    """
    config_prefix = current_app.config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    record_index_name = record_cls.index._name
    return build_alias_name(f"{config_prefix}-{record_index_name}", app=current_app)


def create_percolator_index(record_cls):
    """Create the percolator index that stores moderation queries.

    The mappings of the existing record index are copied and extended with the
    fields from ``MODERATION_PERCOLATOR_MAPPING``. The ``query.default_field``
    and ``analysis`` settings of the record index are reused. Nothing is
    created when the percolator index already exists, and an error raised by
    the create call is logged instead of being propagated.
    """
    config = current_app.config

    # Target index name: configured prefix joined with the record index name.
    # NOTE(review): this uses ``build_index_name`` while ``get_percolator_index``
    # uses ``build_alias_name`` -- confirm both resolve to the same index.
    index_prefix = config.get("MODERATION_PERCOLATOR_INDEX_PREFIX")
    target_index = build_index_name(
        f"{index_prefix}-{record_cls.index._name}", app=current_app
    )

    # Fetch the mapping of the record index; exactly one index is expected
    # behind the record alias.
    source_index = build_alias_name(record_cls.index._name)
    mapping_response = current_search_client.indices.get_mapping(index=source_index)
    assert len(mapping_response) == 1
    mappings = list(mapping_response.values())[0]["mappings"]

    # Extend the copied mapping with the percolator-specific fields.
    target_properties = mappings["properties"]
    target_properties.update(config.get("MODERATION_PERCOLATOR_MAPPING")["properties"])

    # Reuse the query default field and the analysis settings of the record index.
    settings_response = current_search_client.indices.get_settings(index=source_index)
    source_settings = list(settings_response.values())[0]["settings"]["index"]
    default_field = source_settings.get("query", {}).get("default_field", [])
    new_settings = {
        "index": {"query": {"default_field": default_field}},
        "analysis": source_settings.get("analysis", {}),
    }

    if current_search_client.indices.exists(target_index):
        return

    try:
        current_search_client.indices.create(
            index=target_index,
            body={"settings": new_settings, "mappings": dict(mappings)},
        )
    except Exception as exc:
        current_app.logger.exception(exc)


def index_percolate_query(record_cls, query_string, active=True, score=1, notes=None):
    """Store a query-string query as a document in the percolator index.

    Any exception raised while resolving the index name or indexing the
    document is logged through the application logger and not re-raised.
    """
    document = {
        "query": {"query_string": {"query": query_string}},
        "active": active,
        "score": score,
        "notes": notes,
    }
    try:
        target_index = get_percolator_index(record_cls)
        current_search_client.index(index=target_index, body=document)
    except Exception as exc:
        current_app.logger.exception(exc)
30 changes: 30 additions & 0 deletions site/zenodo_rdm/moderation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,11 @@
import re

from flask import current_app
from invenio_search import current_search_client
from invenio_search.utils import build_alias_name

from .models import LinkDomain, LinkDomainStatus
from .percolator import get_percolator_index
from .proxies import current_scores

#
Expand Down Expand Up @@ -130,3 +133,30 @@ def files_rule(identity, draft=None, record=None):
score += current_scores.ham_files

return score


def match_query_rule(identity, draft=None, record=None):
    """Calculate a score from the percolate queries matching the record.

    The dumped record is percolated against the active queries stored in the
    percolator index and the ``score`` of every returned hit is summed.

    :param identity: Identity performing the action (not used by this rule).
    :param draft: Draft being moderated (not used by this rule).
    :param record: Record whose ``dumps()`` output is percolated.
    :returns: Sum of the ``score`` values of the returned hits; ``0`` when no
        percolator index name is available or no query matches.
    """
    document = record.dumps()
    percolator_index = get_percolator_index(record)
    if not percolator_index:
        # Without this guard the code below would use names that are only
        # bound when an index name is available (UnboundLocalError).
        return 0

    # NOTE(review): no explicit ``size`` is passed, so the search backend's
    # default page size caps how many matching queries are summed -- confirm
    # this is intended.
    matched_queries = current_search_client.search(
        index=percolator_index,
        body={
            "query": {
                "bool": {
                    "must": [
                        {"term": {"active": True}},
                        {"percolate": {"field": "query", "document": document}},
                    ]
                }
            }
        },
    )

    # A hit without a stored score contributes nothing.
    return sum(
        hit["_source"].get("score", 0) for hit in matched_queries["hits"]["hits"]
    )
Loading