Skip to content

Commit

Permalink
moderation: move domains ban/safe-list to DB table
Browse files Browse the repository at this point in the history
  • Loading branch information
slint committed Nov 5, 2024
1 parent c51335a commit a67f987
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 84 deletions.
3 changes: 2 additions & 1 deletion site/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ invenio_celery.tasks =
invenio_oauth2server.scopes =
deposit_write_scope = zenodo_rdm.legacy.scopes:deposit_write_scope
deposit_actions_scope = zenodo_rdm.legacy.scopes:deposit_actions_scope

invenio_db.models =
zenodo_rdm_moderation = zenodo_rdm.moderation.models
invenio_assets.webpack =
zenodo_rdm_theme = zenodo_rdm.webpack:theme
invenio_config.module =
Expand Down
42 changes: 42 additions & 0 deletions site/tests/moderation/test_domains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Domain moderation tests."""

import pytest

from zenodo_rdm.moderation.models import LinkDomain, LinkDomainStatus


@pytest.fixture
def domains(db):
    """Populate the database with a small set of safe and banned link domains.

    Returns the list of created ``LinkDomain`` objects, in creation order.
    """
    # (domain, status) pairs; includes a banned subdomain of a safe domain
    # and a banned top-level domain.
    entries = (
        ("blog.io", LinkDomainStatus.SAFE),
        ("spam.blog.io", LinkDomainStatus.BANNED),
        ("edu.ch", LinkDomainStatus.SAFE),
        ("cam", LinkDomainStatus.BANNED),
    )
    created = [LinkDomain.create(name, status) for name, status in entries]
    db.session.commit()
    return created


@pytest.mark.parametrize(
    ("domain", "expected_status"),
    [
        # Exact match on a safe domain.
        ("https://blog.io/article", LinkDomainStatus.SAFE),
        # A more specific banned subdomain wins over its safe parent.
        ("https://spam.blog.io/article", LinkDomainStatus.BANNED),
        # Other subdomains inherit the parent's status.
        ("http://other.blog.io/article", LinkDomainStatus.SAFE),
        ("https://physics.edu.ch/article", LinkDomainStatus.SAFE),
        ("https://math.edu.ch/article", LinkDomainStatus.SAFE),
        # Everything under a banned top-level domain is banned.
        ("http://spam.cam/content", LinkDomainStatus.BANNED),
        ("http://sub.spam.cam/content", LinkDomainStatus.BANNED),
    ],
)
def test_lookup_domain(domains, domain, expected_status):
    """Check that a URL resolves to the status of its most specific domain."""
    match = LinkDomain.lookup_domain(domain)
    assert match.status == expected_status
6 changes: 0 additions & 6 deletions site/zenodo_rdm/moderation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,6 @@

from .rules import files_rule, links_rule, text_sanitization_rule, verified_user_rule

MODERATION_BANNED_LINK_DOMAINS = []
"""Banned domains for links."""

MODERATION_SAFE_LINK_DOMAINS = []
"""Safe domains for links."""

MODERATION_SCORES = {
"spam_link": 8,
"ham_link": -3,
Expand Down
55 changes: 0 additions & 55 deletions site/zenodo_rdm/moderation/domains.py

This file was deleted.

13 changes: 0 additions & 13 deletions site/zenodo_rdm/moderation/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from werkzeug.utils import cached_property

from . import config
from .domains import DomainTree


class ZenodoModeration:
Expand All @@ -36,18 +35,6 @@ def init_app(self, app):
self.init_config(app)
app.extensions["zenodo-moderation"] = self

@cached_property
def domain_tree(self):
"""Initialize and return the DomainTree instance with config-based links."""
domain_tree = DomainTree()
domain_tree.initialize_links(
current_app.config.get("MODERATION_BANNED_LINK_DOMAINS", []), "banned"
)
domain_tree.initialize_links(
current_app.config.get("MODERATION_SAFE_LINK_DOMAINS", []), "safe"
)
return domain_tree

@cached_property
def scores(self):
"""Return moderation score values used in rules."""
Expand Down
75 changes: 75 additions & 0 deletions site/zenodo_rdm/moderation/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Moderation models."""

import enum
from urllib.parse import urlparse

from invenio_db import db
from sqlalchemy_utils import ChoiceType, Timestamp


class LinkDomainStatus(enum.Enum):
    """Moderation status assigned to a link domain.

    Each member's value is a single-character code.
    """

    # Links to the domain are considered safe.
    SAFE = "S"
    # Links to the domain are banned.
    BANNED = "B"
    # NOTE(review): presumably marks domains needing manual moderation -- confirm.
    MODERATED = "M"


class LinkDomain(db.Model, Timestamp):
    """Link domain model.

    Domains are stored lowercased, reversed and dot-prefixed (``blog.io`` is
    stored as ``.io.blog``). In that form a stored domain is a string prefix
    of every one of its subdomains, which is what ``lookup_domain`` relies on
    to find the most specific matching entry.
    """

    __tablename__ = "link_domains"

    id = db.Column(db.Integer, primary_key=True)

    # Reversed, dot-prefixed domain (see class docstring).
    domain = db.Column(db.Text, nullable=False, unique=True)
    status = db.Column(
        ChoiceType(LinkDomainStatus, impl=db.CHAR(1)),
        nullable=False,
    )
    # Optional per-domain score and free-text reason.
    score = db.Column(db.Integer, nullable=True)
    reason = db.Column(db.Text, nullable=True)

    @staticmethod
    def _reverse_domain(domain):
        """Return ``domain`` in stored form: lowercased, reversed, dot-prefixed."""
        parts = domain.strip(".").lower().split(".")
        return "." + ".".join(reversed(parts))

    @classmethod
    def create(cls, domain, status, score=None, reason=None):
        """Create a link domain and add it to the session (without committing).

        :param domain: Domain in regular notation, e.g. ``"blog.io"``.
        :param status: A ``LinkDomainStatus`` member.
        :param score: Optional per-domain score.
        :param reason: Optional free-text reason.
        :returns: The new ``LinkDomain`` instance.
        """
        ld = cls(
            domain=cls._reverse_domain(domain),
            status=status,
            score=score,
            reason=reason,
        )
        db.session.add(ld)
        return ld

    @classmethod
    def lookup_domain(cls, url):
        """Lookup the most specific stored domain matching a URL's host.

        :param url: The URL to look up.
        :returns: The matching ``LinkDomain``, or ``None`` if the URL cannot
            be parsed, has no host, or no stored domain matches.
        """
        try:
            # ``hostname`` (unlike ``netloc``) excludes userinfo and port, so
            # ``http://user@spam.cam:8080/`` is looked up as ``spam.cam``.
            hostname = urlparse(url).hostname or ""
        except ValueError:
            return None

        # Normalise the same way as ``create``: lowercase, drop a trailing
        # FQDN dot.
        hostname = hostname.strip(".").lower()
        # Drop a leading "www." label. ``str.lstrip("www.")`` must not be used
        # here: it strips any leading run of the characters "w" and ".",
        # turning e.g. "web.example.org" into "eb.example.org".
        if hostname.startswith("www."):
            hostname = hostname[len("www.") :]
        if not hostname:
            return None

        reversed_domain = cls._reverse_domain(hostname)
        return (
            cls.query.filter(
                # Exact match
                (cls.domain == reversed_domain)
                # Or subdomain match
                | db.literal(reversed_domain).like(cls.domain + ".%")
            )
            # Order by length of domain to get the most specific match
            .order_by(db.func.length(cls.domain).desc())
            .limit(1)
            .scalar()
        )
1 change: 0 additions & 1 deletion site/zenodo_rdm/moderation/proxies.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,4 @@
from werkzeug.local import LocalProxy

current_moderation = LocalProxy(lambda: current_app.extensions["zenodo-moderation"])
current_domain_tree = LocalProxy(lambda: current_moderation.domain_tree)
current_scores = LocalProxy(lambda: current_moderation.scores)
23 changes: 15 additions & 8 deletions site/zenodo_rdm/moderation/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@

from flask import current_app

from zenodo_rdm.moderation.proxies import current_domain_tree

from .proxies import current_domain_tree, current_scores
from .models import LinkDomain, LinkDomainStatus
from .proxies import current_scores

#
# Utilities
Expand Down Expand Up @@ -65,11 +64,19 @@ def links_rule(identity, draft=None, record=None):
extracted_links = extract_links(str(record.metadata))

for link in extracted_links:
status = current_domain_tree.get_status(link)
if status == "banned":
score += current_scores.spam_link
elif status == "safe":
score += current_scores.ham_link
domain = LinkDomain.lookup_domain(link)
if not domain:
continue
if domain.status == LinkDomainStatus.BANNED:
if domain.score is not None:
score += domain.score
else:
score += current_scores.spam_link
elif domain == LinkDomainStatus.SAFE:
if domain.score is not None:
score += domain.score
else:
score += current_scores.ham_link
return score


Expand Down

0 comments on commit a67f987

Please sign in to comment.