Skip to content

Commit

Permalink
New validations for actor packs url and twitter_handle overlap (#41)
Browse files Browse the repository at this point in the history
  • Loading branch information
soad003 committed Feb 17, 2023
1 parent 8425990 commit 10fb4f7
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 3 deletions.
52 changes: 50 additions & 2 deletions src/tagpack/actorpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,19 @@
import json
import os
import sys
from collections import defaultdict

import yaml
from yamlinclude import YamlIncludeConstructor

from tagpack import TagPackFileError, UniqueKeyLoader, ValidationError
from tagpack.cmd_utils import print_info
from tagpack.utils import apply_to_dict_field, try_parse_date
from tagpack.cmd_utils import print_info, print_warn
from tagpack.utils import (
apply_to_dict_field,
get_secondlevel_domain,
strip_empty,
try_parse_date,
)


class ActorPack(object):
Expand Down Expand Up @@ -129,6 +135,8 @@ def validate(self):
e2 = "Mandatory tag field {} missing in {}"
e3 = "Field {} not allowed in {}"
e4 = "Value of body field {} must not be empty (None) in {}"
domain_overlap = defaultdict(set)
twitter_handle_overlap = defaultdict(set)
for actor in self.get_unique_actors():
# check if mandatory actor fields are defined
if not isinstance(actor, Actor):
Expand Down Expand Up @@ -157,6 +165,26 @@ def validate(self):
except ValidationError as e:
raise ValidationError(f"{e} in {actor}")

for uri in set(actor.uris):
domain_overlap[get_secondlevel_domain(uri)].add(actor.identifier)

if actor.twitter_handle:
twitter_handle_overlap[actor.twitter_handle].add(actor.identifier)

for domain, actors in domain_overlap.items():
if len(actors) > 1:
print_warn(
f"These actors share the same domain {actors} - {domain}."
" Please merge!"
)

for twitter_handle, actors in twitter_handle_overlap.items():
if len(actors) > 1:
print_warn(
"These actors share the same twitter_handle "
f"{actors} - {twitter_handle}. Please merge!"
)

if self._duplicates:
msg = (
f"{len(self._duplicates)} duplicate(s) found, starting "
Expand Down Expand Up @@ -208,6 +236,26 @@ def all_fields(self):
**self.explicit_fields,
}

@property
def context(self):
    """Return the actor's ``context`` field decoded from JSON.

    The field is read from ``self.contents`` and is expected to hold a
    JSON-encoded string. A missing, ``None`` or empty value is treated
    as "no context" and yields an empty dict, so callers can always use
    ``.get`` on the result.

    Returns:
        The decoded JSON value, or ``{}`` when no context is set.

    Raises:
        json.JSONDecodeError: if the stored string is not valid JSON.
    """
    raw_context = self.contents.get("context")
    # json.loads(None) raises TypeError and json.loads("") raises
    # JSONDecodeError; both simply mean there is nothing to decode.
    if not raw_context:
        return {}
    return json.loads(raw_context)

@property
def uris(self):
    """Return the actor's URIs with empty entries removed.

    Combines the ``uri`` field of ``self.contents`` with the ``uris``
    list found in the decoded context, then passes the combined list
    through ``strip_empty``.
    """
    decoded_context = self.context
    primary_uri = self.contents.get("uri", None)
    additional_uris = decoded_context.get("uris", [])
    candidates = [primary_uri] + additional_uris
    return strip_empty(candidates)

@property
def twitter_handle(self):
    """Return the ``twitter_handle`` entry of the context, or ``None``."""
    decoded_context = self.context
    return decoded_context.get("twitter_handle", None)

@property
def identifier(self):
    """Return the ``id`` field of ``self.contents``, or ``None`` if unset."""
    fields = self.contents
    return fields.get("id", None)

def to_json(self):
"""Returns a JSON serialization of all actor fields"""
actor = self.all_fields
Expand Down
47 changes: 46 additions & 1 deletion src/tagpack/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
from datetime import datetime
from urllib.parse import urlparse


def strip_values(listlike, values):
    """Return a new list of the items of ``listlike`` not found in ``values``.

    Membership is tested with ``in``, so ``values`` may contain
    unhashable entries such as ``[]``. Item order is preserved.
    """
    kept = []
    for item in listlike:
        if item in values:
            continue
        kept.append(item)
    return kept


def strip_none(listlike):
    """Return a new list with the ``None`` entries of ``listlike`` removed."""
    unwanted = [None]
    return strip_values(listlike, unwanted)


def strip_empty(listlike):
    """Return a new list without ``None``, ``""`` and ``[]`` entries."""
    unwanted = [None, "", []]
    return strip_values(listlike, unwanted)


def try_parse_date(date, format="%Y-%m-%d"):
Expand All @@ -20,8 +33,40 @@ def try_parse_date(date, format="%Y-%m-%d"):
return date


def apply_to_dict_field(dictlike, field, fun, fail=True):
def apply_to_dict_field(dictlike, field: str, fun, fail=True):
    """Replace ``dictlike[field]`` with ``fun(dictlike[field])`` in place.

    Args:
        dictlike (dict): Mapping to modify in place.
        field (str): Key whose value is transformed.
        fun (callable): Function of one argument; its return value is
            stored back under ``field``.
        fail (bool, optional): If True, raise when ``field`` is absent;
            otherwise a missing field is silently ignored.

    Raises:
        ValueError: If ``field`` is not in ``dictlike`` and ``fail`` is
            truthy.
    """
    if field not in dictlike:
        if fail:
            raise ValueError(f"Field {field} is not present in dictionary.")
        return
    dictlike[field] = fun(dictlike[field])


def get_secondlevel_domain(url: str) -> str:
    """Return the second-level (registrable) domain of a URL or bare host.

    The host is taken from the URL without port or userinfo and is
    lower-cased, so that ``Example.com:8080`` and ``example.com`` map to
    the same key. The last two labels of the host are returned, or the
    last three when the second-to-last label is ``co`` (e.g.
    ``abc.co.uk``). This is a heuristic and not a full public-suffix
    lookup.

    Args:
        url (str): URL or host name; a scheme is optional.

    Returns:
        str: the second-level domain, the host itself if it has a single
        label, or ``""`` if no host could be determined.
    """
    # Only treat the text before "://" as a scheme if it looks like one.
    # A plain startswith("http") check misclassifies hosts such as
    # "httpbin.org", and a bare "://" check would be fooled by a URL
    # embedded in a query string.
    scheme, separator, _ = url.partition("://")
    has_scheme = (
        bool(separator)
        and scheme[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in scheme)
    )
    if not has_scheme:
        url = f"http://{url.lstrip('/')}"

    # .hostname drops userinfo and port and lower-cases the host;
    # a trailing root dot ("example.com.") must not count as a label.
    host = (urlparse(url).hostname or "").rstrip(".")
    labels = host.split(".")
    if len(labels) < 2:
        return host
    keep = 3 if labels[-2] == "co" else 2
    return ".".join(labels[-keep:])
8 changes: 8 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from tagpack.utils import get_secondlevel_domain


def test_tld_extraction():
    """Check second-level domain extraction for plain and ``.co.`` hosts."""
    cases = [
        ("abc.co.uk", "abc.co.uk"),
        ("spam.abc.co.uk", "abc.co.uk"),
        ("spam.uk", "spam.uk"),
        ("www.spam.uk", "spam.uk"),
    ]
    for url, expected in cases:
        assert get_secondlevel_domain(url) == expected

0 comments on commit 10fb4f7

Please sign in to comment.