From f1576549d671bf714ebb0448a00b487720a4a3fd Mon Sep 17 00:00:00 2001 From: Binh Vu Date: Tue, 12 Mar 2024 18:39:38 +0000 Subject: [PATCH] Supports manual corrections in DBpedia such as `dbo:collectionSize` to `dbo:country` --- CHANGELOG.md | 1 + kgdata/dbpedia/config.py | 4 ++++ kgdata/dbpedia/datasets/entity_redirections.py | 11 ++++++++++- kgdata/dbpedia/datasets/ontology_dump.py | 13 ++++++++++--- scripts/build.sh | 10 +++++----- 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abc0291..9116e6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Added - Add default classes & properties: `rdf:Resource` and `rdf:type`. +- Supports manual corrections in DBpedia such as `dbo:collectionSize` to `dbo:country`. ### Changed diff --git a/kgdata/dbpedia/config.py b/kgdata/dbpedia/config.py index 69eda03..359bcb7 100644 --- a/kgdata/dbpedia/config.py +++ b/kgdata/dbpedia/config.py @@ -29,6 +29,7 @@ def __init__(self, datadir: Path): self.dumps = datadir / "dumps" self.ontology_dump = datadir / "ontology_dump" + self.modifications = datadir / "modifications" self.mapping_extractor_dump = datadir / "mapping_extractor_dump" self.generic_extractor_dump = datadir / "generic_extractor_dump" @@ -75,6 +76,9 @@ def get_mapping_extractor_dump_files(self, lang: str = "en"): def get_redirection_dump_file(self, lang: str = "en"): return self._get_file(self.dumps / f"redirects_lang={lang}.ttl.bz2") + def get_redirection_modified_file(self): + return self.modifications / "redirections.csv" + def get_wikilink_dump_file(self, lang: str = "en"): return self._get_file(self.dumps / f"wikilinks_lang={lang}.ttl.bz2") diff --git a/kgdata/dbpedia/datasets/entity_redirections.py b/kgdata/dbpedia/datasets/entity_redirections.py index 379413d..671a401 100644 --- a/kgdata/dbpedia/datasets/entity_redirections.py +++ b/kgdata/dbpedia/datasets/entity_redirections.py @@ -1,8 +1,12 @@ +from __future__ import annotations + import orjson +import serde.csv from kgdata.dataset import Dataset from kgdata.dbpedia.config import DBpediaDirCfg from kgdata.dbpedia.datasets.entities import entities from kgdata.misc.ntriples_parser import Triple, ignore_comment, ntriple_loads +from kgdata.spark.common import get_spark_context from kgdata.spark.extended_rdd import ExtendedRDD from kgdata.splitter import split_a_file from rdflib import URIRef @@ -27,6 +31,8 @@ def entity_redirections(lang: str = "en"): override=False, ) + extra_redirections = serde.csv.deser(cfg.get_redirection_modified_file()) + ( ExtendedRDD.textFile(cfg.entity_redirections / f"raw-{lang}/*.gz") .filter(ignore_comment) @@ -38,8 +44,11 @@ def entity_redirections(lang: str = "en"): entities(lang).get_extended_rdd().map(lambda r: (r.id, 1)) ) # join with entities to filter out non-existing entities .flatMap(lambda x: x[1][0]) # get back the redirections + .union(ExtendedRDD.parallelize(extra_redirections)) .map(orjson.dumps) - .save_like_dataset(ds, auto_coalesce=True, shuffle=True) + .save_like_dataset( + ds, auto_coalesce=True, shuffle=True, trust_dataset_dependencies=True + ) ) return ds diff --git a/kgdata/dbpedia/datasets/ontology_dump.py b/kgdata/dbpedia/datasets/ontology_dump.py index 04659ed..3e7f8aa 100644 --- a/kgdata/dbpedia/datasets/ontology_dump.py +++ b/kgdata/dbpedia/datasets/ontology_dump.py @@ -6,14 +6,14 @@ from functools import lru_cache from typing import Any, Callable, Iterable -from rdflib import OWL, RDF, RDFS, BNode, URIRef - +import serde.csv from kgdata.dataset import Dataset from kgdata.dbpedia.config import DBpediaDirCfg from kgdata.misc.ntriples_parser import Triple, ignore_comment, ntriple_loads from kgdata.misc.resource import RDFResource from kgdata.spark import ExtendedRDD from kgdata.splitter import split_a_file, split_a_list +from rdflib import OWL, RDF, RDFS, BNode, URIRef rdf_type = str(RDF.type) rdfs_label = str(RDFS.label) @@ -83,6 +83,12 @@ def ontology_dump() -> Dataset[RDFResource]: # fix broken references resources = step2_ds.get_list() + # remove resources that have been redirected + redirected_resources = { + x[0] for x in serde.csv.deser(cfg.get_redirection_modified_file()) + } + resources = [r for r in resources if r.id not in redirected_resources] + classes = {r.id: r for r in resources if is_class(r)} props = {r.id: r for r in resources if is_prop(r)} @@ -130,7 +136,8 @@ def ontology_dump() -> Dataset[RDFResource]: [line for id, lines in logs.items() for line in ["* " + id] + lines] ) ) - final_ds.sign("ontology-dump/final", [step2_ds]) + assert final_ds.name is not None + final_ds.sign(final_ds.name, [step2_ds]) if not (cfg.ontology_dump / "predicates.txt").exists(): ( diff --git a/scripts/build.sh b/scripts/build.sh index e679124..c77e223 100644 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -36,16 +36,16 @@ function wikidata_db { # dbpedia_dataset generic_extractor_dump # dbpedia_dataset mapping_extractor_dump -# dbpedia_dataset ontology_dump -# dbpedia_dataset classes -# dbpedia_dataset properties +dbpedia_dataset ontology_dump +dbpedia_dataset classes +dbpedia_dataset properties dbpedia_dataset entities +dbpedia_dataset entity_redirections dbpedia_dataset entity_labels dbpedia_dataset entity_metadata dbpedia_dataset entity_all_types dbpedia_dataset entity_degrees dbpedia_dataset entity_types_and_degrees -dbpedia_dataset entity_redirections # ====================================================================== # WIKIDATA Datasets @@ -114,7 +114,7 @@ dbpedia_db entity_redirections # wikidata_db classes # wikidata_db properties # wikidata_db entities -wikidata_db entity_labels +# wikidata_db entity_labels # wikidata_db entity_metadata # wikidata_db entity_types # wikidata_db entity_outlinks