Skip to content

Commit

Permalink
fix bugs in main property connections dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Binh Vu committed Jan 16, 2024
1 parent 893f076 commit 3a0b8ca
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 7 deletions.
2 changes: 1 addition & 1 deletion kgdata/wikidata/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, datadir: Path) -> None:
self.property_ranges = datadir / "045_property_ranges"
self.property_ranges = datadir / "045_property_ranges"
self.ont_count = datadir / "046_ont_count"
self.main_property_connections = datadir / "046_main_property_connections"
self.main_property_connections = datadir / "047_main_property_connections"

self.cross_wiki_mapping = datadir / "050_cross_wiki_mapping"

Expand Down
28 changes: 22 additions & 6 deletions kgdata/wikidata/datasets/main_property_connections.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union

import orjson
from kgdata.dataset import Dataset
Expand All @@ -11,15 +12,24 @@
from kgdata.wikidata.models.wdentity import WDEntity


def main_property_connections():
def get_main_property_connections_dataset(with_dep: bool = True):
cfg = WikidataDirCfg.get_instance()
ds = Dataset(

if with_dep:
deps = [entities(), entity_types()]
else:
deps = []

return Dataset(
cfg.main_property_connections / "*.gz",
deserialize=orjson.loads,
deserialize=deser_connection,
name="property-connections",
dependencies=[entities(), entity_types()],
dependencies=deps,
)


def main_property_connections():
ds = get_main_property_connections_dataset(with_dep=True)
if not ds.has_complete_data():
(
entities()
Expand All @@ -37,6 +47,8 @@ def main_property_connections():
.save_like_dataset(ds, auto_coalesce=True)
)

return ds


instanceof = "P31"
subclass_of = "P279"
Expand All @@ -45,6 +57,10 @@ def main_property_connections():
ignored_props = {instanceof, subclass_of, subproperty_of}


def deser_connection(line: Union[str, bytes]) -> PConnection:
    """Decode one serialized line into a ``PConnection``.

    The line is parsed as JSON with ``orjson.loads`` and the parsed value is
    passed to ``PConnection.from_dict``, whose result is returned.
    """
    record = orjson.loads(line)
    return PConnection.from_dict(record)


def merge_preconn(collection: dict[str, list[PrePConnection]], conn: PrePConnection):
if conn.prop not in collection:
collection[conn.prop] = [conn]
Expand Down Expand Up @@ -133,7 +149,7 @@ def get_prop_connections(ent: WDEntity):
domains = {
stmt.value.as_entity_id_safe(): 1 for stmt in ent.props.get(instanceof, [])
}
out: dict[str, list[PrePConnection]] = {}
out: dict[str, list[PrePConnection]] = defaultdict(list)

for prop, stmts in ent.props.items():
if prop in ignored_props:
Expand Down
2 changes: 2 additions & 0 deletions scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ wikidata_dataset entity_outlinks
wikidata_dataset entity_pagerank
wikidata_dataset entity_wiki_aliases

wikidata_dataset main_property_connections

# ======================================================================
# WIKIPEDIA Datasets

Expand Down

0 comments on commit 3a0b8ca

Please sign in to comment.