add meta graph stats
Binh Vu committed Jan 22, 2024
1 parent 5cd4304 commit 13db96e
Showing 5 changed files with 275 additions and 28 deletions.
1 change: 1 addition & 0 deletions kgdata/wikidata/config.py
@@ -51,6 +51,7 @@ def __init__(self, datadir: Path) -> None:
        self.entity_wiki_aliases = datadir / "077_entity_wiki_aliases"

        self.meta_graph = datadir / "080_meta_graph"
        self.meta_graph_stats = datadir / "081_meta_graph_stats"

        # deprecated
        self.wp2wd = datadir / "wp2wd"
4 changes: 3 additions & 1 deletion kgdata/wikidata/datasets/meta_graph.py
@@ -2,12 +2,14 @@

from collections import defaultdict
from dataclasses import dataclass
from functools import partial
from typing import Dict, Iterable, List, Optional, Tuple, TypeAlias, Union

import orjson
from sm.misc.funcs import filter_duplication

from kgdata.dataset import Dataset
from kgdata.db import deser_from_dict
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikidata.datasets.entities import entities
from kgdata.wikidata.datasets.entity_outlinks import entity_outlinks
@@ -21,7 +23,7 @@ def meta_graph():

    ds = Dataset(
        cfg.meta_graph / "*.gz",
        deserialize=orjson.loads,
        deserialize=partial(deser_from_dict, MetaEntity),
        name="meta-graph",
        dependencies=[entities(), entity_outlinks(), entity_types()],
    )
243 changes: 243 additions & 0 deletions kgdata/wikidata/datasets/meta_graph_stats.py
@@ -0,0 +1,243 @@
from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass
from functools import partial
from operator import add
from typing import Dict, Iterable, List, Optional, Tuple, TypeAlias, Union

import orjson
from sm.misc.funcs import filter_duplication

from kgdata.dataset import Dataset
from kgdata.db import deser_from_dict, ser_to_dict
from kgdata.wikidata.config import WikidataDirCfg
from kgdata.wikidata.datasets.entities import entities
from kgdata.wikidata.datasets.entity_outlinks import entity_outlinks
from kgdata.wikidata.datasets.entity_types import entity_types
from kgdata.wikidata.datasets.meta_graph import MetaEntity, meta_graph
from kgdata.wikidata.models.wdentity import WDEntity
from kgdata.wikidata.models.wdvalue import WDValue, WDValueKind


def meta_graph_stats():
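    """Compute statistics over the meta graph: how often each predicate and
    predicate/qualifier pair is used, which entity classes they connect
    (and how often), and how often pairs of predicates co-occur on the
    same entity."""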
    cfg = WikidataDirCfg.get_instance()

    # have information about the domains and ranges of predicates
    predicate_count_ds = Dataset(
        cfg.meta_graph_stats / "predicate_count/*.gz",
        deserialize=partial(deser_from_dict, PCount),
        name="meta-graph-stats/predicate-count",
        dependencies=[meta_graph()],
    )

    predicate_conn_ds = Dataset(
        cfg.meta_graph_stats / "predicate_conn/*.gz",
        deserialize=partial(deser_from_dict, PConnection),
        name="meta-graph-stats/predicate-conn",
        dependencies=[meta_graph()],
    )

    predicate_occurrence_ds = Dataset(
        cfg.meta_graph_stats / "predicate_occurrence/*.gz",
        deserialize=partial(deser_from_dict, POccurrence),
        name="meta-graph-stats/predicate-occurrence",
        dependencies=[meta_graph()],
    )

    if not predicate_count_ds.has_complete_data():
        (
            meta_graph()
            .get_extended_rdd()
            .flatMap(lambda x: get_pcount(x))
            .reduceByKey(add)
            .map(lambda x: PCount(x[0], x[1]))
            .map(ser_to_dict)
            .save_like_dataset(predicate_count_ds, auto_coalesce=True)
        )

    if not predicate_conn_ds.has_complete_data():

        def merge_pconnections(p1: PConnection, p2: PConnection):
            p1.freq += p2.freq
            return p1

        (
            meta_graph()
            .get_extended_rdd()
            .flatMap(get_pconnection)
            .map(lambda x: (x.get_key(), x))
            .reduceByKey(merge_pconnections)
            .map(lambda x: orjson.dumps(x[1].to_dict()))
            .save_like_dataset(predicate_conn_ds, auto_coalesce=True)
        )

    if not predicate_occurrence_ds.has_complete_data():
        (
            meta_graph()
            .get_extended_rdd()
            .flatMap(get_poccurrence)
            .reduceByKey(add)
            .map(lambda x: POccurrence(x[0][0], x[0][1], x[1]))
            .map(ser_to_dict)
            .save_like_dataset(predicate_occurrence_ds, auto_coalesce=True)
        )


def get_pcount(meta_entity: MetaEntity):
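    """Emit ((property, qualifier), 1) pairs for each distinct property and
    property/qualifier combination used by this meta entity; qualifier is
    None for the property itself."""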
    out: set[tuple[tuple[str, Optional[str]], int]] = set()
    for prop, stmts in meta_entity.props.items():
        out.add(((prop, None), 1))
        for stmt in stmts:
            for qual in stmt.qualifiers.keys():
                out.add(((prop, qual), 1))

    return list(out)


def get_pconnection(meta_entity: MetaEntity):
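    """Enumerate the (property, qualifier, source class, target class)
    connections observed on this meta entity, one PConnection with freq=1
    per distinct key."""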
    out: dict[tuple, PConnection] = {}
    for prop, stmts in meta_entity.props.items():
        for stmt in stmts:
            for source_type in meta_entity.classes:
                if stmt.value is None:
                    conn = PConnection(
                        prop=prop,
                        qual=None,
                        source_type=source_type,
                        target_type=None,
                        freq=1,
                    )
                    out[conn.get_key()] = conn
                else:
                    for target_type in stmt.value:
                        conn = PConnection(
                            prop=prop,
                            qual=None,
                            source_type=source_type,
                            target_type=target_type,
                            freq=1,
                        )
                        out[conn.get_key()] = conn

            for qual, values in stmt.qualifiers.items():
                for value in values:
                    if value is None:
                        for source_type in meta_entity.classes:
                            conn = PConnection(
                                prop=prop,
                                qual=qual,
                                source_type=source_type,
                                target_type=None,
                                freq=1,
                            )
                            out[conn.get_key()] = conn
                    else:
                        for source_type in meta_entity.classes:
                            for target_type in value:
                                conn = PConnection(
                                    prop=prop,
                                    qual=qual,
                                    source_type=source_type,
                                    target_type=target_type,
                                    freq=1,
                                )
                                out[conn.get_key()] = conn

    return list(out.values())


def get_poccurrence(meta_entity: MetaEntity):
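    """Emit a count of 1 for every ordered pair of distinct predicates
    (properties and property/qualifier pairs) used by this meta entity."""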
    used_predicates: set[tuple[str, Optional[str]]] = set()
    for prop, stmts in meta_entity.props.items():
        used_predicates.add((prop, None))
        for stmt in stmts:
            for qual in stmt.qualifiers.keys():
                used_predicates.add((prop, qual))

    out: list[
        tuple[tuple[tuple[str, Optional[str]], tuple[str, Optional[str]]], int]
    ] = []
    for p1 in used_predicates:
        for p2 in used_predicates:
            if p1 != p2:
                out.append(((p1, p2), 1))

    return out


@dataclass
class PCount:
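    """Usage count of a predicate: freq is the number of meta entities that use
    the (property, qualifier) pair, with qualifier None for the property itself."""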
    predicate: tuple[str, Optional[str]]
    freq: int

    def to_dict(self):
        return {
            "predicate": self.predicate,
            "freq": self.freq,
        }

    @staticmethod
    def from_dict(d: dict):
        return PCount(
            predicate=d["predicate"],
            freq=d["freq"],
        )


@dataclass
class POccurrence:
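    """Co-occurrence count of two predicates: freq is the number of meta entities
    on which both predicate1 and predicate2 are used."""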
    predicate1: tuple[str, Optional[str]]
    predicate2: tuple[str, Optional[str]]
    freq: int

    def to_dict(self):
        return {
            "predicate1": self.predicate1,
            "predicate2": self.predicate2,
            "freq": self.freq,
        }

    @staticmethod
    def from_dict(d: dict):
        return POccurrence(
            predicate1=d["predicate1"],
            predicate2=d["predicate2"],
            freq=d["freq"],
        )


@dataclass
class PConnection:
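    """A (property, qualifier, source class, target class) connection in the
    meta graph; freq is the number of meta entities exhibiting this connection."""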
    prop: str
    qual: Optional[str]
    source_type: str
    target_type: Optional[str]
    freq: int

    def get_key(self):
        return (
            self.prop,
            self.qual,
            self.source_type,
            self.target_type,
        )

    def to_dict(self):
        return {
            "prop": self.prop,
            "qual": self.qual,
            "source_type": self.source_type,
            "target_type": self.target_type,
            "freq": self.freq,
        }

    @staticmethod
    def from_dict(d: dict):
        return PConnection(
            prop=d["prop"],
            qual=d["qual"],
            source_type=d["source_type"],
            target_type=d["target_type"],
            freq=d["freq"],
        )
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "kgdata"
version = "6.3.2"
version = "6.4.0"
description = "Library to process dumps of knowledge graphs (Wikipedia, DBpedia, Wikidata)"
readme = "README.md"
authors = [{ name = "Binh Vu", email = "[email protected]" }]
53 changes: 27 additions & 26 deletions scripts/build.sh
@@ -50,32 +50,33 @@ function wikidata_db {
# # python -m kgdata.wikidata.datasets -d entity_redirection_dump --sign
# # python -m kgdata.wikidata.datasets -d page_dump --sign

wikidata_dataset page_ids
wikidata_dataset entity_ids
wikidata_dataset entity_redirections
wikidata_dataset entities
wikidata_dataset entity_types

wikidata_dataset classes
wikidata_dataset properties

wikidata_dataset class_count
wikidata_dataset property_count
wikidata_dataset property_domains
wikidata_dataset property_ranges

wikidata_dataset cross_wiki_mapping

wikidata_dataset entity_metadata
wikidata_dataset entity_all_types
wikidata_dataset entity_degrees
wikidata_dataset entity_labels
wikidata_dataset entity_types_and_degrees
wikidata_dataset entity_outlinks
wikidata_dataset entity_pagerank
wikidata_dataset entity_wiki_aliases

wikidata_dataset meta_graph
# wikidata_dataset page_ids
# wikidata_dataset entity_ids
# wikidata_dataset entity_redirections
# wikidata_dataset entities
# wikidata_dataset entity_types

# wikidata_dataset classes
# wikidata_dataset properties

# wikidata_dataset class_count
# wikidata_dataset property_count
# wikidata_dataset property_domains
# wikidata_dataset property_ranges

# wikidata_dataset cross_wiki_mapping

# wikidata_dataset entity_metadata
# wikidata_dataset entity_all_types
# wikidata_dataset entity_degrees
# wikidata_dataset entity_labels
# wikidata_dataset entity_types_and_degrees
# wikidata_dataset entity_outlinks
# wikidata_dataset entity_pagerank
# wikidata_dataset entity_wiki_aliases

# wikidata_dataset meta_graph
wikidata_dataset meta_graph_stats

# ======================================================================
# WIKIPEDIA Datasets