Skip to content

Commit

Permalink
Merge branch 'master' of github.com:experimaestro/datamaestro_text
Browse files Browse the repository at this point in the history
  • Loading branch information
bpiwowar committed Oct 27, 2023
2 parents 72e9316 + c435f20 commit fb4871c
Show file tree
Hide file tree
Showing 2 changed files with 226 additions and 3 deletions.
153 changes: 151 additions & 2 deletions src/datamaestro_text/data/ir/formats.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import ClassVar
from typing import ClassVar, Tuple
from attrs import define
from .base import IDHolder, TextAndIDHolder, Document, GenericTopic
from ir_datasets.datasets.wapo import WapoDocMedia
from .base import IDHolder, Document, GenericTopic, IDTopic
from ir_datasets.datasets.cord19 import Cord19FullTextSection


@define
Expand All @@ -24,17 +26,164 @@ class DocumentWithTitle(IDHolder, Document):

text: str


@define
class CordFullTextDocument(IDHolder, Document):
    """Document carrying a title, DOI, date, abstract and full-text sections.

    Only the abstract is exposed through ``get_text``; the title and the
    ``body`` sections are stored but are not part of the returned text.
    """

    title: str
    doi: str
    date: str
    abstract: str
    # Full-text sections, typed with the ir_datasets CORD-19 section type
    body: Tuple[Cord19FullTextSection, ...]

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the abstract rendered as a string."""
        abstract = self.abstract
        return f"{abstract}"


@define
class MsMarcoDocument(IDHolder, Document):
    """Document with a URL, a title and a body.

    ``get_text`` returns the body only; URL and title are kept as metadata.
    """

    url: str
    title: str
    body: str

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the body rendered as a string."""
        body = self.body
        return f"{body}"


@define
class NFCorpusDocument(IDHolder, Document):
    """Document with a URL, a title and an abstract.

    ``get_text`` returns the abstract only; URL and title are kept as
    metadata.
    """

    url: str
    title: str
    abstract: str

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the abstract rendered as a string."""
        abstract = self.abstract
        return f"{abstract}"


@define
class TitleDocument(IDHolder, Document):
    """Document made of a text and a title.

    Note the field order (``text`` before ``title``): it defines the
    positional order of the generated constructor.
    """

    text: str
    title: str

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the title followed by the text, separated by one space."""
        title, text = self.title, self.text
        return f"{title} {text}"


@define
class TitleUrlDocument(IDHolder, Document):
    """Document made of a text, a title and a URL.

    Note the field order (``text``, ``title``, ``url``): it defines the
    positional order of the generated constructor. The URL is not part of
    the text returned by ``get_text``.
    """

    text: str
    title: str
    url: str

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the title followed by the text, separated by one space."""
        title, text = self.title, self.text
        return f"{title} {text}"


@define
class TrecParsedDocument(IDHolder, Document):
    """Document with a title, a body and the marked-up document as bytes.

    ``get_text`` combines title and body; ``marked_up_doc`` is stored but
    not used to build the text.
    """

    title: str
    body: str
    marked_up_doc: bytes

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the title followed by the body, separated by one space."""
        title, body = self.title, self.body
        return f"{title} {body}"


@define
class WapoDocument(IDHolder, Document):
    """Document with article metadata, a plain body, HTML paragraphs and media.

    ``get_text`` returns the ``body`` field only; all other fields are kept
    as metadata.
    """

    url: str
    title: str
    author: str
    published_date: int
    kicker: str
    body: str
    # Body paragraphs as HTML strings
    body_paras_html: Tuple[str, ...]
    # Media entries, typed with the ir_datasets WaPo media type
    body_media: Tuple[WapoDocMedia, ...]

    # Not an attrs field (ClassVar): flags that get_text() is implemented
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the body rendered as a string."""
        body = self.body
        return f"{body}"


@define
class TweetDoc(IDHolder, Document):
    """Tweet document: the tweet text plus its metadata and raw source.

    ``get_text`` returns the ``text`` field only; the remaining fields are
    kept as metadata.
    """

    text: str
    user_id: str
    created_at: str
    lang: str
    reply_doc_id: str
    retweet_doc_id: str
    # Raw source payload; its content type is given by source_content_type
    source: bytes
    source_content_type: str

    # Declared for consistency with the other document formats of this
    # module, which all set this flag alongside their get_text()
    # implementation. Being a ClassVar, it is not an attrs field and does not
    # change the generated constructor.
    has_text: ClassVar[bool] = True

    def get_text(self) -> str:
        """Return the tweet text rendered as a string."""
        return f"{self.text}"


@define
class TrecTopic(GenericTopic):
    """Topic with a text, a query and a narrative.

    ``get_text`` returns the ``text`` field only.
    """

    text: str
    query: str
    narrative: str

    def get_text(self) -> str:
        """Return the topic text rendered as a string."""
        text = self.text
        return f"{text}"


@define
class UrlTopic(GenericTopic):
    """Topic with a text and a URL.

    ``get_text`` returns the ``text`` field only.
    """

    text: str
    url: str

    def get_text(self) -> str:
        """Return the topic text rendered as a string."""
        text = self.text
        return f"{text}"


@define
class NFCorpusTopic(IDTopic):
    """Topic with a title and an ``all`` field.

    ``get_text`` returns the title only.
    """

    title: str
    # The name shadows the builtin `all` inside the class body, but it is
    # part of the constructor interface and must keep this exact name.
    all: str

    def get_text(self) -> str:
        """Return the title rendered as a string."""
        title = self.title
        return f"{title}"


@define
class TrecQuery(IDTopic):
    """Topic with a title, a description and a narrative.

    ``get_text`` returns the description (not the title).
    """

    title: str
    description: str
    narrative: str

    def get_text(self) -> str:
        """Return the description rendered as a string."""
        description = self.description
        return f"{description}"


@define
class TrecMb13Query(IDTopic):
    """Topic with a query string and two time fields.

    ``get_text`` returns the ``query`` field only.
    """

    query: str
    time: str
    tweet_time: str

    def get_text(self) -> str:
        """Return the query rendered as a string."""
        query = self.query
        return f"{query}"


@define
class TrecMb14Query(IDTopic):
    """Topic with a query string, two time fields and a description.

    ``get_text`` returns the ``query`` field only; the description is stored
    but is not part of the returned text.
    """

    query: str
    time: str
    tweet_time: str
    description: str

    def get_text(self) -> str:
        """Return the query rendered as a string."""
        query = self.query
        return f"{query}"
76 changes: 75 additions & 1 deletion src/datamaestro_text/datasets/irds/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
from typing import Any, Iterator, Tuple, Type, List
import attrs
import ir_datasets
from ir_datasets.formats import GenericDoc, GenericQuery, GenericDocPair
from ir_datasets.formats import (
GenericDoc,
GenericQuery,
GenericDocPair,
TrecParsedDoc,
TrecQuery,
)
import ir_datasets.datasets as _irds
from experimaestro import Config
from experimaestro.compat import cached_property
Expand Down Expand Up @@ -95,6 +101,54 @@ class Documents(ir.DocumentStore, IRDSId):
_irds.miracl.MiraclDoc: tuple_constructor(
formats.DocumentWithTitle, "doc_id", "title", "text"
),
_irds.beir.BeirTitleDoc: tuple_constructor(
formats.TitleDocument, "doc_id", "text", "title"
),
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
),
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
),
_irds.cord19.Cord19FullTextDoc: tuple_constructor(
formats.CordFullTextDocument,
"doc_id",
"title",
"doi",
"date",
"abstract",
"body",
),
_irds.nfcorpus.NfCorpusDoc: tuple_constructor(
formats.NFCorpusDocument, "doc_id", "url", "title", "abstract"
),
TrecParsedDoc: tuple_constructor(
formats.TrecParsedDocument, "doc_id", "title", "body", "marked_up_doc"
),
_irds.wapo.WapoDoc: tuple_constructor(
formats.WapoDocument,
"doc_id",
"url",
"title",
"author",
"published_date",
"kicker",
"body",
"body_paras_html",
"body_media",
),
_irds.tweets2013_ia.TweetDoc: tuple_constructor(
formats.TweetDoc,
"doc_id",
"text",
"user_id",
"created_at",
"lang",
"reply_doc_id",
"retweet_doc_id",
"source",
"source_content_type",
),
}

"""Wraps an ir datasets collection -- and provide a default text
Expand Down Expand Up @@ -162,6 +216,26 @@ class Topics(ir.TopicsStore, IRDSId):
_irds.beir.BeirCovidQuery: tuple_constructor(
formats.TrecTopic, "query_id", "text", "query", "narrative"
),
_irds.beir.BeirUrlQuery: tuple_constructor(
formats.UrlTopic, "query_id", "text", "url"
),
_irds.nfcorpus.NfCorpusQuery: tuple_constructor(
formats.NFCorpusTopic, "query_id", "title", "all"
),
TrecQuery: tuple_constructor(
formats.TrecQuery, "query_id", "title", "description", "narrative"
),
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
),
_irds.tweets2013_ia.TrecMb14Query: tuple_constructor(
formats.TrecMb14Query,
"query_id",
"query",
"time",
"tweet_time",
"description",
),
}

def iter(self) -> Iterator[ir.Topic]:
Expand Down

0 comments on commit fb4871c

Please sign in to comment.