Skip to content

Commit

Permalink
Add Danish Discourse dataset (#247)
Browse files Browse the repository at this point in the history
* misc.

* update ddisco.py

* chore: delete ddisco.py, ddisco.test.tsv and ddisco.train.tsv

* Update mteb/tasks/Classification/DdiscoCohesionClassification.py

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* Update mteb/tasks/Classification/DdiscoCohesionClassification.py

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* Update mteb/tasks/Classification/DdiscoCohesionClassification.py

Co-authored-by: Imene Kerboua <[email protected]>

* Update mteb/tasks/Classification/DdiscoCohesionClassification.py

Co-authored-by: Imene Kerboua <[email protected]>

* Update mteb/tasks/Classification/DdiscoCohesionClassification.py

Co-authored-by: Imene Kerboua <[email protected]>

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
Co-authored-by: Imene Kerboua <[email protected]>
  • Loading branch information
3 people authored Mar 18, 2024
1 parent 0048878 commit d46d0f5
Showing 1 changed file with 84 additions and 0 deletions.
84 changes: 84 additions & 0 deletions mteb/tasks/Classification/DdiscoCohesionClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from datasets import load_dataset

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification


class DdiscoCohesionClassification(AbsTaskClassification):
    """Classification task over the Danish discourse coherence dataset DDisCo.

    The task pulls ``DDSC/ddisco`` from the Hugging Face hub at a pinned
    revision and exposes its ``rating`` column as the ``label`` column.
    """

    @property
    def description(self):
        """Return the task description: hub location, splits, languages and score."""
        return {
            "name": "Ddisco",
            "hf_hub_name": "DDSC/ddisco",
            "description": "A Danish Discourse dataset with values for coherence and source (Wikipedia or Reddit)",
            "reference": "https://aclanthology.org/2022.lrec-1.260/",
            "type": "Classification",
            "category": "s2s",
            "eval_splits": ["test"],
            "eval_langs": ["da"],
            "main_score": "accuracy",
            "revision": "514ab557579fcfba538a4078d6d647248a0e6eb7",
        }

    def load_data(self, **kwargs):
        """Load the dataset from the Hugging Face hub and normalise its columns.

        Does nothing when ``self.data_loaded`` is already truthy. ``kwargs`` are
        accepted but not used by this implementation.
        """
        if self.data_loaded:
            return

        # Build the description dict once instead of once per lookup.
        description = self.description
        self.dataset = load_dataset(
            description["hf_hub_name"], revision=description.get("revision")
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self):
        """Rename ``rating`` to ``label`` and drop the ``domain`` column."""
        # NOTE(review): "domain" presumably holds the text source
        # (Wikipedia/Reddit) mentioned in the description -- confirm.
        self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns(
            ["domain"]
        )

    @property
    def metadata(self):
        """Return descriptive metadata for the dataset, including its BibTeX citation."""
        return {
            "date": "2012-01-01/2022-06-25",
            "form": ["written"],
            "domains": ["non-fiction", "social"],
            "dialect": [],
            "task_subtypes": ["Discourse coherence"],
            "license": "cc-by-sa-3.0",
            "socioeconomic_status": "high",
            "annotations_creators": "expert-annotated",
            "text_creation": "found",
            # Raw string: the BibTeX entry uses LaTeX accent escapes (\' and \`).
            # In a normal string literal \' loses its backslash and \` is an
            # invalid escape sequence, which corrupts the author/editor names.
            "citation": r"""
@inproceedings{flansmose-mikkelsen-etal-2022-ddisco,
title = "{DD}is{C}o: A Discourse Coherence Dataset for {D}anish",
author = "Flansmose Mikkelsen, Linea and
Kinch, Oliver and
Jess Pedersen, Anders and
Lacroix, Oph{\'e}lie",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.260",
pages = "2440--2445",
abstract = "To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.",
}
""",
        }


0 comments on commit d46d0f5

Please sign in to comment.