diff --git a/README.md b/README.md index 87fc54e..d458924 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ The project aims at using state-of-the-art machine learning methods, and in part ## Data ### Raw -- cs_45.json; cell line terms, extracted from Cellosaurus; 145673 terms -- cs_pos_pmid_set.tsv; curated positive samples, extracted from Cellosaurus, 22719 PMIDs -- gs_neg_pmid.tsv; curated negative samples, extracted from Google Scholar, 475 PMIDs -- ls_neg_pmid.tsv; uncurated negative samples, extracted from LitSuggest, 645 PMIDs +- cs_term_45.0; cell line terms, extracted from Cellosaurus, 145673 terms +#### PMID +- Cellosaurus; positive samples, extracted from Cellosaurus, 22719 PMIDs +- CellosaurusAB; positive samples, extracted from Cellosaurus, curated, high proportion of seminal papers, 10000 PMIDs +- GoogleScholar; negative samples, extracted from rejected Google Scholar results, curated, 509 PMIDs +- LitSuggest; negative samples, extracted from rejected LitSuggest results, 645 PMIDs \ No newline at end of file diff --git a/notebook/data_proc_pmid.ipynb b/notebook/data_proc_pmid.ipynb index 3c5688c..2f2329a 100644 --- a/notebook/data_proc_pmid.ipynb +++ b/notebook/data_proc_pmid.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 130, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -17,6 +17,7 @@ "import random\n", "from munch import Munch\n", "import numpy as np\n", + "from tab\n", "\n", "with initialize(\n", " version_base=None,\n",