diff --git a/code/data_etl.ipynb b/code/data_etl.ipynb
index 29ff681..107af96 100644
--- a/code/data_etl.ipynb
+++ b/code/data_etl.ipynb
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 195,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20,6 +20,8 @@
     "import re\n",
     "import numpy as np\n",
     "import pandas as pd\n",
+    "import unicodedata\n",
+    "import string\n",
     "import rispy\n",
     "import matplotlib.pyplot as plt\n",
     "from pathlib import Path\n",
@@ -40,7 +42,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -57,7 +59,9 @@
     "    'transforms': [],\n",
     "}\n",
     "wosSource = {\n",
-    "    'paths': [dataSourceDir / x for x in (\"wos1-500.ciw\", \"wos501-973.ciw\")],\n",
+    "    'paths': [\n",
+    "        dataSourceDir / x for x in \"wos1001-1500.ciw  wos1-500.ciw  wos1501-1689.ciw  wos501-1000.ciw\".split()\n",
+    "    ],\n",
     "    'rispy_args': {'implementation': 'wok'},\n",
     "    'col_rename': {'publication_year': 'year', 'document_title': 'title'},\n",
     "    'transforms': [],\n",
@@ -66,7 +70,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,7 +90,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -95,7 +99,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -104,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -129,6 +133,65 @@
     "allData = pd.concat(allDataList, join='outer', ignore_index=True)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only article data\n",
+    "article_data = allData.loc[allData[\"type_of_reference\"].eq('JOUR') | allData[\"publication_type\"].eq('J')]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Normalize DOIs to upper case (ASCII letters only)\n",
+    "article_data.loc[:, 'doi'] = article_data['doi'].str.translate(\n",
+    "    str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove spurious records\n",
+    "article_data = article_data.loc[article_data['url'].ne(\n",
+    "    \"https://www.scopus.com/inward/record.uri?eid=2-s2.0-85052219975&partnerID=40&md5=7b54756675a6d510c9db069b49b634d6\"\n",
+    ")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Correct faulty records\n",
+    "data_corrections = {\n",
+    "    'doi': {\n",
+    "        r'^(.*)/PDF$': r'\\1',\n",
+    "    }\n",
+    "}\n",
+    "corrected_article_data = article_data.replace(data_corrections, regex=True)\n",
+    "article_data.compare(corrected_article_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "article_data = corrected_article_data"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -137,7 +200,7 @@
    },
    "outputs": [],
    "source": [
-    "allData.describe()"
+    "article_data.describe()"
    ]
   },
   {
@@ -151,7 +214,11 @@
     "        return np.nan\n",
     "    if sx.name == '__source':\n",
     "        return sx.sum()\n",
-    "    return sx[sx.map(len, na_action='ignore').idxmax()]\n",
+    "    if sx.name == 'doi':\n",
+    "        if len(sx.dropna().unique()) > 1:\n",
+    "            print('Warning, merging different DOIs:\\n', sx)\n",
+    "            return list(sx.dropna().unique()) # Keep a list of all DOIs - must explode before using!\n",
+    "    return sx[sx.map(len, na_action='ignore').idxmax()]\n",
     "\n",
     "def merge_records_keep_longest(dfx):\n",
     "    return dfx.agg(merge_series_keep_longest)"
@@ -163,10 +230,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Keep only article data\n",
-    "article_data = allData.loc[allData[\"type_of_reference\"].eq('JOUR') | allData[\"publication_type\"].eq('J')]\n",
     "# Merge data with same DOI\n",
     "article_doi = article_data.groupby(article_data['doi'].values).agg(merge_records_keep_longest)\n",
+    "\n",
     "# Reassemble data with and without DOI\n",
     "article_nodoi = article_data[~article_data.doi.isin(article_doi.index)]\n",
     "article_data = pd.concat([article_doi, article_nodoi], ignore_index=True)"
@@ -174,7 +240,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 179,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_diacritics(input_str):\n",
+    "    nfkd_form = unicodedata.normalize('NFKD', input_str)\n",
+    "    return \"\".join([c for c in nfkd_form if not unicodedata.combining(c)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -185,12 +262,13 @@
     "        .str.replace(r'[^\\s\\w]', ' ', regex=True)\n",
     "        .str.replace(r'\\s+', ' ', regex=True)\n",
     "        .str.strip()\n",
+    "        # .map(remove_diacritics) # no need as our corpus is in English\n",
     "    )"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -235,7 +313,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "aa = articles_g.agg(list)[articles_g.size()>=2]"
+    "aa = articles_g.agg(list)[articles_g.size() > 1]"
    ]
   },
   {
@@ -245,10 +323,13 @@
    "outputs": [],
    "source": [
     "# Test alternatives matchers\n",
-    "# articles_gx = article_data.groupby(Match(article_data, 15).match)\n",
-    "# bb = articles_gx.agg(list)[articles_gx.size()>=2]\n",
-    "# set(clean_titles(aa.explode('title')['title'])).difference(clean_title(bb.explode('title')['title']))\n",
-    "# set(clean_titles(bb.explode('title')['title'])).difference(clean_title(aa.explode('title')['title']))"
+    "if False:\n",
+    "    articles_gx = article_data.groupby(Match(article_data, 15).match)\n",
+    "    bb = articles_gx.agg(list)[articles_gx.size() > 1]\n",
+    "    pprint([sorted(x) for x in (\n",
+    "        set(clean_titles(aa.explode('title')['title'])).difference(clean_titles(bb.explode('title')['title'])),\n",
+    "        set(clean_titles(bb.explode('title')['title'])).difference(clean_titles(aa.explode('title')['title'])),\n",
+    "    )])"
    ]
   },
   {
@@ -257,102 +338,53 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Check that matching titles also have matching year and author (impl: first author last name)\n",
-    "assert aa['year'].map(lambda x: len(set(x)) < 2).all()\n",
-    "aa['authors'].map(\n",
-    "    lambda x: set(\n",
-    "        tuple(z.split(',')[0].split(' ')[-1] for z in y) # last name of each author\n",
-    "        for y in x\n",
-    "        if not ( isinstance(y, np.float) and pd.isna(y) ) # skip NANs\n",
+    "def clean_name(name):\n",
+    "    return remove_diacritics(name.split(',')[0].split(' ')[-1].lower().replace(' ', '').replace('-', ''))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show groups of matching titles whose records have differing years\n",
+    "sel = aa['year'].map(lambda x: len(set(x)) > 1)\n",
+    "aa[sel]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show groups of matching titles whose first authors differ (impl: nonzero edit distance between first-author last names)\n",
+    "sel = aa['authors'].map(\n",
+    "    lambda merged_authors: set(\n",
+    "        tuple( # last name of each author\n",
+    "            clean_name(author)\n",
+    "            for author in authors\n",
+    "        )\n",
+    "        for authors in merged_authors\n",
+    "        if not ( isinstance(authors, float) and pd.isna(authors) ) # skip NANs\n",
     "    )\n",
     ").map(\n",
-    "    lambda x: sum(\n",
-    "        edit_distance(y, z) # sum the edit distances\n",
-    "        for x in list(zip(*x))[:1] # first authors\n",
-    "        for i, y in enumerate(x) for z in x[i+1:] # distinct pairs\n",
+    "    lambda merged_lastnames: sum(\n",
+    "        edit_distance(firstauthor, other_firstauthor) # sum the edit distances\n",
+    "        for merged_firstauthor in list(zip(*merged_lastnames))[:1] # first authors\n",
+    "        for i, firstauthor in enumerate(merged_firstauthor)\n",
+    "        for other_firstauthor in merged_firstauthor[i+1:] # distinct pairs\n",
     "    )\n",
-    ").max() < 2"
+    ") > 0\n",
+    "aa[sel].authors.to_dict()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>doi</th>\n",
-       "      <th>title</th>\n",
-       "      <th>authors</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>623</td>\n",
-       "      <td>706</td>\n",
-       "      <td>702</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>unique</th>\n",
-       "      <td>623</td>\n",
-       "      <td>706</td>\n",
-       "      <td>680</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>top</th>\n",
-       "      <td>10.1016/j.ohx.2020.e00127</td>\n",
-       "      <td>Research on Monitoring Platform of Agricultura...</td>\n",
-       "      <td>[Pearce, J.M.]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>freq</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>10</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                              doi  \\\n",
-       "count                         623   \n",
-       "unique                        623   \n",
-       "top     10.1016/j.ohx.2020.e00127   \n",
-       "freq                            1   \n",
-       "\n",
-       "                                                    title         authors  \n",
-       "count                                                 706             702  \n",
-       "unique                                                706             680  \n",
-       "top     Research on Monitoring Platform of Agricultura...  [Pearce, J.M.]  \n",
-       "freq                                                    1              10  "
-      ]
-     },
-     "execution_count": 90,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "article_data[['doi', 'title', 'authors']].describe()"
    ]
@@ -384,101 +416,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Load article data (instead of running the code above)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 99,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data_corrections = {\n",
-    "    'doi': {\n",
-    "        r'^(.*)/pdf$': r'\\1',\n",
-    "#        r'^(.*)/\\w+/$': r'\\1',\n",
-    "    }\n",
-    "}"
+    "# Load article data (if already stored from the code above)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "article_data = load_data(articleDataFile)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead tr th {\n",
-       "        text-align: left;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th colspan=\"2\" halign=\"left\">doi</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th>self</th>\n",
-       "      <th>other</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>245</th>\n",
-       "      <td>10.1088/2058-7058/31/8/34/pdf</td>\n",
-       "      <td>10.1088/2058-7058/31/8/34</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                               doi                           \n",
-       "                              self                      other\n",
-       "245  10.1088/2058-7058/31/8/34/pdf  10.1088/2058-7058/31/8/34"
-      ]
-     },
-     "execution_count": 101,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "rep_article_data = article_data.replace(data_corrections, regex=True)\n",
-    "article_data.compare(rep_article_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 102,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "article_data = rep_article_data"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -488,7 +437,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 169,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -497,7 +446,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -521,23 +470,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "assert plosData[\"URI (DOI or URL)\"].notna().all()"
+    "assert plosData[\"URI (DOI or URL)\"].notna().all()\n",
+    "# Normalize the DOI/URL column to upper case (ASCII letters only)\n",
+    "plosData[\"URI (DOI or URL)\"] = plosData[\"URI (DOI or URL)\"].str.translate(\n",
+    "    str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 171,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Get the doi and doi-like, fixing doi-like containing extra stuff\n",
     "re_doi = r\"(10\\.[1-9]\\d{3,}(?:\\.\\d+)*/.+)\"\n",
-    "re_http_doi_fix = r\"https?://.*/\" + re_doi + r\"(?:/|/full|/abstract|#\\w+)$\""
+    "re_http_doi_fix = r\"HTTPS?://.*/\" + re_doi + r\"(?:/|/FULL|/ABSTRACT|#\\w+)$\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 172,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -546,122 +499,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 173,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
     "plosData_doi_http_doi_fixed = (\n",
     "    plosData['URI (DOI or URL)']\n",
-    "    .str.extract(re_httpdoi)[0]\n",
+    "    .str.extract(re_http_doi_fix)[0]\n",
     "    .map(unquote, na_action='ignore')\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 174,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>self</th>\n",
-       "      <th>other</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>35</th>\n",
-       "      <td>10.5334/joh.7/</td>\n",
-       "      <td>10.5334/joh.7</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>36</th>\n",
-       "      <td>10.5334/joh.4/</td>\n",
-       "      <td>10.5334/joh.4</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>96</th>\n",
-       "      <td>10.3389/fnbeh.2019.00140/full</td>\n",
-       "      <td>10.3389/fnbeh.2019.00140</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>98</th>\n",
-       "      <td>10.3389/fncir.2012.00098/full</td>\n",
-       "      <td>10.3389/fncir.2012.00098</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>99</th>\n",
-       "      <td>10.3389/fneng.2014.00043/full</td>\n",
-       "      <td>10.3389/fneng.2014.00043</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>103</th>\n",
-       "      <td>10.3389/fnins.2019.00784/full</td>\n",
-       "      <td>10.3389/fnins.2019.00784</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>126</th>\n",
-       "      <td>10.1088/1741-2552/aa6806#jneaa6806f01</td>\n",
-       "      <td>10.1088/1741-2552/aa6806</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>128</th>\n",
-       "      <td>10.5334/joh.14/</td>\n",
-       "      <td>10.5334/joh.14</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>134</th>\n",
-       "      <td>10.3389/fphys.2019.00099/abstract</td>\n",
-       "      <td>10.3389/fphys.2019.00099</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                      self                     other\n",
-       "35                          10.5334/joh.7/             10.5334/joh.7\n",
-       "36                          10.5334/joh.4/             10.5334/joh.4\n",
-       "96           10.3389/fnbeh.2019.00140/full  10.3389/fnbeh.2019.00140\n",
-       "98           10.3389/fncir.2012.00098/full  10.3389/fncir.2012.00098\n",
-       "99           10.3389/fneng.2014.00043/full  10.3389/fneng.2014.00043\n",
-       "103          10.3389/fnins.2019.00784/full  10.3389/fnins.2019.00784\n",
-       "126  10.1088/1741-2552/aa6806#jneaa6806f01  10.1088/1741-2552/aa6806\n",
-       "128                        10.5334/joh.14/            10.5334/joh.14\n",
-       "134      10.3389/fphys.2019.00099/abstract  10.3389/fphys.2019.00099"
-      ]
-     },
-     "execution_count": 174,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "plosData_doi.loc[plosData_doi_http_doi_fixed.notna()].compare(plosData_doi_http_doi_fixed.dropna())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 175,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -671,52 +531,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 176,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "4              10.1002/elps.201800304\n",
-       "35                      10.5334/joh.7\n",
-       "36                      10.5334/joh.4\n",
-       "65       10.1021/acs.analchem.9b02628\n",
-       "66                  10.1063/1.4941068\n",
-       "                    ...              \n",
-       "251      10.1371/journal.pone.0011890\n",
-       "317      10.1371/journal.pone.0214460\n",
-       "319      10.1371/journal.pone.0192752\n",
-       "330    10.1016/j.techfore.2020.119986\n",
-       "331                 10.1111/tra.12728\n",
-       "Name: doi, Length: 126, dtype: object"
-      ]
-     },
-     "execution_count": 176,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "plosData['doi'].dropna()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 193,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "19 712\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\n",
-    "    len(set(plosData['doi'].dropna()).intersection(article_data['doi'])),\n",
-    "    len(set(plosData['doi'].dropna()).symmetric_difference(article_data['doi'])),\n",
+    "    len(set(plosData['doi'].dropna()).intersection(article_data['doi'].explode())),\n",
+    "    len(set(plosData['doi'].dropna()).symmetric_difference(article_data['doi'].explode())),\n",
     ")"
    ]
   },
@@ -729,7 +559,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 224,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -738,43 +568,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 197,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "36"
-      ]
-     },
-     "execution_count": 197,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# How many from the collection have their title in article_data\n",
     "plosData['Title (URL items only)'].pipe(clean_titles).map(\n",
-    "    lambda x: article_data.title.pipe(clean_titles).str.contains(rf'(?i){x}', regex=True).any()\n",
+    "    lambda x: article_data['title'].pipe(clean_titles).str.contains(rf'(?i){x}', regex=True).any()\n",
     ").sum()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 198,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "20"
-      ]
-     },
-     "execution_count": 198,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# How many from the collection have their title in article_data if we require they have DOIs\n",
     "plosData['Title (URL items only)'].loc[plosData['doi'].notna()].pipe(clean_titles).map(\n",
@@ -784,27 +592,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 210,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "219    10.1371/journal.pone.0168207\n",
-      "117       10.1016/j.ohx.2017.07.001\n",
-      "203    10.1371/journal.pone.0181560\n",
-      "231    10.1371/journal.pone.0134989\n",
-      "190    10.1371/journal.pone.0201353\n",
-      "65     10.1021/acs.analchem.9b02628\n",
-      "181    10.1371/journal.pone.0220091\n",
-      "232    10.1371/journal.pone.0124938\n",
-      "182    10.1371/journal.pone.0228140\n",
-      "210    10.1371/journal.pone.0178540\n",
-      "Name: doi, dtype: object\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Give me 10 from the collection having DOIs\n",
     "z = plosData['doi'].dropna().sample(10)\n",
@@ -813,23 +603,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 212,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "219 chaos based simultaneous compression and encryption for hadoop\n",
-      "203 feasibility of a 3d printed anthropomorphic patient specific head phantom for patient specific quality assurance of intensity modulated radiotherapy\n",
-      "65 odx a fitness tracker based device for continuous bacterial growth monitoring\n",
-      "181 a low cost fluorescence reader for in vitro transcription and nucleic acid detection with cas13a\n",
-      "232 multi contrast imaging and digital refocusing on a mobile microscope with a domed led array\n",
-      "182 fieldwork based determination of design priorities for point of use drinking water quality sensors for use in resource limited environments\n",
-      "210 from medical imaging data to 3d printed anatomical models\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Get their titles if their titles are not in article_data\n",
     "for i, title in plosData.loc[z.index]['Title (URL items only)'].pipe(clean_titles).items():\n",
@@ -839,42 +615,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 246,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "107"
-      ]
-     },
-     "execution_count": 246,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Selector for DOIs only in the collection\n",
-    "sel_new_doi = ~plosData[\"doi\"].dropna().isin(article_data.doi.values)\n",
+    "sel_new_doi = ~plosData[\"doi\"].dropna().isin(article_data['doi'].explode().values)\n",
     "sel_new_doi.sum()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 263,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "136"
-      ]
-     },
-     "execution_count": 263,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Selector for Titles only in the collection\n",
     "sel_new_title = ~clean_titles(plosData[\"Title (URL items only)\"]).isin(clean_titles(article_data['title']))\n",
@@ -883,17 +637,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 268,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "bottom-illuminated orbital shaker for microalgae cultivation 10.1016/j.ohx.2020.e00143 10.1101/2020.05.01.071878\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Same title, different DOIs\n",
     "x = plosData[[\"doi\", \"Title (URL items only)\"]].loc[sel_new_doi & ~sel_new_title]\n",
@@ -910,7 +656,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 269,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -918,8 +664,8 @@
     "x = plosData.loc[~sel_new_doi & sel_new_title, 'doi']\n",
     "for y in x:\n",
     "    print(\n",
-    "        plosData.loc[plosData.doi.eq(y), \"Title (URL items only)\"].squeeze(),\n",
-    "        article_data.loc[article_data.doi.eq(y), 'title'].squeeze(),\n",
+    "        plosData.loc[plosData['doi'].eq(y), \"Title (URL items only)\"].squeeze(),\n",
+    "        article_data.loc[article_data['doi'].explode().eq(y), 'title'].squeeze(),\n",
     "    )"
    ]
   },
@@ -936,7 +682,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = nad"
+    "article_data.shape"
    ]
   },
   {
@@ -945,8 +691,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(data.shape)\n",
-    "print(data.columns)"
+    "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts()"
    ]
   },
   {
@@ -955,7 +700,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(article_data.shape)"
+    "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)"
    ]
   },
   {
@@ -964,25 +709,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dup_title = article_data.duplicated('title', keep=False)\n",
-    "dup_doi = article_data.duplicated('doi', keep=False)\n",
-    "nan_doi = article_data['doi'].isna()\n",
-    "print(\n",
-    "    dup_title.sum(),\n",
-    "    dup_doi.sum(),\n",
-    "    nan_doi.sum(),\n",
-    "    (dup_title & dup_doi).sum(),\n",
-    "    (dup_title & ~dup_doi).sum(),\n",
-    ")"
+    "article_data.groupby('year').size().plot.bar()"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts()"
+    "## Play with our 10-article sample"
    ]
   },
   {
@@ -991,7 +725,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "article_data.issn.str.replace('[^\\d]', '', regex=True).value_counts().reset_index().plot(loglog=True)"
+    "dois = pd.Series(\"\"\"\n",
+    "    10.1371/journal.pone.0187219\n",
+    "    10.1371/journal.pone.0059840\n",
+    "    10.1371/journal.pone.0030837\n",
+    "    10.1371/journal.pone.0118545\n",
+    "    10.1371/journal.pone.0206678\n",
+    "    10.1371/journal.pone.0143547\n",
+    "    10.1371/journal.pone.0220751\n",
+    "    10.1371/journal.pone.0107216\n",
+    "    10.1371/journal.pone.0226761\n",
+    "    10.1371/journal.pone.0193744\n",
+    "\"\"\".split()).str.translate(\n",
+    "    str.maketrans(string.ascii_lowercase, string.ascii_uppercase)\n",
+    ")"
    ]
   },
   {
@@ -1000,15 +747,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "article_data.groupby('year').size().plot.bar()"
+    "dois[dois.isin(article_data.doi.explode())]"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/code/project_definitions.py b/code/project_definitions.py
index eedc0b8..2498649 100644
--- a/code/project_definitions.py
+++ b/code/project_definitions.py
@@ -35,13 +35,17 @@ def build_query():
     adjectives = [
         'open',
         'open source',
+        'opensource',
         'open science',
         'frugal',
+        #'do it yourself',
+        #'diy',
         #'low cost',
     ]
     phrases = [
         ' '.join([a, n]) for a in adjectives for n in nouns
     ]
+    phrases.remove('open design')
     phrases.extend([
         "free hardware and software",
     ])