diff --git a/JSON-data-files/accent_edges_13_sw.json b/JSON-data-files/accent_edges_13_sw.json
new file mode 100644
index 0000000..edf7d68
--- /dev/null
+++ b/JSON-data-files/accent_edges_13_sw.json
@@ -0,0 +1 @@
+{"0": {"source": 2, "target": 3, "weight": 1}, "2": {"source": 2, "target": 8, "weight": 1}, "3": {"source": 2, "target": 14, "weight": 1}, "4": {"source": 2, "target": 15, "weight": 1}, "6": {"source": 2, "target": 4, "weight": 8}, "15": {"source": 2, "target": 39, "weight": 1}, "1": {"source": 4, "target": 5, "weight": 1}, "5": {"source": 14, "target": 15, "weight": 1}, "9": {"source": 23, "target": 4, "weight": 1}}
\ No newline at end of file
diff --git a/JSON-data-files/all_accents_13_sw.json b/JSON-data-files/all_accents_13_sw.json
new file mode 100644
index 0000000..b11644f
--- /dev/null
+++ b/JSON-data-files/all_accents_13_sw.json
@@ -0,0 +1 @@
+{"1": {"id": 1, "name": "Good", "count": 1, "locale": "en", "descriptors": {"600": {"id": 600, "name": "First or other language", "definition": "Indicates a descriptor related to whether this is the speaker\\s first or other language.", "parent": null}}, "predetermined": false}, "2": {"id": 2, "name": "Kiswahili accent", "count": 13, "locale": "en", "descriptors": {"300": {"id": 300, "name": "Supranational region", "definition": "Indicates a geographic region which crosses or overlaps multiple countries.", "parent": 100}}, "predetermined": false}, "3": {"id": 3, "name": "Coastal Swahili", "count": 2, "locale": "en", "descriptors": {"400": {"id": 400, "name": "Subnational region", "definition": "Indicates a geographic region within a national boundary.", "parent": 100}}, "predetermined": false}, "4": {"id": 4, "name": "Fluent", "count": 11, "locale": "en", "descriptors": {"600": {"id": 600, "name": "First or other language", "definition": "Indicates a descriptor related to whether this is the speaker\\s first or other language.", "parent": null}}, "predetermined": false}, "5": {"id": 5, "name": "kiMvita", "count": 2, "locale": "en", "descriptors": {"400": {"id": 400, "name": "Subnational region", "definition": "Indicates a geographic region within a national boundary.", "parent": 100}}, "predetermined": false}, "8": {"id": 8, "name": "Strong", "count": 1, "locale": "en", "descriptors": {"700": {"id": 700, "name": "Accent strength descriptor", "definition": "Indicates a marker of accent strength.", "parent": null}}, "predetermined": false}, "10": {"id": 10, "name": "Kenyan", "count": 2, "locale": "en", "descriptors": {"200": {"id": 200, "name": "Country", "definition": "Indicates a geographic region of a country or nation-state.", "parent": 100}}, "predetermined": false}, "11": {"id": 11, "name": "Lived in area", "count": 1, "locale": "en", "descriptors": {"2400": {"id": 2400, "name": "Linguistic heritage of speaker", "definition": "Indicates something about the 
language acquisition or language immersion of the speaker", "parent": null}}, "predetermined": false}, "12": {"id": 12, "name": "native", "count": 2, "locale": "en", "descriptors": {"600": {"id": 600, "name": "First or other language", "definition": "Indicates a descriptor related to whether this is the speaker\\s first or other language.", "parent": null}}, "predetermined": false}, "14": {"id": 14, "name": "Tanzania", "count": 1, "locale": "en", "descriptors": {"200": {"id": 200, "name": "Country", "definition": "Indicates a geographic region of a country or nation-state.", "parent": 100}}, "predetermined": false}, "15": {"id": 15, "name": "academic", "count": 1, "locale": "en", "descriptors": {"1300": {"id": 1300, "name": "Register", "definition": "Indicates which register the data contributor speaks in.", "parent": null}}, "predetermined": false}, "23": {"id": 23, "name": "Eloquent", "count": 1, "locale": "en", "descriptors": {"1300": {"id": 1300, "name": "Register", "definition": "Indicates which register the data contributor speaks in.", "parent": null}}, "predetermined": false}, "39": {"id": 39, "name": "Arusha", "count": 1, "locale": "en", "descriptors": {"400": {"id": 400, "name": "Subnational region", "definition": "Indicates a geographic region within a national boundary.", "parent": 100}}, "predetermined": false}}
\ No newline at end of file
diff --git a/MCV-get-demographic-details-from-dataset-13-sw.ipynb b/MCV-get-demographic-details-from-dataset-13-sw.ipynb
new file mode 100644
index 0000000..ef78591
--- /dev/null
+++ b/MCV-get-demographic-details-from-dataset-13-sw.ipynb
@@ -0,0 +1,4017 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Working with accent data in the Mozilla Common Voice dataset \n",
+ "\n",
+ "The purpose of this Python Jupyter notebook is to provide some worked examples of how you might explore accent data in the Common Voice dataset. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Index of notebook contents \n",
+ "\n",
+ "To make this notebook easier to navigate, each section is indexed below. \n",
+ "\n",
+ "* [Background information on demographic data in Common Voice](#Background)\n",
+ "* [Preparation steps and importing modules](#PreparationSteps) - including the `requirements.txt` you should run if using this notebook. \n",
+ "* [The Accent and AccentDescriptor classes we will use in the notebook](#Classes)\n",
+ "* [Preparing data from the Common Voice TSV file](#PreparingData)\n",
+ "* [Extracting accent information for data visualisation](#AccentExtraction)\n",
+ "* [Determine which accents are predetermined for selection in the Common Voice profile screen](#PreDetermined)\n",
+ "* [Add Descriptors to each Accent](#Descriptors)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Background information on demographic data in Common Voice \n",
+ "\n",
+ "Before you start working with accent data in Common Voice, there is background information you should know about the data structures in the Common Voice datasets, and how accents have been represented. \n",
+ "\n",
+ "### The ability to choose whether or not to specify demographic information \n",
+ "\n",
+ "Data contributors can contribute voice data to Common Voice with our without logging in to the platform. If a data contributor is not logged in, the utterances they record contain no demographic metadata information, such as the gender, age range or accent of the speaker. If the data contributor _does_ log in, then they can choose whether to specify demographic information in their profile. Part of the demographic information can include specifying which accent(s) they speak with. \n",
+ "\n",
+ "\n",
+ "Since mid 2021, data contributors to the Common Voice dataset have been able to self-specify descriptors for their accents. \n",
+ "\n",
+ "The purpose of this script is to get demographic details from an MCV downloaded dataset. \n",
+ "This informs decision making around, for example, how much of the data in a particular language, has demographic details, and if so, what they are. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "\n",
+ "## Preparation steps and importing the modules we will use \n",
+ "\n",
+ "@TODO \n",
+ "\n",
+ "make a `requirements.txt` file to install all the dependencies. \n",
+ "\n",
+ "* pandas \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imports go here \n",
+ "\n",
+ "# io \n",
+ "import io\n",
+ "\n",
+ "# os for file handling \n",
+ "import os \n",
+ "\n",
+ "# pandas \n",
+ "import pandas as pd\n",
+ "\n",
+ "# regular expressions \n",
+ "import re\n",
+ "\n",
+ "# json \n",
+ "import json\n",
+ "\n",
+ "# string handling for isascii\n",
+ "import string \n",
+ "\n",
+ "# pretty print \n",
+ "import pprint\n",
+ "pp = pprint.PrettyPrinter(indent=4)\n",
+ "\n",
+ "# reload = because I'm developing the CVaccents module as I go, I want to reload it each time so it doesn't cache\n",
+ "from importlib import reload\n",
+ "\n",
+ "# copy = for using deepcopy()\n",
+ "import copy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set the version number so that we can differentiate files, such as JSON, that are produced. \n",
+ "\n",
+ "dataset_release_version = 13\n",
+ "JSON_data_dir = 'JSON-data-files'\n",
+ "language = 'sw'\n",
+ "\n",
+ "# specify the filenames for the JSON output\n",
+ "\n",
+ "accents_filename = JSON_data_dir + '/' + 'all_accents' + '_' + str(dataset_release_version) + '_' + language + '.json'\n",
+ "links_filename = JSON_data_dir + '/' + 'accent_edges' + '_' + str(dataset_release_version) + '_' + language + '.json'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Accent, AccentDescriptor and AccentCollection classes used for manipulation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Accent class and AccentDescriptor class \n",
+ "\n",
+ "# these are classes I defined for accent handling\n",
+ "import cvaccents as cva\n",
+ "\n",
+ "# do an explicit reload as I'm still working on the classes \n",
+ "#reload(cva)\n",
+ "\n",
+ "# prove that my DocStrings are useful\n",
+ "# they are good, so I am suppressing output while I work through the rest of the doc. \n",
+ "\n",
+ "#print('Module docstring is: \\n', cva.__doc__)\n",
+ "#print('---')\n",
+ "#print('Accent docstring is: \\n', cva.Accent.__doc__)\n",
+ "#print('---')\n",
+ "#print('AccentDescriptor docstring is: \\n', cva.AccentDescriptor.__doc__)\n",
+ "#print('---')\n",
+ "#print('AccentCollection docstring is: \\n', cva.AccentDescriptor.__doc__)\n",
+ "#print('---')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Preparing the data from the Common Voice dataset TSV file\n",
+ "\n",
+ "Here, we extract data from the TSV file, and use `pandas` to perform manipulations on the dataset, such as removing rows that do not contain accent metadata. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_1737205/2438915031.py:9: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(filePath, sep='\\t')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# You will need to download the CV corpus somewhere, or at least have the validated.tsv file available. \n",
+ "\n",
+ "# I have found that the aria2c downloader works very well for large downloads. \n",
+ "# https://aria2.github.io/\n",
+ "\n",
+ "filePath = '../cv-datasets/cv-corpus-13.0-2023-03-09/sw/validated.tsv'\n",
+ "\n",
+ "# put it into a DataFrame \n",
+ "df = pd.read_csv(filePath, sep='\\t')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age',\n",
+ " 'gender', 'accents', 'variant', 'locale', 'segment'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 94,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['client_id', 'age', 'gender', 'accents', 'variant'], dtype='object')"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We don't want all the columns, as some of them are not useful for the accent analysis \n",
+ "# Drop the columns we don't want \n",
+ "\n",
+ "df.drop(labels=['path', 'sentence', 'up_votes', 'down_votes', 'segment', 'locale'], axis='columns', inplace=True)\n",
+ "df.columns\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "231142"
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1325"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rows that have accent metadata \n",
+ "len(df[df['accents'].notna()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " client_id | \n",
+ " age | \n",
+ " gender | \n",
+ " accents | \n",
+ " variant | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0133d8ddf5c1a3c678fde017e0b07d2835bfd707d5b3ec... | \n",
+ " twenties | \n",
+ " female | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 01c95772efd3fbe4a1122206c7474c77ed6591c8c9fb00... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 023711185d4404ff398c2697f2e72868d1ecf69a92b581... | \n",
+ " twenties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0244639ffd7ec755a01b21ea204735ca3c44443e9cf46c... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 04e78dc3038488a080fe3c76c28602d0db9e4eec2efbf0... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 231137 | \n",
+ " 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... | \n",
+ " thirties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 231138 | \n",
+ " 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... | \n",
+ " thirties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 231139 | \n",
+ " 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... | \n",
+ " thirties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 231140 | \n",
+ " 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... | \n",
+ " thirties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 231141 | \n",
+ " 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... | \n",
+ " thirties | \n",
+ " male | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
231142 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " client_id age gender \\\n",
+ "0 0133d8ddf5c1a3c678fde017e0b07d2835bfd707d5b3ec... twenties female \n",
+ "1 01c95772efd3fbe4a1122206c7474c77ed6591c8c9fb00... NaN NaN \n",
+ "2 023711185d4404ff398c2697f2e72868d1ecf69a92b581... twenties male \n",
+ "3 0244639ffd7ec755a01b21ea204735ca3c44443e9cf46c... NaN NaN \n",
+ "4 04e78dc3038488a080fe3c76c28602d0db9e4eec2efbf0... NaN NaN \n",
+ "... ... ... ... \n",
+ "231137 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... thirties male \n",
+ "231138 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... thirties male \n",
+ "231139 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... thirties male \n",
+ "231140 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... thirties male \n",
+ "231141 457b3a2570720101c75d297cde767487e8f0a1a7f714cb... thirties male \n",
+ "\n",
+ " accents variant \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
+ "... ... ... \n",
+ "231137 NaN NaN \n",
+ "231138 NaN NaN \n",
+ "231139 NaN NaN \n",
+ "231140 NaN NaN \n",
+ "231141 NaN NaN \n",
+ "\n",
+ "[231142 rows x 5 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1325"
+ ]
+ },
+ "execution_count": 99,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# remove all the rows where accents are not given (NaN)\n",
+ "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html\n",
+ "# DataFrame.dropna(*, axis=0, how=_NoDefault.no_default, thresh=_NoDefault.no_default, subset=None, inplace=False)\n",
+ "\n",
+ "df.dropna(axis='index', how='any', subset='accents', inplace=True)\n",
+ "len(df)\n",
+ "\n",
+ "# this matches the above figure for rows that have accent metadata, so it's a good cross-check"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "26"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# number of unique contributors to the dataset \n",
+ "len(df['client_id'].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "26"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Now that the rows without an accent value have been removed, \n",
+ "# we want to deduplicate the speaker_id values - because one speaker can speak many utterances\n",
+ "# and we only want to record one accent per speaker \n",
+ "# and we should end up with the # of rows in the cell above \n",
+ "\n",
+ "\n",
+ "# One of the reasons we try and reduce the size of the dataframe \n",
+ "# first is because this operation is more efficient on a smaller dataframe \n",
+ "# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html\n",
+ "# DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)\n",
+ "\n",
+ "df.drop_duplicates(subset='client_id', keep='first', inplace=True)\n",
+ "len(df)\n",
+ "\n",
+ "# This length should match the length above "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Extracting the accent data for visualisation \n",
+ "\n",
+ "We have already: \n",
+ "\n",
+ "* Removed any rows where accent data was not available = `NaN`\n",
+ "* De-duplicated based on the `client_id`\n",
+ "\n",
+ "So now, we need to extract all the self-styled accents for analysis. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "# They are already unique so we don't need the `.unique` method\n",
+ "kiswahili_accents = df['accents']\n",
+ "\n",
+ "print(len(kiswahili_accents))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\"\"\"\n",
+ "\n",
+ "The list english_accents_list is a list that contains the ORIGINAL accent entries for each speaker. \n",
+ "In this list, the accents for each speaker are represented as a SINGLE STRING, NOT as a list of strings. \n",
+ "\n",
+ "So, we want to turn this into a LIST of LISTS OF STRINGS, to make it easier for doing data cleaning. \n",
+ "Each individual string represents one accent descriptor given by a speaker, \n",
+ "and the list which contains those strings is the grouping of accent descriptors for that speaker. \n",
+ "\n",
+ "We need to preserve the association between accent descriptors - co-references - for later data visualisation. \n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "\n",
+ "kiswahili_accents_list = [] \n",
+ "\n",
+ "for idx, accent_string in list(enumerate(kiswahili_accents)): \n",
+ " \n",
+ " # this regex is from \n",
+ " # https://stackoverflow.com/questions/26633452/how-to-split-by-commas-that-are-not-within-parentheses\n",
+ " accent_list=re.split(',\\s*(?![^()]*\\))', accent_string)\n",
+ " processed_accent_list = accent_list # we don't want to modify a list we're iterating over\n",
+ " \n",
+ " #print ('accent_string is: ', accent_string, ' and accent_list is: ', accent_list)\n",
+ " for idx_a, accent in list(enumerate(accent_list)): \n",
+ " \n",
+ " #print ('idx_a is: ', idx_a, ' and accent is: ', accent, ' and type of accent is: ', type(accent))\n",
+ " # Trim any whitespace off the elements, because this makes matching on strings harder later on\n",
+ " # Strings are immutable in Python, so we have to create another string\n",
+ " processed_accent_list.remove(accent)\n",
+ " stripped_accent = accent.strip() \n",
+ " processed_accent_list.append(stripped_accent)\n",
+ " \n",
+ " # Check for any empty strings and remove them - likely regex artefacts\n",
+ " if not accent: \n",
+ " processed_accent_list.remove(accent)\n",
+ " \n",
+ " # Check for any non-Latin characters that we may want to investigate \n",
+ " # For example, if one of the accents is garbage or deliberate rubbish\n",
+ " \n",
+ " if not accent.isascii(): \n",
+ " print('flagging that accent: ', accent, ' is not ASCII encoded, may be in another language')\n",
+ " \n",
+ " kiswahili_accents_list.append(processed_accent_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(kiswahili_accents_list))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 ['Good']\n",
+ "1 ['swahili accent', 'coastal swahili accent']\n",
+ "2 ['Fluent', 'Mvita']\n",
+ "3 ['no sherapping']\n",
+ "4 ['Kiswahili']\n",
+ "5 ['Strong kiswahili accent']\n",
+ "6 ['Kimvita']\n",
+ "7 ['Typical Kenyan Accent']\n",
+ "8 ['Shaped by where i have lived']\n",
+ "9 ['native']\n",
+ "10 ['Lafudhi yangu ni kiswahili cha kawaida ambacho watanzania wengi wanakizungumza. Ni pamoja na kile ambacho kinafundishwa katika shule za msingi na sekondari ili kuleta maana katika mambo mbalimbali na namna ya kuzungumza kwa ujumla.']\n",
+ "11 ['Fluent Kiswahili']\n",
+ "12 ['Fluent in Swahili']\n",
+ "13 ['fluent in kiswahili']\n",
+ "14 ['Kenyan']\n",
+ "15 ['Eloquent and fluent.']\n",
+ "16 ['Fluent swahili']\n",
+ "17 ['kiswahili cha mkoa wa pwani']\n",
+ "18 ['Fluent Kiswahili']\n",
+ "19 ['Raisi wa Kenya alitoa hutoba yake juma tatu iliyopita']\n",
+ "20 ['Fluent swahili']\n",
+ "21 ['native']\n",
+ "22 ['Fluent swahili']\n",
+ "23 ['Fluent accent']\n",
+ "24 ['Fluent', 'Fluent swahili']\n",
+ "25 ['My accept is to recognize and record different sounds and voices', 'Especial in swahili', 'Normal Arusha accent']\n"
+ ]
+ }
+ ],
+ "source": [
+ "for idx, accent_list in list(enumerate(kiswahili_accents_list)): \n",
+ " print(idx, accent_list)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Add descriptors to each accent\n",
+ "\n",
+ "In this section, I apply a set of categories to the accent data. \n",
+ "\n",
+ "**I use a rule-based approach for reproduceability.** \n",
+ "This could have been done in a spreadsheet, but I'm working in Python so I chose to do it that way. \n",
+ "This also makes it easier for others applying this work to other languages or to other versions of the dataset. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Expand accents that have multiple descriptors in their .name element \n",
+ "\n",
+ "Here, we \"break apart\" accents that have multiple descriptors in the .name element into **multiple** accents. This is done _programmatically_ to aid in reproduceability. \n",
+ "\n",
+ "Some examples of this that I found during analysis of Kiswahili were; \n",
+ "\n",
+ "* 'My accent is the common Swahili that many Tanzanians speak. It includes what is taught in primary and secondary schools to make sense in various aspects and how to talk in general.' - this contains a geographic descriptor, the 'common' descriptor and the 'taught in school' descriptor. \n",
+ "\n",
+ "* \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Modify the accent list to expand descriptors while preserving accent co-references\n",
+ "\n",
+ "Some of the given accent descriptors contain multiple descriptors in one string. Here, I expand them while maintain co-references. \n",
+ "\n",
+ "For example: \n",
+ "\n",
+ "* `slight Brooklyn accent` - contains both a City-based descriptor and an accent strength descriptor. \n",
+ "* `United States English combined with European English` - contains both a national descriptor and a supranational descriptor. \n",
+ "\n",
+ "\n",
+ "\n",
+ "It's easier to do this before we create Accent and AccentDescriptor objects. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# helper function for the below \n",
+ "\n",
+ "def update_list_coreference(list_to_be_updated, old_entry, new_entry):\n",
+ " # the accent list is a list of lists so we need to iterate through each one to find the element to update. \n",
+ " \n",
+ " for idx, accent_list in list(enumerate(list_to_be_updated)):\n",
+ " \n",
+ " #print(idx, '- old accent_list is: ', accent_list)\n",
+ " new_accent_list = accent_list\n",
+ " \n",
+ " for accent in accent_list: \n",
+ " match = False\n",
+ " #print ('accent is: ', accent, ' and old_entry is: ', old_entry)\n",
+ " \n",
+ " if accent == old_entry:\n",
+ " match = True \n",
+ " #print('accent is: ', accent, ' and old_entry is: ', old_entry, 'and new_entry is: ', new_entry)\n",
+ " #print('accent is: ', accent, ' which matches ', old_entry)\n",
+ " new_accent_list.remove(old_entry)\n",
+ " \n",
+ " for entry in new_entry: # there may be more than one\n",
+ " new_accent_list.append(entry)\n",
+ " #print('appending new entry: ', entry)\n",
+ " \n",
+ " \n",
+ " if match: \n",
+ " print ('processed ', old_entry, ' to be ', new_entry, ' and the old accent list is: ', accent_list, ' and the new accent list is: ', new_accent_list)\n",
+ "\n",
+ " # recreate the list from keys, this removes duplicates\n",
+ " # for example, a duplicate may be created due to normalisation or merger of accents \n",
+ " # run through a filter to remove empty list elements\n",
+ " new_accent_list = list(dict.fromkeys(filter(None, new_accent_list)))\n",
+ " \n",
+ " #print('new_accent_list is: ', new_accent_list)\n",
+ " #print('---')\n",
+ " \n",
+ " #print('removing: ', accent_list)\n",
+ " list_to_be_updated.remove(accent_list)\n",
+ " #print('appending: ', newlist)\n",
+ " list_to_be_updated.append(new_accent_list)\n",
+ " \n",
+ " \n",
+ " return(list_to_be_updated)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "processed Lafudhi yangu ni kiswahili cha kawaida ambacho watanzania wengi wanakizungumza. Ni pamoja na kile ambacho kinafundishwa katika shule za msingi na sekondari ili kuleta maana katika mambo mbalimbali na namna ya kuzungumza kwa ujumla. to be ['Kiswahili accent', 'Tanzania', 'academic'] and the old accent list is: ['Kiswahili accent', 'Tanzania', 'academic'] and the new accent list is: ['Kiswahili accent', 'Tanzania', 'academic']\n",
+ "processed Strong kiswahili accent to be ['Kiswahili accent', 'Strong'] and the old accent list is: ['Kiswahili accent', 'Strong'] and the new accent list is: ['Kiswahili accent', 'Strong']\n",
+ "processed Fluent Kiswahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent Kiswahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent in Swahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed fluent in kiswahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent swahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent swahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent swahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Kiswahili accent', 'Fluent'] and the new accent list is: ['Kiswahili accent', 'Fluent']\n",
+ "processed Fluent swahili to be ['Kiswahili accent', 'Fluent'] and the old accent list is: ['Fluent', 'Kiswahili accent', 'Fluent'] and the new accent list is: ['Fluent', 'Kiswahili accent', 'Fluent']\n",
+ "processed Eloquent and fluent. to be ['Eloquent', 'Fluent'] and the old accent list is: ['Eloquent', 'Fluent'] and the new accent list is: ['Eloquent', 'Fluent']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# NORMALISATION INTO MULTIPLE ACCENTS \n",
+ "\n",
+ "# 'Lafudhi yangu ni kiswahili cha kawaida ambacho watanzania wengi wanakizungumza. Ni pamoja na kile ambacho kinafundishwa katika shule za msingi na sekondari ili kuleta maana katika mambo mbalimbali na namna ya kuzungumza kwa ujumla.\n",
+ "# This translates to: \n",
+ "# My accent is the common Swahili that many Tanzanians speak. It includes what is taught in primary and secondary schools to make sense in various aspects and how to talk in general. \n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Lafudhi yangu ni kiswahili cha kawaida ambacho watanzania wengi wanakizungumza. Ni pamoja na kile ambacho kinafundishwa katika shule za msingi na sekondari ili kuleta maana katika mambo mbalimbali na namna ya kuzungumza kwa ujumla.', \n",
+ " ['Kiswahili accent', 'Tanzania', 'academic'])\n",
+ "\n",
+ " \n",
+ "# 'Strong kiswahili accent' - separate into 'strong' and 'Kiswahili accent'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Strong kiswahili accent', \n",
+ " ['Kiswahili accent', 'Strong'])\n",
+ "\n",
+ "# Fluent Kiswahili - separate into 'Fluent' and 'Kiswahili accent' \n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Fluent Kiswahili', \n",
+ " ['Kiswahili accent', 'Fluent'])\n",
+ " \n",
+ "# 'Fluent in Swahili' - separate into 'Fluent' and 'Kiswahili accent' \n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Fluent in Swahili', \n",
+ " ['Kiswahili accent', 'Fluent']) \n",
+ "\n",
+ "# 'Fluent in kiswahili' - separate into 'Fluent' and 'Kiswahili accent' \n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'fluent in kiswahili', \n",
+ " ['Kiswahili accent', 'Fluent'])\n",
+ " \n",
+ "# 'Fluent swahili' - separate into 'Fluent' and 'Kiswahili accent' \n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Fluent swahili', \n",
+ " ['Kiswahili accent', 'Fluent'])\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ "# 'Eloquent and fluent.' - separate into 'Eloquent' and 'Fluent'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Eloquent and fluent.', \n",
+ " ['Eloquent', 'Fluent']) \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Normalize closely related accent descriptors - merge them \n",
+ "\n",
+ "There are several closely related accent descriptors, and here I merge them. \n",
+ "\n",
+ "The principles I use are: \n",
+ " \n",
+ "* Accents are merged where there are spelling variations \n",
+ "* Accents are merged where the accent has a region descriptor with our without 'accent' - such as \"French\" and \"French accent\"\n",
+ "* Where a country or language descriptor and demonym are closely equivalent - \"Germany\" and \"German\"\n",
+ "\n",
+ "Accents are not merged where: \n",
+ "\n",
+ "* One accent descriptor is more granular than another - \"London\" and \"South London\" are not merged. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "processed Kiswahili to be ['Kiswahili accent'] and the old accent list is: ['Kiswahili accent'] and the new accent list is: ['Kiswahili accent']\n",
+ "processed swahili accent to be ['Kiswahili accent'] and the old accent list is: ['coastal swahili accent', 'Kiswahili accent'] and the new accent list is: ['coastal swahili accent', 'Kiswahili accent']\n",
+ "processed Especial in swahili to be ['Kiswahili accent'] and the old accent list is: ['My accept is to recognize and record different sounds and voices', 'Normal Arusha accent', 'Kiswahili accent'] and the new accent list is: ['My accept is to recognize and record different sounds and voices', 'Normal Arusha accent', 'Kiswahili accent']\n",
+ "processed coastal swahili accent to be ['Coastal Swahili'] and the old accent list is: ['Kiswahili accent', 'Coastal Swahili'] and the new accent list is: ['Kiswahili accent', 'Coastal Swahili']\n",
+ "processed kiswahili cha mkoa wa pwani to be ['Coastal Swahili'] and the old accent list is: ['Coastal Swahili'] and the new accent list is: ['Coastal Swahili']\n",
+ "processed Mvita to be ['kiMvita'] and the old accent list is: ['Fluent', 'kiMvita'] and the new accent list is: ['Fluent', 'kiMvita']\n",
+ "processed Kimvita to be ['kiMvita'] and the old accent list is: ['kiMvita'] and the new accent list is: ['kiMvita']\n",
+ "processed Normal Arusha accent to be ['Arusha'] and the old accent list is: ['My accept is to recognize and record different sounds and voices', 'Kiswahili accent', 'Arusha'] and the new accent list is: ['My accept is to recognize and record different sounds and voices', 'Kiswahili accent', 'Arusha']\n",
+ "processed Typical Kenyan Accent to be ['Kenyan'] and the old accent list is: ['Kenyan'] and the new accent list is: ['Kenyan']\n",
+ "processed Fluent accent to be ['Fluent'] and the old accent list is: ['Fluent'] and the new accent list is: ['Fluent']\n",
+ "processed Shaped by where i have lived to be ['Lived in area'] and the old accent list is: ['Lived in area'] and the new accent list is: ['Lived in area']\n",
+ "processed no sherapping to be and the old accent list is: [] and the new accent list is: []\n",
+ "processed Raisi wa Kenya alitoa hutoba yake juma tatu iliyopita to be and the old accent list is: [] and the new accent list is: []\n",
+ "processed My accept is to recognize and record different sounds and voices to be and the old accent list is: ['Kiswahili accent', 'Arusha'] and the new accent list is: ['Kiswahili accent', 'Arusha']\n"
+ ]
+ }
+ ],
+ "source": [
+ "## There will be others as we put them into objects / classes\n",
+ "\n",
+ "# Kiswahili accent - canonical is 'Kiswahili accent'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Kiswahili',\n",
+ " ['Kiswahili accent'])\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'kiswahili',\n",
+ " ['Kiswahili accent'])\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Swahili',\n",
+ " ['Kiswahili accent'])\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'swahili',\n",
+ " ['Kiswahili accent'])\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'swahili accent',\n",
+ " ['Kiswahili accent'])\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Especial in swahili',\n",
+ " ['Kiswahili accent'])\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "# Coastal Swahili - canonical is 'Coastal Swahili'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'coastal swahili accent',\n",
+ " ['Coastal Swahili'])\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'kiswahili cha mkoa wa pwani',\n",
+ " ['Coastal Swahili'])\n",
+ "\n",
+ "\n",
+ "\n",
+ "# kiMvita accent - canonical is 'kiMvita'\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'mvita',\n",
+ " ['kiMvita'])\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Mvita',\n",
+ " ['kiMvita'])\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'kimvita',\n",
+ " ['kiMvita'])\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Kimvita',\n",
+ " ['kiMvita'])\n",
+ "\n",
+ "\n",
+ "# Arusha accent - canonical is 'Arusha'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Normal Arusha accent',\n",
+ " ['Arusha'])\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "# Kenyan accent - canonical is 'Kenyan'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Typical Kenyan Accent',\n",
+ " ['Kenyan'])\n",
+ "\n",
+ "\n",
+ "# Fluent - canonical is 'Fluent'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Fluent accent',\n",
+ " ['Fluent'])\n",
+ "\n",
+ "\n",
+ "# 'Shaped by where i have lived' - canonical is 'Lived in area'\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Shaped by where i have lived',\n",
+ " ['Lived in area'])\n",
+ "\n",
+ "############### accents to disregard\n",
+ "\n",
+ "\n",
+ "# just practicing - canonical is 'DISREGARD'\n",
+ "\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'no sherapping',\n",
+ " '')\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'Raisi wa Kenya alitoa hutoba yake juma tatu iliyopita',\n",
+ " '')\n",
+ "kiswahili_accents_list = update_list_coreference(kiswahili_accents_list, \n",
+ " 'My accept is to recognize and record different sounds and voices',\n",
+ " '')\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ ['Good'],\n",
+ " ['Kiswahili accent', 'Coastal Swahili'],\n",
+ " ['Fluent', 'kiMvita'],\n",
+ " [],\n",
+ " ['Kiswahili accent'],\n",
+ " ['Kiswahili accent', 'Strong'],\n",
+ " ['kiMvita'],\n",
+ " ['Kenyan'],\n",
+ " ['Lived in area'],\n",
+ " ['native'],\n",
+ " ['Kiswahili accent', 'Tanzania', 'academic'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kenyan'],\n",
+ " ['Eloquent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Coastal Swahili'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " [],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['native'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Fluent'],\n",
+ " ['Fluent', 'Kiswahili accent'],\n",
+ " ['Kiswahili accent', 'Arusha']]\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(kiswahili_accents_list)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Extract unique accents from the list of normalised accents into a Dict of Accent objects for easier manipulation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# build a dict of each unique accent using an Accent object for each object. \n",
+ "\n",
+ "ratio_display = 120 # to stop the browser crashing \n",
+ "\n",
+ "AccentDict = {}\n",
+ "i = 0; \n",
+ "\n",
+ "# the kiswahili_accents_list is now normalised, merged etc so this is straightforward \n",
+ "for accent_list in kiswahili_accents_list:\n",
+ " for accent in accent_list: \n",
+ " \n",
+ " i +=1\n",
+ " match = False \n",
+ " count = 0\n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('')\n",
+ " #print('---')\n",
+ " #print('now processing: ', accent, ' - ', i)\n",
+ " #print('---')\n",
+ " \n",
+ " # is this accent in our dict - if not, add it in \n",
+ " \n",
+ " for item in AccentDict.items() : # Each item should be an Accent object \n",
+ " \n",
+ " # (self, id=0, name=\"Accent Name\", count=0, locale=None, descriptors=None, predetermined=False):\n",
+ " #pp.pprint(item[1].__str__())\n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('item is: ', item)\n",
+ " #print(type(item))\n",
+ " #print('now checking match for: item:', item[1], ' and accent: ', accent)\n",
+ " \n",
+ " if (item[1].name == accent) : # update the count\n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('---')\n",
+ " #print('match is True')\n",
+ " #print('---')\n",
+ " \n",
+ " match = True \n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('accent count was: ', item[1].count)\n",
+ " \n",
+ " # update the count of the accent \n",
+ " item[1].count+=1\n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('accent count is now: ', item[1].count)\n",
+ " \n",
+ " \n",
+ " # this match loop has to be outside the for: loop above \n",
+ " # because if we add items to the dict inside the loop\n",
+ " # then it will not run - because there are zero items in the dict to begin with \n",
+ " \n",
+ " if (not match) : \n",
+ " \n",
+ " # (self, id=0, name=\"Accent Name\", count=0, locale=None, descriptors=None, predetermined=False):\n",
+ " AccentDict[i] = cva.Accent(i, accent, 1, 'en', None, False) \n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# do an explicit reload as I'm still working on the classes \n",
+ "#reload(cva)\n",
+ "\n",
+ "all_accents = cva.AccentCollection(AccentDict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "13\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(all_accents.total())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 113,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'id is 1, name is Good, count is 1, locale is en, descriptors are None, predetermined is False. id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are None, predetermined is False. id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are None, predetermined is False. id is 4, name is Fluent, count is 11, locale is en, descriptors are None, predetermined is False. id is 5, name is kiMvita, count is 2, locale is en, descriptors are None, predetermined is False. id is 8, name is Strong, count is 1, locale is en, descriptors are None, predetermined is False. id is 10, name is Kenyan, count is 2, locale is en, descriptors are None, predetermined is False. id is 11, name is Lived in area, count is 1, locale is en, descriptors are None, predetermined is False. id is 12, name is native, count is 2, locale is en, descriptors are None, predetermined is False. id is 14, name is Tanzania, count is 1, locale is en, descriptors are None, predetermined is False. id is 15, name is academic, count is 1, locale is en, descriptors are None, predetermined is False. id is 23, name is Eloquent, count is 1, locale is en, descriptors are None, predetermined is False. id is 39, name is Arusha, count is 1, locale is en, descriptors are None, predetermined is False.'"
+ ]
+ },
+ "execution_count": 113,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "all_accents.__str__()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 4, name is Fluent, count is 11, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 12, name is native, count is 2, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 5, name is kiMvita, count is 2, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 10, name is Kenyan, count is 2, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 15, name is academic, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 14, name is Tanzania, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 8, name is Strong, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 11, name is Lived in area, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 1, name is Good, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 23, name is Eloquent, count is 1, locale is en, descriptors are None, predetermined is False.\n",
+ "id is 39, name is Arusha, count is 1, locale is en, descriptors are None, predetermined is False.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# do an explicit reload as I'm still working on the classes \n",
+ "#reload(cva)\n",
+ "\n",
+ "all_accents_sortedByCount = all_accents.sortByCount(reverse=True)\n",
+ "\n",
+ "for accent in all_accents_sortedByCount.items(): \n",
+ " print(accent[1].__str__())\n",
+ " \n",
+ "# now I am cross-checking to see if there are any other duplicates or accents that should be merged\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## Label the accents that were pre-determined \n",
+ "\n",
+ "Since its inception, Mozilla Common Voice has enabled data contributors to enter demographic information such as age, gender and accent. These associations are not validated in any way, and we don't have any indicator of how accurate they are. Accent _used_ to be represented as an a priori drop-down list, which the contributor could select from. From Common Voice v10, the data contributor can **self-describe** their accent, however, the previous accent list is still presented (so may be more frequently chosen by the data contributor). We need to be able to distinguish these accents visually to help with the exploration. \n",
+ "\n",
+ "```\n",
+ "\"splits\": {\n",
+ " \"accent\": {\n",
+ " \"\": 0.51,\n",
+ " \"canada\": 0.03,\n",
+ " \"england\": 0.08,\n",
+ " \"us\": 0.23,\n",
+ " \"indian\": 0.07,\n",
+ " \"australia\": 0.03,\n",
+ " \"malaysia\": 0,\n",
+ " \"newzealand\": 0.01,\n",
+ " \"african\": 0.01,\n",
+ " \"ireland\": 0.01,\n",
+ " \"philippines\": 0,\n",
+ " \"singapore\": 0,\n",
+ " \"scotland\": 0.02,\n",
+ " \"hongkong\": 0,\n",
+ " \"bermuda\": 0,\n",
+ " \"southatlandtic\": 0,\n",
+ " \"wales\": 0,\n",
+ " \"other\": 0.01\n",
+ " },\n",
+ "\n",
+ "```\n",
+ "\n",
+ "The `cv-datasets` splits above have labels for the accents that don't actually match the accent name in the data. So we need to specify the accents that are pre-determined. This is how they appear to the data contributor filling out their profile at: [https://commonvoice.mozilla.org/en/profile/info](https://commonvoice.mozilla.org/en/profile/info)\n",
+ "\n",
+ "\n",
+ "![Accents as specified on Mozilla Common Voice profile](cv-profile-specify-accent.png)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create a list of the pre-existing accents \n",
+ "# this is how they are given in the dataset. \n",
+ "\n",
+ "# TODO: for better maintainability, move this to a list of accents for each language, \n",
+ "# that can be updated in a separate file, rather than specified here in an ad hoc way. \n",
+ "\n",
+ "predetermined_accents_list = ['United States English', \n",
+ " 'England English', \n",
+ " 'India and South Asia (India, Pakistan, Sri Lanka)', \n",
+ " 'Canadian English', \n",
+ " 'Australian English', \n",
+ " 'Southern African (South Africa, Zimbabwe, Namibia)', \n",
+ " 'Irish English', \n",
+ " 'Scottish English', \n",
+ " 'New Zealand English', \n",
+ " 'Hong Kong English', \n",
+ " 'Filipino', \n",
+ " 'Malaysian English', \n",
+ " 'Singaporean English', \n",
+ " 'Welsh English', \n",
+ " 'West Indies and Bermuda (Bahamas, Bermuda, Jamaica, Trinidad)', \n",
+ " 'South Atlantic (Falkland Islands, Saint Helena)']\n",
+ "\n",
+ "\n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1, )\n",
+ "1\n",
+ "(2, )\n",
+ "2\n",
+ "(3, )\n",
+ "3\n",
+ "(4, )\n",
+ "4\n",
+ "(5, )\n",
+ "5\n",
+ "(8, )\n",
+ "8\n",
+ "(10, )\n",
+ "10\n",
+ "(11, )\n",
+ "11\n",
+ "(12, )\n",
+ "12\n",
+ "(14, )\n",
+ "14\n",
+ "(15, )\n",
+ "15\n",
+ "(23, )\n",
+ "23\n",
+ "(39, )\n",
+ "39\n",
+ "id is 1, name is Good, count is 1, locale is en, descriptors are None, predetermined is False. id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are None, predetermined is False. id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are None, predetermined is False. id is 4, name is Fluent, count is 11, locale is en, descriptors are None, predetermined is False. id is 5, name is kiMvita, count is 2, locale is en, descriptors are None, predetermined is False. id is 8, name is Strong, count is 1, locale is en, descriptors are None, predetermined is False. id is 10, name is Kenyan, count is 2, locale is en, descriptors are None, predetermined is False. id is 11, name is Lived in area, count is 1, locale is en, descriptors are None, predetermined is False. id is 12, name is native, count is 2, locale is en, descriptors are None, predetermined is False. id is 14, name is Tanzania, count is 1, locale is en, descriptors are None, predetermined is False. id is 15, name is academic, count is 1, locale is en, descriptors are None, predetermined is False. id is 23, name is Eloquent, count is 1, locale is en, descriptors are None, predetermined is False. id is 39, name is Arusha, count is 1, locale is en, descriptors are None, predetermined is False.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# use the predetermined_accents_list to populate the 'predetermined_status' attribute of each Accent object \n",
+ "# to do this we use a method on the AccentCollection class\n",
+ "\n",
+ "import cvaccents as cva\n",
+ "#reload(cva)\n",
+ "\n",
+ "all_accents.updatePredeterminedStatus(predetermined_accents_list, True)\n",
+ "\n",
+ "print(all_accents)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "\n",
+ "## Create Accent Descriptors and add them to each Accent \n",
+ "\n",
+ "Each Accent can have multiple Accent Descriptors. \n",
+ "\n",
+ "For example the accent `Pronounced German` contains both a _national regional descriptor_ and an _accent strength descriptor_. \n",
+ "\n",
+ "I have used the following principles for Accent Descriptors for English. This should be considered a CodeBook. \n",
+ "\n",
+ "### Geographic Regional Descriptors \n",
+ "\n",
+ "Regional descriptors are where the accent has been specified with reference to a geographic region. \n",
+ "\n",
+ "* `Geographic Region` \n",
+ "\n",
+ "Within this Category, there are several sub-categories: \n",
+ "\n",
+ "* `Country Descriptor` - where the descriptor is a country or a nation-state. \n",
+ "* `Supranational region descriptor` - where the descriptor is a geographic region that crosses or overlaps multiple countries. An example would be `Slavic`, which refers to an [ethno-linguistic group](https://en.wikipedia.org/wiki/Slavic_languages) that covers several countries in Eastern Europe. \n",
+ "* `Subnational region descriptor` - where the descriptor is a geographic region that refers to a region within a country's national boundary. An example would be `Midwestern United States`. \n",
+ "* `City descriptor` - where the descriptor is a geographic region that refers to a city, town or municipality. An example would be `New York City` or `London`. \n",
+ "\n",
+ "One choice I have made here is not to represent areas _within_ cities using a separate Accent Descriptor. Examples here would be `Brooklyn` or `East London` - they have been classified as cities. This is because there are so few of them that it doesn't change the analysis significantly. \n",
+ "\n",
+ "### First or other language descriptor\n",
+ "\n",
+ "This refers to Accent Descriptors where the data contributor refers to their accent using a descriptor such as `non-native` or `native speaker`. This is sometimes referred to as `first language (L1)` or `second language (L2)`. Although this _may_ be used to refer to the data contributor's _level of fluency_ in a language, I've chosen not to refer to this as a _level of fluency_ - because even though someone speaks a language as a second or other language, this _does not imply_ their level of fluency specifically. One could speak Mandarin as a second language, but be highly fluent. One could speak French as a second language and be less than proficient. \n",
+ "\n",
+ "* `First or other language` \n",
+ "\n",
+ "### Accent strength descriptor \n",
+ "\n",
+ "This refers to Accent Descriptors where the data contributor refers to their accent using a marker of the strength of the accent. Examples include `pronounced`, `90%` or `slight`. \n",
+ "\n",
+ "* `Accent strength descriptor` \n",
+ "\n",
+ "### Vocal quality descriptor \n",
+ "\n",
+ "This refers to Accent Descriptors where the data contributor refers to their accent using words to describe aspects of their voice that are subjective and qualitative - such as `sultry` or `sassy`. \n",
+ "\n",
+ "* `Vocal quality descriptor` \n",
+ "\n",
+ "TODO: is `quality` the correct word here? \n",
+ "\n",
+ "## Phonetic changes \n",
+ "\n",
+ "This category refers to Accent Descriptors which describe a particular phonetic change. This is used as a parent category to group these Accent Descriptors. \n",
+ "\n",
+ "* `Phonetic changes`\n",
+ "\n",
+ "### Specific phonetic changes \n",
+ "\n",
+ "There are several phonetic changes that are linguistic markers for accent difference. \n",
+ "\n",
+ "* `Specified phonetic change` is applied when the Accent Descriptor itself specifies the type of phonetic change. `cot-caught merger` is an example. \n",
+ "\n",
+ "* `Rhoticity` is applied when the Accent Descriptor is describing how `/r/` and related phonemes are pronounced.\n",
+ "\n",
+ "* `Inflection` is applied when the Accent Descriptor is describing an inflection change. \n",
+ "\n",
+ "## Register \n",
+ "\n",
+ "Although the Mozilla Common Voice data uses _elicited speech_ - utterances spoken from given text prompts, people can speak in a range of _registers_. A register is generally the level of formality of speech - such as `formal`, or `educated` or `slang`. It may indicate socio-economic heritage of the speaker. This category captures Accent Descriptors that describe an accent in this way. \n",
+ "\n",
+ "* `Register` \n",
+ "\n",
+ "## Named accent \n",
+ "\n",
+ "Some accents, such as `Geordie` or `Scouse` have a related geographical region descriptor - North East England, and Liverpool respectively, but ones such as `Received Pronunciation` do not. This category allows for having a Named Accent descriptor where no related geographic region descriptor exists, as well as being able to capture specifically named accents.\n",
+ "\n",
+ "* `Named Accent`\n",
+ "\n",
+ "## Accent effects due to physical changes \n",
+ "\n",
+ "Accent changes may occur due to physical changes in the speaker's vocal tract - for instance through surgery or disease. This Accent Descriptor is used to capture descriptions such as these. \n",
+ "\n",
+ "* `Accent effects due to physical changes`\n",
+ "\n",
+ "## Mixed or variable accent \n",
+ "\n",
+ "Where the data contributor specifies that their accent is a mixture or amalgamation of accents, but does not provide further information (for example so the Accent Descriptors can be separated or merged), this Accent Descriptor is used to capture this description. \n",
+ "\n",
+ "* `Mixed or variable accent`\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id is 100, name is Geographic region, definition is Indicates a geographic region used as a descriptor., parent is None\n",
+ "id is 200, name is Country, definition is Indicates a geographic region of a country or nation-state., parent is 100\n",
+ "id is 300, name is Supranational region, definition is Indicates a geographic region which crosses or overlaps multiple countries., parent is 100\n",
+ "id is 400, name is Subnational region, definition is Indicates a geographic region within a national boundary., parent is 100\n",
+ "id is 500, name is City, definition is Indicates a geographic region referring to a city, town or municipality., parent is 100\n",
+ "id is 600, name is First or other language, definition is Indicates a descriptor related to whether this is the speaker\\s first or other language., parent is None\n",
+ "id is 700, name is Accent strength descriptor, definition is Indicates a marker of accent strength., parent is None\n",
+ "id is 800, name is Vocal quality descriptor, definition is Indicates a subjective vocal quality., parent is None\n",
+ "id is 1000, name is Phonetic Changes, definition is Indicates a phonetic change., parent is None\n",
+ "id is 1100, name is Specific phonetic changes, definition is Indicates a specific phonetic change., parent is 1000\n",
+ "id is 1200, name is Rhoticity, definition is Indicates rhoticity or its absence., parent is 1000\n",
+ "id is 1200, name is Inflection, definition is Indicates an inflection change., parent is 1000\n",
+ "id is 1300, name is Register, definition is Indicates which register the data contributor speaks in., parent is None\n",
+ "id is 1400, name is Specifically named accent, definition is Indicates a specifically named accent., parent is None\n",
+ "id is 1500, name is Accent effects due to physical changes, definition is Indicates accent changes due to physical changes of the data contributor., parent is None\n",
+ "id is 1600, name is Mixed or variable accent, definition is Indicates mixture or amalgamation of accents., parent is None\n",
+ "id is 100, name is Geographic region, definition is Indicates a geographic region used as a descriptor., parent is None\n",
+ "id is 200, name is Country, definition is Indicates a geographic region of a country or nation-state., parent is 100\n",
+ "id is 300, name is Supranational region, definition is Indicates a geographic region which crosses or overlaps multiple countries., parent is 100\n",
+ "id is 400, name is Subnational region, definition is Indicates a geographic region within a national boundary., parent is 100\n",
+ "id is 500, name is City, definition is Indicates a geographic region referring to a city, town or municipality., parent is 100\n",
+ "id is 600, name is First or other language, definition is Indicates a descriptor related to whether this is the speaker\\s first or other language., parent is None\n",
+ "id is 700, name is Accent strength descriptor, definition is Indicates a marker of accent strength., parent is None\n",
+ "id is 800, name is Vocal quality descriptor, definition is Indicates a subjective vocal quality., parent is None\n",
+ "id is 1000, name is Phonetic Changes, definition is Indicates a phonetic change., parent is None\n",
+ "id is 1100, name is Specific phonetic changes, definition is Indicates a specific phonetic change., parent is 1000\n",
+ "id is 1200, name is Rhoticity, definition is Indicates rhoticity or its absence., parent is 1000\n",
+ "id is 1200, name is Inflection, definition is Indicates an inflection change., parent is 1000\n",
+ "id is 1300, name is Register, definition is Indicates which register the data contributor speaks in., parent is None\n",
+ "id is 1400, name is Named Accent, definition is Indicates a specifically named accent., parent is None\n",
+ "id is 1500, name is Accent effects due to physical changes, definition is Indicates accent changes due to physical changes of the data contributor., parent is None\n",
+ "id is 1600, name is Mixed or variable accent, definition is Indicates mixture or amalgamation of accents., parent is None\n",
+ "id is 2000, name is Uncertainty marker, definition is Indicates uncertainty of descriptor., parent is None\n",
+ "id is 2100, name is Generational marker, definition is Indicates generational association of speaker., parent is None\n",
+ "id is 2200, name is Socio-economic marker, definition is Indicates the socio-economic status of speaker., parent is None\n",
+ "id is 2300, name is Hybrid dialect, definition is Indicates that the speaker has an accent of a hybrid dialect of the language., parent is None\n",
+ "id is 2400, name is Linguistic heritage of speaker, definition is Indicates something about the language acquisition or language immersion of the speaker, parent is None\n"
+ ]
+ }
+ ],
+ "source": [
+ "import cvaccents as cva\n",
+ "#reload(cva)\n",
+ "\n",
+ "# Using the Accent Descriptor class to create Accent Descriptor accents for the above \n",
+ "\n",
+ "descriptorGeoRegion = cva.AccentDescriptor(\n",
+ " id = 100, \n",
+ " name='Geographic region', \n",
+ " definition = 'Indicates a geographic region used as a descriptor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "descriptorGeoCountry = cva.AccentDescriptor(\n",
+ " id = 200, \n",
+ " name='Country', \n",
+ " definition = 'Indicates a geographic region of a country or nation-state.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoSupra = cva.AccentDescriptor(\n",
+ " id = 300, \n",
+ " name='Supranational region', \n",
+ " definition = 'Indicates a geographic region which crosses or overlaps multiple countries.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoSub = cva.AccentDescriptor(\n",
+ " id = 400, \n",
+ " name='Subnational region', \n",
+ " definition = 'Indicates a geographic region within a national boundary.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoCity = cva.AccentDescriptor(\n",
+ " id = 500, \n",
+ " name='City', \n",
+ " definition = 'Indicates a geographic region referring to a city, town or municipality.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorFOL = cva.AccentDescriptor(\n",
+ " id = 600, \n",
+ " name='First or other language', \n",
+ " definition = 'Indicates a descriptor related to whether this is the speaker\\'s first or other language.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorAccStr = cva.AccentDescriptor(\n",
+ " id = 700, \n",
+ " name='Accent strength descriptor', \n",
+ " definition = 'Indicates a marker of accent strength.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorVocQual = cva.AccentDescriptor(\n",
+ " id = 800, \n",
+ " name='Vocal quality descriptor', \n",
+ " definition = 'Indicates a subjective vocal quality.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorPhonChanges = cva.AccentDescriptor(\n",
+ " id = 1000, \n",
+ " name='Phonetic Changes', \n",
+ " definition = 'Indicates a phonetic change.', \n",
+ " parent = None, \n",
+ ")\n",
+ "descriptorPhonSpecific = cva.AccentDescriptor(\n",
+ " id = 1100, \n",
+ " name='Specific phonetic changes', \n",
+ " definition = 'Indicates a specific phonetic change.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "descriptorPhonRhoticity = cva.AccentDescriptor(\n",
+ " id = 1200, \n",
+ " name='Rhoticity', \n",
+ " definition = 'Indicates rhoticity or its absence.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "descriptorPhonInflection = cva.AccentDescriptor(\n",
+    "    id = 1200, # FIXME: duplicate id -- 1200 is already used by descriptorPhonRhoticity above\n",
+ " name='Inflection', \n",
+ " definition = 'Indicates an inflection change.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "\n",
+ "descriptorRegister = cva.AccentDescriptor(\n",
+ " id = 1300, \n",
+ " name='Register', \n",
+ " definition = 'Indicates which register the data contributor speaks in.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorNamedAcc = cva.AccentDescriptor(\n",
+ " id = 1400, \n",
+ " name='Specifically named accent', \n",
+ " definition = 'Indicates a specifically named accent.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorPhysChange = cva.AccentDescriptor(\n",
+ " id = 1500, \n",
+ " name='Accent effects due to physical changes', \n",
+ " definition = 'Indicates accent changes due to physical changes of the data contributor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorAccMixed = cva.AccentDescriptor(\n",
+ " id = 1600, \n",
+ " name='Mixed or variable accent', \n",
+ " definition = 'Indicates mixture or amalgamation of accents.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorAccUncertainty = cva.AccentDescriptor(\n",
+ " id = 2000, \n",
+ " name='Uncertainty marker', \n",
+ " definition = 'Indicates uncertainty of descriptor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "print(descriptorGeoRegion.__str__())\n",
+ "print(descriptorGeoCountry.__str__())\n",
+ "print(descriptorGeoSupra.__str__())\n",
+ "print(descriptorGeoSub.__str__())\n",
+ "print(descriptorGeoCity.__str__())\n",
+ "\n",
+ "print(descriptorFOL.__str__())\n",
+ "\n",
+ "print(descriptorAccStr.__str__())\n",
+ "\n",
+ "print(descriptorVocQual.__str__())\n",
+ "\n",
+ "print(descriptorPhonChanges.__str__())\n",
+ "print(descriptorPhonSpecific.__str__())\n",
+ "print(descriptorPhonRhoticity.__str__())\n",
+ "print(descriptorPhonInflection.__str__())\n",
+ "\n",
+ "print(descriptorRegister.__str__())\n",
+ "\n",
+ "print(descriptorNamedAcc.__str__())\n",
+ "\n",
+ "print(descriptorPhysChange.__str__())\n",
+ "\n",
+ "print(descriptorAccMixed.__str__())\n",
+ "\n",
+ "import cvaccents as cva\n",
+ "#reload(cva)\n",
+ "\n",
+    "# Using the AccentDescriptor class to create Accent Descriptors (NOTE: this cell repeats and extends these definitions below; the later definitions shadow the ones here) \n",
+ "\n",
+ "descriptorGeoRegion = cva.AccentDescriptor(\n",
+ " id = 100, \n",
+ " name='Geographic region', \n",
+ " definition = 'Indicates a geographic region used as a descriptor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "descriptorGeoCountry = cva.AccentDescriptor(\n",
+ " id = 200, \n",
+ " name='Country', \n",
+ " definition = 'Indicates a geographic region of a country or nation-state.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoSupra = cva.AccentDescriptor(\n",
+ " id = 300, \n",
+ " name='Supranational region', \n",
+ " definition = 'Indicates a geographic region which crosses or overlaps multiple countries.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoSub = cva.AccentDescriptor(\n",
+ " id = 400, \n",
+ " name='Subnational region', \n",
+ " definition = 'Indicates a geographic region within a national boundary.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "descriptorGeoCity = cva.AccentDescriptor(\n",
+ " id = 500, \n",
+ " name='City', \n",
+ " definition = 'Indicates a geographic region referring to a city, town or municipality.', \n",
+ " parent = 100, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorFOL = cva.AccentDescriptor(\n",
+ " id = 600, \n",
+ " name='First or other language', \n",
+    "    definition = 'Indicates a descriptor related to whether this is the speaker\\'s first or other language.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorAccStr = cva.AccentDescriptor(\n",
+ " id = 700, \n",
+ " name='Accent strength descriptor', \n",
+ " definition = 'Indicates a marker of accent strength.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorVocQual = cva.AccentDescriptor(\n",
+ " id = 800, \n",
+ " name='Vocal quality descriptor', \n",
+ " definition = 'Indicates a subjective vocal quality.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorPhonChanges = cva.AccentDescriptor(\n",
+ " id = 1000, \n",
+ " name='Phonetic Changes', \n",
+ " definition = 'Indicates a phonetic change.', \n",
+ " parent = None, \n",
+ ")\n",
+ "descriptorPhonSpecific = cva.AccentDescriptor(\n",
+ " id = 1100, \n",
+ " name='Specific phonetic changes', \n",
+ " definition = 'Indicates a specific phonetic change.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "descriptorPhonRhoticity = cva.AccentDescriptor(\n",
+ " id = 1200, \n",
+ " name='Rhoticity', \n",
+ " definition = 'Indicates rhoticity or its absence.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "descriptorPhonInflection = cva.AccentDescriptor(\n",
+    "    id = 1200, # FIXME: duplicate id -- 1200 is already used by descriptorPhonRhoticity above\n",
+ " name='Inflection', \n",
+ " definition = 'Indicates an inflection change.', \n",
+ " parent = 1000, \n",
+ ")\n",
+ "\n",
+ "descriptorRegister = cva.AccentDescriptor(\n",
+ " id = 1300, \n",
+ " name='Register', \n",
+ " definition = 'Indicates which register the data contributor speaks in.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorNamedAcc = cva.AccentDescriptor(\n",
+ " id = 1400, \n",
+    "    name='Named Accent', # NOTE: the earlier (shadowed) duplicate in this cell uses 'Specifically named accent' -- unify these\n",
+ " definition = 'Indicates a specifically named accent.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorPhysChange = cva.AccentDescriptor(\n",
+ " id = 1500, \n",
+ " name='Accent effects due to physical changes', \n",
+ " definition = 'Indicates accent changes due to physical changes of the data contributor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorAccMixed = cva.AccentDescriptor(\n",
+ " id = 1600, \n",
+ " name='Mixed or variable accent', \n",
+ " definition = 'Indicates mixture or amalgamation of accents.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorAccUncertainty = cva.AccentDescriptor(\n",
+ " id = 2000, \n",
+ " name='Uncertainty marker', \n",
+ " definition = 'Indicates uncertainty of descriptor.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorGeneration = cva.AccentDescriptor(\n",
+ " id = 2100, \n",
+ " name='Generational marker', \n",
+ " definition = 'Indicates generational association of speaker.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorSocioeconomic = cva.AccentDescriptor(\n",
+ " id = 2200, \n",
+ " name='Socio-economic marker', \n",
+ " definition = 'Indicates the socio-economic status of speaker.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "descriptorHybrid = cva.AccentDescriptor(\n",
+ " id = 2300, \n",
+ " name='Hybrid dialect', \n",
+ " definition = 'Indicates that the speaker has an accent of a hybrid dialect of the language.', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "descriptorHeritage = cva.AccentDescriptor(\n",
+ " id = 2400, \n",
+ " name='Linguistic heritage of speaker', \n",
+ " definition = 'Indicates something about the language acquisition or language immersion of the speaker', \n",
+ " parent = None, \n",
+ ")\n",
+ "\n",
+ "\n",
+ "#####\n",
+ "\n",
+ "\n",
+ "print(descriptorGeoRegion.__str__())\n",
+ "print(descriptorGeoCountry.__str__())\n",
+ "print(descriptorGeoSupra.__str__())\n",
+ "print(descriptorGeoSub.__str__())\n",
+ "print(descriptorGeoCity.__str__())\n",
+ "\n",
+ "print(descriptorFOL.__str__())\n",
+ "\n",
+ "print(descriptorAccStr.__str__())\n",
+ "\n",
+ "print(descriptorVocQual.__str__())\n",
+ "\n",
+ "print(descriptorPhonChanges.__str__())\n",
+ "print(descriptorPhonSpecific.__str__())\n",
+ "print(descriptorPhonRhoticity.__str__())\n",
+ "print(descriptorPhonInflection.__str__())\n",
+ "\n",
+ "print(descriptorRegister.__str__())\n",
+ "\n",
+ "print(descriptorNamedAcc.__str__())\n",
+ "\n",
+ "print(descriptorPhysChange.__str__())\n",
+ "\n",
+ "print(descriptorAccMixed.__str__())\n",
+ "\n",
+ "print(descriptorAccUncertainty.__str__())\n",
+ "\n",
+ "print(descriptorGeneration.__str__())\n",
+ "\n",
+ "print(descriptorSocioeconomic.__str__())\n",
+ "\n",
+ "print(descriptorHybrid.__str__())\n",
+ "\n",
+ "print(descriptorHeritage.__str__())\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we have the Accent Descriptors defined, we can associate Accent Descriptors with each Accent "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 118,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# I could put them all in one list, \n",
+ "# but it's easier to debug this way\n",
+ "\n",
+ "# Generic region descriptors that don't fit into any other category \n",
+ "region_descriptors = [\n",
+ " #('non regional', descriptorGeoRegion),\n",
+ " #('International English', descriptorFOL)\n",
+ "]\n",
+ "\n",
+ "# Country descriptors \n",
+ "country_descriptors = [ \n",
+ " ('Kenyan', descriptorGeoCountry),\n",
+ " ('Tanzania', descriptorGeoCountry)\n",
+ "]\n",
+ "\n",
+ "# Subnational descriptors \n",
+ "subnational_descriptors = [\n",
+ " ('kiMvita', descriptorGeoSub),\n",
+ " ('Arusha', descriptorGeoSub),\n",
+ " ('Coastal Swahili', descriptorGeoSub),\n",
+ "]\n",
+ "\n",
+ "# Supranational descriptors \n",
+ "supranational_descriptors = [\n",
+ " ('Kiswahili accent', descriptorGeoSupra)\n",
+ "]\n",
+ "\n",
+ "# City descriptors \n",
+ "city_descriptors = [\n",
+ " #('New York City', descriptorGeoCity),\n",
+ "]\n",
+ "\n",
+ "# First or other language descriptors \n",
+ "FOL_descriptors = [\n",
+ " ('Non-native speaker', descriptorFOL),\n",
+ " ('Bilingual', descriptorFOL),\n",
+ " ('native', descriptorFOL),\n",
+ " ('Second language', descriptorFOL),\n",
+ " ('Basic', descriptorFOL),\n",
+ " ('time spent in location', descriptorFOL),\n",
+ " ('some', descriptorFOL),\n",
+ " ('mid level', descriptorFOL),\n",
+ " ('Spoke language when a child', descriptorFOL),\n",
+ " ('fluent', descriptorFOL),\n",
+ " ('Conversational', descriptorFOL),\n",
+ " ('Foreign', descriptorFOL),\n",
+ " ('Native speaker', descriptorFOL),\n",
+ " ('Good', descriptorFOL),\n",
+ " ('Fluent', descriptorFOL)\n",
+ "]\n",
+ "\n",
+ "\n",
+ "# Accent Strength descriptors\n",
+ "AccStr_descriptors = [\n",
+ " ('pronounced', descriptorAccStr),\n",
+ " ('slight', descriptorAccStr),\n",
+ " ('Mild', descriptorAccStr),\n",
+ " ('Not bad', descriptorAccStr),\n",
+ " ('little bit', descriptorAccStr),\n",
+ " ('tinge', descriptorAccStr),\n",
+ " ('90%', descriptorAccStr),\n",
+ " ('10%', descriptorAccStr),\n",
+ " ('heavy', descriptorAccStr),\n",
+ " ('little', descriptorAccStr),\n",
+ " ('minor', descriptorAccStr),\n",
+ " ('plain', descriptorAccStr), \n",
+ " ('Neutral', descriptorAccStr),\n",
+ " ('touch', descriptorAccStr), \n",
+ " ('mostly', descriptorAccStr), \n",
+ " ('Strong', descriptorAccStr)\n",
+ "]\n",
+ "\n",
+ "\n",
+ "# Vocal quality descriptors\n",
+ "VocQual_descriptors = [\n",
+ " ('sultry', descriptorVocQual),\n",
+ " ('classy', descriptorVocQual),\n",
+ " ('sassy', descriptorVocQual),\n",
+ " ('Slight lisp', descriptorVocQual),\n",
+ " ('Slightly effeminate', descriptorVocQual),\n",
+ " ('Low', descriptorVocQual),\n",
+ " ('Demure', descriptorVocQual),\n",
+ " ('Gay', descriptorVocQual),\n",
+ " ('slow', descriptorVocQual),\n",
+ " ('slurred', descriptorVocQual)\n",
+ "]\n",
+ "\n",
+ "\n",
+ "# Phonetic descriptors \n",
+ "PhonSpecific_descriptors = [\n",
+ " ('pin/pen merger', descriptorPhonSpecific),\n",
+ " ('heavy consonants', descriptorPhonSpecific),\n",
+ " ('cot-caught merger', descriptorPhonSpecific)\n",
+ "]\n",
+ "PhonRhoticity_descriptors = [\n",
+ " (\"pronounced r's\", descriptorPhonRhoticity)\n",
+ "]\n",
+ "PhonInflection_descriptors = [\n",
+ " ('mostly affecting inflection', descriptorPhonInflection)\n",
+ "]\n",
+ "\n",
+ "# Register descriptors\n",
+ "Register_descriptors = [\n",
+ " ('surfer', descriptorRegister),\n",
+ " ('academic', descriptorRegister),\n",
+ " ('Educated', descriptorRegister),\n",
+ " ('formal', descriptorRegister),\n",
+ " ('slang', descriptorRegister),\n",
+ " ('Urban', descriptorRegister),\n",
+ " ('classy', descriptorRegister),\n",
+ " ('sassy', descriptorRegister),\n",
+ " ('city', descriptorRegister),\n",
+ " ('Cool', descriptorRegister),\n",
+ " ('Conversational', descriptorRegister),\n",
+ " ('Received Pronunciation', descriptorRegister), \n",
+ " ('Eloquent', descriptorRegister)\n",
+ "]\n",
+ "\n",
+ "# Named accent descriptors\n",
+ "NamedAcc_descriptors = [\n",
+ " ('Patois', descriptorNamedAcc),\n",
+ " ('Received Pronunciation', descriptorNamedAcc),\n",
+ " ('Kiwi', descriptorNamedAcc),\n",
+ " ('Chicano English', descriptorNamedAcc),\n",
+ " ('\"Valley Girl\" English', descriptorNamedAcc),\n",
+ " ('Okie', descriptorNamedAcc),\n",
+ " \n",
+ " ('Southern drawl', descriptorNamedAcc),\n",
+ " ('Transatlantic English', descriptorNamedAcc),\n",
+ " ('Culchie', descriptorNamedAcc),\n",
+ " ('African American Vernacular', descriptorNamedAcc),\n",
+ " ('Standard American English', descriptorNamedAcc)\n",
+ " \n",
+ "]\n",
+ " \n",
+ "# Physical change descriptors\n",
+ "PhysChange_descriptors = [\n",
+ " ('changes due to oral surgery', descriptorPhysChange)\n",
+ "]\n",
+ "\n",
+ "# Mixed accent descriptors\n",
+ "AccMixed_descriptors = [\n",
+ " ('Variable', descriptorAccMixed),\n",
+ " ('Adjustable', descriptorAccMixed),\n",
+ " ('Mix of accents', descriptorAccMixed),\n",
+ " ('try to maintain originality', descriptorAccMixed)\n",
+ "]\n",
+ " \n",
+ "# Uncertainty marker \n",
+ "AccUncertainty_descriptors = [\n",
+ " ('I think', descriptorAccUncertainty)\n",
+ "] \n",
+ "\n",
+ "# Generational associations \n",
+ "Generation_descriptors = [\n",
+ " ('Gen Z', descriptorGeneration)\n",
+ "] \n",
+ "\n",
+ "# Socio-economic status descriptors \n",
+ "Socioeconomic_descriptors = [\n",
+ " ('Middle class', descriptorSocioeconomic)\n",
+ "]\n",
+ "\n",
+ "# Hybrid descriptors \n",
+ "Hybrid_descriptors = [\n",
+ " ('Hunglish', descriptorHybrid),\n",
+ " ('Denglish', descriptorHybrid)\n",
+ "]\n",
+ "\n",
+ "# Heritage descriptors \n",
+ "Heritage_descriptors = [\n",
+ " ('Born in area', descriptorHeritage),\n",
+ " ('Lived in area', descriptorHeritage),\n",
+ "]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create one list from the above lists \n",
+ "\n",
+ "accent_descriptor_list = [\n",
+ " region_descriptors,\n",
+ " country_descriptors,\n",
+ " subnational_descriptors,\n",
+ " supranational_descriptors,\n",
+ " city_descriptors,\n",
+ " FOL_descriptors,\n",
+ " AccStr_descriptors,\n",
+ " VocQual_descriptors,\n",
+ " PhonSpecific_descriptors,\n",
+ " PhonRhoticity_descriptors,\n",
+ " PhonInflection_descriptors,\n",
+ " Register_descriptors,\n",
+ " NamedAcc_descriptors,\n",
+ " PhysChange_descriptors,\n",
+ " AccMixed_descriptors,\n",
+ " AccUncertainty_descriptors,\n",
+ " Generation_descriptors,\n",
+ " Socioeconomic_descriptors,\n",
+ " Hybrid_descriptors,\n",
+ " Heritage_descriptors\n",
+ "]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Now we loop through all the accents \n",
+ "# And if the accent name matches one of the descriptors in accent_descriptor_list \n",
+ "# We add the relevant Accent Descriptor to the Accent's object representation \n",
+ "\n",
+ "for accent_descriptor_category in accent_descriptor_list: \n",
+ " for accent_descriptor in accent_descriptor_category: \n",
+ " for accent in all_accents.items(): \n",
+ " \n",
+ " #print ('accent is: ', accent[1], ' and accent_descriptor is: ', accent_descriptor)\n",
+ " \n",
+ " if accent[1]._name == accent_descriptor[0]: \n",
+ " #print ('MATCH!')\n",
+ " if accent[1]._descriptors is None: \n",
+ " accent[1]._descriptors = [] # initialise list if None\n",
+ " accent[1]._descriptors.append(accent_descriptor[1]) # append because there can be multiple \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 121,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id is 1, name is Good, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 4, name is Fluent, count is 11, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 5, name is kiMvita, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 8, name is Strong, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 10, name is Kenyan, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 11, name is Lived in area, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 12, name is native, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 14, name is Tanzania, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 15, name is academic, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 23, name is Eloquent, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "id is 39, name is Arusha, count is 1, locale is en, descriptors are [], predetermined is False.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# the accents should now have descriptors \n",
+ "\n",
+ "import cvaccents as cva\n",
+ "#reload(cva)\n",
+ "\n",
+ "for accent in all_accents.items(): \n",
+ " print(accent[1].__str__())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Now do a cross-check to see if there are any accents for which the ._descriptor is None \n",
+ "# this flags if I've missed an accent somewhere \n",
+ "\n",
+ "\n",
+ "\n",
+ "missing_descriptors = all_accents.reportNoneAccentDescriptors() \n",
+ "\n",
+ "for accent in missing_descriptors: \n",
+ " print (accent[1].__str__())\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create relationships between Accents suitable for data visualisation \n",
+ "\n",
+ "Now, we want to create relationships _between_ accents so that we can visualise accents as **nodes** and their relationships as **edges**. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ ['Good'],\n",
+ " ['Kiswahili accent', 'Coastal Swahili'],\n",
+ " ['Fluent', 'kiMvita'],\n",
+ " [],\n",
+ " ['Kiswahili accent'],\n",
+ " ['Kiswahili accent', 'Strong'],\n",
+ " ['kiMvita'],\n",
+ " ['Kenyan'],\n",
+ " ['Lived in area'],\n",
+ " ['native'],\n",
+ " ['Kiswahili accent', 'Tanzania', 'academic'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Kenyan'],\n",
+ " ['Eloquent', 'Fluent'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Coastal Swahili'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " [],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['native'],\n",
+ " ['Kiswahili accent', 'Fluent'],\n",
+ " ['Fluent'],\n",
+ " ['Fluent', 'Kiswahili accent'],\n",
+ " ['Kiswahili accent', 'Arusha']]\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(kiswahili_accents_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 id is 1, name is Good, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "2 id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are [], predetermined is False.\n",
+ "3 id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "4 id is 4, name is Fluent, count is 11, locale is en, descriptors are [], predetermined is False.\n",
+ "5 id is 5, name is kiMvita, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "8 id is 8, name is Strong, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "10 id is 10, name is Kenyan, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "11 id is 11, name is Lived in area, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "12 id is 12, name is native, count is 2, locale is en, descriptors are [], predetermined is False.\n",
+ "14 id is 14, name is Tanzania, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "15 id is 15, name is academic, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "23 id is 23, name is Eloquent, count is 1, locale is en, descriptors are [], predetermined is False.\n",
+ "39 id is 39, name is Arusha, count is 1, locale is en, descriptors are [], predetermined is False.\n"
+ ]
+ }
+ ],
+ "source": [
+ "for idx, value in all_accents.items(): \n",
+ " print(idx, value)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 126,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3\n",
+ "i is: 1\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 3]\n",
+ "5\n",
+ "i is: 2\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[4, 5]\n",
+ "8\n",
+ "i is: 5\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 8]\n",
+ "14\n",
+ "i is: 10\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 14]\n",
+ "15\n",
+ "i is: 10\n",
+ "length of accent_nodes[i] is: 3\n",
+ "[2, 14, 15]\n",
+ "4\n",
+ "i is: 11\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 12\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 13\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 15\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[23, 4]\n",
+ "4\n",
+ "i is: 16\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 18\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 20\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "4\n",
+ "i is: 22\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 4]\n",
+ "2\n",
+ "i is: 24\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[4, 2]\n",
+ "39\n",
+ "i is: 25\n",
+ "length of accent_nodes[i] is: 2\n",
+ "[2, 39]\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Creating linkages between the individual accents and how they are represented in the data. \n",
+ "## What I want to do here is create a data structure that has the ID of the accent \n",
+ "## and something to describe the edge: \n",
+ "## \n",
+ "## The data structure I think will work here is: \n",
+ "## \n",
+ "## { 99: (123, 456)} \n",
+ "## to represent an edge between accent ID 123 and accent 456\n",
+ "## \n",
+ "## One thing to be aware of here is that the edges are NON-DIRECTIONAL\n",
+ "## {99: (123, 456)}\n",
+ "## is equivalent to \n",
+    "## {99: (456, 123)}\n",
+ "## so we need a way to remove duplicates \n",
+ "\n",
+ "## The data structures we are using are: \n",
+ "## \n",
+ "## all_accents - Accent Collection object of all Accents, merged and normalised\n",
+    "## kiswahili_accents_list - this is a list of lists of strings,\n",
+ "## where each list represents the Accents that are related\n",
+ "## \n",
+ "## what we want to do is go through each list, \n",
+ "## and find the ID number of the accent \n",
+ "## from the Dict, \n",
+ "## then build a Dict that represents the Accent's relation to other Accents\n",
+ "## this is accent_nodes\n",
+ "\n",
+ "accent_nodes = {}\n",
+ "i = 0;\n",
+ "\n",
+ "for accent_list in kiswahili_accents_list:\n",
+ "\n",
+ " #print(accent_list)\n",
+ " \n",
+ " # initialise the list first \n",
+ " accent_nodes[i] = []\n",
+ " \n",
+ " for accent_list_item in accent_list: \n",
+ " #print('now processing', accent_list_item)\n",
+ " \n",
+ " for accent in all_accents.items(): \n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('---')\n",
+ " #print ('now looking at row: ', accent_list, 'and accent list item: ', accent_list_item, ' and accent: ', accent)\n",
+ " \n",
+ " if (accent_list_item == accent[1]._name): ## match \n",
+ " \n",
+ " #if (i%ratio_display ==0): # only show the 100th \n",
+ " #print('---')\n",
+ " #print ('match!')\n",
+ " \n",
+ " #print(accent[0])\n",
+ " #print('i is: ', i)\n",
+ " \n",
+ " \n",
+ " accent_nodes[i].append(accent[0]) # we want the accent ID number\n",
+ " \n",
+ " if (len(accent_nodes[i]) > 1) : \n",
+ " # double check nodes that have more than 1 element \n",
+ " print(accent[0])\n",
+ " print('i is: ', i)\n",
+ " print('length of accent_nodes[i] is: ', len(accent_nodes[i]))\n",
+ " pp.pprint(accent_nodes[i])\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " i +=1 "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 127,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(len(accent_nodes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: [1],\n",
+ " 1: [2, 3],\n",
+ " 2: [4, 5],\n",
+ " 3: [],\n",
+ " 4: [2],\n",
+ " 5: [2, 8],\n",
+ " 6: [5],\n",
+ " 7: [10],\n",
+ " 8: [11],\n",
+ " 9: [12],\n",
+ " 10: [2, 14, 15],\n",
+ " 11: [2, 4],\n",
+ " 12: [2, 4],\n",
+ " 13: [2, 4],\n",
+ " 14: [10],\n",
+ " 15: [23, 4],\n",
+ " 16: [2, 4],\n",
+ " 17: [3],\n",
+ " 18: [2, 4],\n",
+ " 19: [],\n",
+ " 20: [2, 4],\n",
+ " 21: [12],\n",
+ " 22: [2, 4],\n",
+ " 23: [4],\n",
+ " 24: [4, 2],\n",
+ " 25: [2, 39]}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_nodes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 129,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "# this figure is a cross-check - \n",
+ "# it should equal the original length of accent_nodes minus the size of deletion_list\n",
+ "pp.pprint(len(accent_nodes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: [1],\n",
+ " 1: [2, 3],\n",
+ " 2: [4, 5],\n",
+ " 3: [],\n",
+ " 4: [2],\n",
+ " 5: [2, 8],\n",
+ " 6: [5],\n",
+ " 7: [10],\n",
+ " 8: [11],\n",
+ " 9: [12],\n",
+ " 10: [2, 14, 15],\n",
+ " 11: [2, 4],\n",
+ " 12: [2, 4],\n",
+ " 13: [2, 4],\n",
+ " 14: [10],\n",
+ " 15: [23, 4],\n",
+ " 16: [2, 4],\n",
+ " 17: [3],\n",
+ " 18: [2, 4],\n",
+ " 19: [],\n",
+ " 20: [2, 4],\n",
+ " 21: [12],\n",
+ " 22: [2, 4],\n",
+ " 23: [4],\n",
+ " 24: [4, 2],\n",
+ " 25: [2, 39]}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_nodes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 131,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Now what I need to do is create a JSON format suitable for using in \n",
+ "# a Trellis diagram in Observable \n",
+ "# e.g. https://observablehq.com/@jameslaneconkling/trellis\n",
+ "\n",
+ "# The format of the JSON looks like: \n",
+ "# const nodes = [\n",
+ "# {id: 'Myriel', group: 1},\n",
+ "# {id: 'Napoleon', group: 1},\n",
+ "\n",
+ "# const edges = [\n",
+ "# {source: 'Napoleon', target: 'Myriel'},\n",
+ "# {source: 'Mlle.Baptistine', target: 'Myriel'},\n",
+ "\n",
+ "# the nodes should be as easy as JSON dumping the all_accents AccentCollection \n",
+ "# possibly using a method \n",
+ "\n",
+ "# the edges will be a bit more complex "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Nodes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "13\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(all_accents.total())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 133,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#reload(cva)\n",
+ "\n",
+ "all_accents.exportJSON(accents_filename)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(accent_nodes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: [1],\n",
+ " 1: [2, 3],\n",
+ " 2: [4, 5],\n",
+ " 3: [],\n",
+ " 4: [2],\n",
+ " 5: [2, 8],\n",
+ " 6: [5],\n",
+ " 7: [10],\n",
+ " 8: [11],\n",
+ " 9: [12],\n",
+ " 10: [2, 14, 15],\n",
+ " 11: [2, 4],\n",
+ " 12: [2, 4],\n",
+ " 13: [2, 4],\n",
+ " 14: [10],\n",
+ " 15: [23, 4],\n",
+ " 16: [2, 4],\n",
+ " 17: [3],\n",
+ " 18: [2, 4],\n",
+ " 19: [],\n",
+ " 20: [2, 4],\n",
+ " 21: [12],\n",
+ " 22: [2, 4],\n",
+ " 23: [4],\n",
+ " 24: [4, 2],\n",
+ " 25: [2, 39]}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_nodes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--- checking 1, 2 ---\n",
+ "0\n",
+ "--- checking 2, 1 ---\n",
+ "0\n",
+ "---\n",
+ "---\n"
+ ]
+ }
+ ],
+ "source": [
+ "# let's do some sanity checking to make sure these are correct \n",
+ "\n",
+    "# NOTE: this sanity check was carried over from the English-accents notebook,\n",
+    "# where edge 0 was {'source': 1, 'target': 2, 'weight': 23} -- i.e. occurrences\n",
+    "# of [1, 2] plus [2, 1] ('England English' / 'United States English').\n",
+    "# In this Kiswahili data the first edge is {'source': 2, 'target': 3, 'weight': 1}\n",
+    "# ('Kiswahili accent' -> 'Coastal Swahili'), so both counts below print 0.\n",
+ "\n",
+ "print('--- checking 1, 2 ---')\n",
+ "\n",
+ "count = 0\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if (node == list([1, 2])) :\n",
+ " count +=1\n",
+ "print(count)\n",
+ " \n",
+ "print('--- checking 2, 1 ---')\n",
+ "\n",
+ "count = 0\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if (node == list([2, 1])) :\n",
+ " count +=1\n",
+ "print(count)\n",
+ " \n",
+    "# NOTE: the figures here (23, 9, 13 vs 14) refer to the English-accents\n",
+    "# notebook; with the Kiswahili data both counts above are 0 because no node\n",
+    "# list equals [1, 2] or [2, 1] (see the accent_nodes dump earlier).\n",
+    "# TODO: update the ids checked above to a pair that occurs in this data.\n",
+ "\n",
+ "\n",
+ "\n",
+ "print ('---')\n",
+ "\n",
+ "\n",
+    "# NOTE: this check is also carried over from the English-accents notebook,\n",
+    "# where edge 139 was {'source': 13016, 'target': 59, 'weight': 2}, i.e. the\n",
+    "# occurrence [13016, 59] ('Foreign' -> 'Non-native speaker'), originating\n",
+    "# from the accent listing (12775, [750, 13016, 59]).\n",
+    "# Those ids do not exist in the Kiswahili data, so the loop below never\n",
+    "# prints 'match'.\n",
+    "# TODO: update the pair checked below for the Kiswahili accent ids.\n",
+ "\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if node == list([13016, 59]) :\n",
+ " print ('match')\n",
+ " \n",
+ "# \n",
+ "\n",
+ "print ('---')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# So why am I getting an NODE count here of 13, when below I am getting an edge count of 14? "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Edges\n",
+ "\n",
+ "Here, we use the `accent_nodes` dict to create a dict of edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(accent_nodes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (0, [1])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (1, [2, 3])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 3\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 0\n",
+ "{'source': 2, 'target': 3, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 1\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (2, [4, 5])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 4\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 5\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 4\n",
+ "accent_edges_id is: 1\n",
+ "{'source': 4, 'target': 5, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 2\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (3, [])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (4, [2])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (5, [2, 8])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 8\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 2\n",
+ "{'source': 2, 'target': 8, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 3\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (6, [5])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (7, [10])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (8, [11])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (9, [12])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (10, [2, 14, 15])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 3\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 2\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 14\n",
+ "length of accent list is: 2\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 3\n",
+ "{'source': 2, 'target': 14, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 4\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 15\n",
+ "length of accent list is: 2\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 4\n",
+ "{'source': 2, 'target': 15, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 5\n",
+ "--- END for loop ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 14\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 15\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 14\n",
+ "accent_edges_id is: 5\n",
+ "{'source': 14, 'target': 15, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 6\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (11, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 6\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 7\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (12, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 7\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 8\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (13, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 8\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 9\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (14, [10])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (15, [23, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 23\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 23\n",
+ "accent_edges_id is: 9\n",
+ "{'source': 23, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 10\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (16, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 10\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 11\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (17, [3])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (18, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 11\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 12\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (19, [])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (20, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 12\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 13\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (21, [12])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (22, [2, 4])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 4\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 13\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 14\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (23, [4])\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (24, [4, 2])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 4\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 2\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 4\n",
+ "accent_edges_id is: 14\n",
+ "{'source': 4, 'target': 2, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 15\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n",
+ "\n",
+ "\n",
+ "--- BEGIN accent list ---\n",
+ "accent list is: (25, [2, 39])\n",
+ "--- node has more than one accent, processing ... ---\n",
+ "size of accent_list[1] BEFORE popping is: 2\n",
+ "popped element is : 2\n",
+ "size of accent_list[1] AFTER popping is: 1\n",
+ "\n",
+ "\n",
+ "--- in accent list loop ---\n",
+ "accent_id is: 39\n",
+ "length of accent list is: 1\n",
+ "popped_element inside the for loop is: 2\n",
+ "accent_edges_id is: 15\n",
+ "{'source': 2, 'target': 39, 'weight': 1}\n",
+ "checking that accent_edges_id has incremented: 16\n",
+ "--- END for loop ---\n",
+ "--- END while loop ---\n",
+ "--- END accent list ---\n"
+ ]
+ }
+ ],
+ "source": [
+ "## what we want to do here is loop through the accent_nodes Dict \n",
+ "## and create another Dict that we can use to create Links between the Nodes (which are accents)\n",
+ "\n",
+ "accent_edges = {} \n",
+ "accent_edges_id = 0\n",
+ "\n",
+ "\n",
+ "# make a deep copy of accent_nodes as we will be pop()ing elements off lists inside the dict \n",
+ "# and if we don't make a deep copy this will affect accent_nodes as well \n",
+ "# because Python uses [pass by assignment](https://docs.python.org/3/faq/programming.html#how-do-i-write-a-function-with-output-parameters-call-by-reference)\n",
+ "\n",
+ "accent_nodes_for_manipulation = copy.deepcopy(accent_nodes)\n",
+ "\n",
+ "for accent_list in accent_nodes_for_manipulation.items(): \n",
+ " print('\\n')\n",
+ " print('--- BEGIN accent list ---')\n",
+ " print('accent list is: ', accent_list)\n",
+ " \n",
+ " if len(accent_list[1]) > 1: \n",
+ " print ('--- node has more than one accent, processing ... ---') \n",
+ "\n",
+ " # we want to create edges, so we only care about lists that have two or more elements\n",
+ " while (len(accent_list[1]) > 1) : \n",
+ " \n",
+ " print('size of accent_list[1] BEFORE popping is: ', (len(accent_list[1])))\n",
+ " popped_element = accent_list[1].pop(0) # remove the first element \n",
+ " print('popped element is :', popped_element)\n",
+ " print('size of accent_list[1] AFTER popping is: ', (len(accent_list[1])))\n",
+ " \n",
+ " # create links between the popped element and all the remaining elements in the list \n",
+ " for accent_id in accent_list[1]: \n",
+ " \n",
+ " print('\\n')\n",
+ " print('--- in accent list loop ---')\n",
+ " print('accent_id is: ', accent_id)\n",
+ " print('length of accent list is: ', len(accent_list[1]))\n",
+ " \n",
+ " print('popped_element inside the for loop is: ', popped_element)\n",
+ " print('accent_edges_id is: ', accent_edges_id)\n",
+ " \n",
+ " accent_edges[accent_edges_id] = {}\n",
+ " accent_edges[accent_edges_id]['source'] = popped_element\n",
+ " accent_edges[accent_edges_id]['target'] = accent_id\n",
+ " accent_edges[accent_edges_id]['weight'] = 1\n",
+ " \n",
+ " print(accent_edges[accent_edges_id])\n",
+ " \n",
+ " accent_edges_id +=1\n",
+ " print('checking that accent_edges_id has incremented: ', accent_edges_id)\n",
+ " \n",
+ " print('--- END for loop ---')\n",
+ " print('--- END while loop ---') \n",
+ " print('--- END accent list ---')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n",
+ "26\n",
+ "False\n"
+ ]
+ }
+ ],
+ "source": [
+ "# accent_nodes and accent_nodes_for_manipulation should NOT be equivalent \n",
+ "# because list items have been pop()'d off the latter\n",
+ "\n",
+ "print(len(accent_nodes))\n",
+ "print(len(accent_nodes_for_manipulation))\n",
+ "print (accent_nodes == accent_nodes_for_manipulation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "16\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(accent_edges))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 7: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 8: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1},\n",
+ " 10: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 11: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 12: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 13: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 14: {'source': 4, 'target': 2, 'weight': 1},\n",
+ " 15: {'source': 2, 'target': 39, 'weight': 1}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_edges)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "26\n"
+ ]
+ }
+ ],
+ "source": [
+ "print (len(accent_nodes))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for idx, node in accent_nodes.items(): \n",
+ " # each node is a list \n",
+ " if len(node) > 1:\n",
+ " if (1 in node) and (2 in node): \n",
+ " print(node)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for idx, node in accent_nodes.items(): \n",
+ " # each node is a list \n",
+ " if len(node) > 2:\n",
+ " if (1 in node) and (2 in node): \n",
+ " print(node)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 147,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for idx, node in accent_nodes.items(): \n",
+ " # each node is a list \n",
+ " if len(node) == 2:\n",
+ " if (1 in node) and (2 in node): \n",
+ " print(node)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This actually returns 30 lines - so we may not have as many _edges_ being generated as there should be. \n",
+ "\n",
+ "There are **11** cases in `accent_nodes` where there are more than two nodes in the list. \n",
+ "\n",
+ "There are **22** cases in `accent_nodes` where there are exactly two nodes in the list. \n",
+ "\n",
+ "_Working hypothesis for this bug:_ I think what's happening here is that the `accent_edges` for nodes with more than 2 accents in the list are not being generated correctly. \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "Now, we de-duplicate the `accent_edges` dict\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def deduplicateDict (accent_edges):\n",
+ " \n",
+ " temp_edges = []\n",
+ " temp_dict = {}\n",
+ " temp_weights = {}\n",
+ "\n",
+ " for key, val in accent_edges.items(): \n",
+ " #print (key)\n",
+ " #print (val)\n",
+ " \n",
+ " if val not in temp_edges: \n",
+ " \n",
+ " temp_edges.append(val)\n",
+ " temp_dict[key] = val \n",
+ " temp_weights[key] = 1\n",
+ " \n",
+ " else: # increment the weight of the edge\n",
+ " # find the key to increment based on the val\n",
+ " print('breakpoint 1')\n",
+ " print ('finding the key to update')\n",
+ " pp.pprint(temp_dict)\n",
+ " print('breakpoint 2')\n",
+ " \n",
+ " pp.pprint(temp_dict.keys())\n",
+ " pp.pprint(temp_dict.values())\n",
+ " print('breakpoint 3')\n",
+ " \n",
+ " pp.pprint(key)\n",
+ " pp.pprint(val)\n",
+ " print('breakpoint 4')\n",
+ " \n",
+ " update_key_position = list(temp_dict.values()).index(val) \n",
+ " #this is the *position* in the dict that should be updated \n",
+ " \n",
+ " print('update_key_position is: ', update_key_position)\n",
+ " \n",
+ " update_key = list(temp_dict.keys())[update_key_position]\n",
+ "\n",
+ " print('update_key is: ', update_key)\n",
+ " \n",
+ " print(list(temp_dict.keys()))\n",
+ " print('breakpoint 5')\n",
+ " \n",
+ " pp.pprint(temp_dict[update_key])\n",
+ " temp_weights[update_key] +=1 \n",
+ " print('breakpoint 6')\n",
+ " \n",
+ " # update the weights - we can't update in the for loop above\n",
+ " # otherwise the `val` won't match, because the `weight` element would be compared\n",
+ " for key, val in temp_dict.items():\n",
+ " val['weight'] = temp_weights[key]\n",
+ " \n",
+ " print(type(temp_dict))\n",
+ " # sort the dict by source because it's easier to do error checking \n",
+ " temp_dict = dict(sorted(temp_dict.items(), key=lambda x: x[1]['source'], reverse=False))\n",
+ " print(type(temp_dict))\n",
+ " return temp_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "7\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "8\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6, 9])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}, {'source': 23, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "10\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6, 9]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6, 9])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}, {'source': 23, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "11\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6, 9]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6, 9])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}, {'source': 23, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "12\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6, 9]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "breakpoint 1\n",
+ "finding the key to update\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 1},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1}}\n",
+ "breakpoint 2\n",
+ "dict_keys([0, 1, 2, 3, 4, 5, 6, 9])\n",
+ "dict_values([{'source': 2, 'target': 3, 'weight': 1}, {'source': 4, 'target': 5, 'weight': 1}, {'source': 2, 'target': 8, 'weight': 1}, {'source': 2, 'target': 14, 'weight': 1}, {'source': 2, 'target': 15, 'weight': 1}, {'source': 14, 'target': 15, 'weight': 1}, {'source': 2, 'target': 4, 'weight': 1}, {'source': 23, 'target': 4, 'weight': 1}])\n",
+ "breakpoint 3\n",
+ "13\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 4\n",
+ "update_key_position is: 6\n",
+ "update_key is: 6\n",
+ "[0, 1, 2, 3, 4, 5, 6, 9]\n",
+ "breakpoint 5\n",
+ "{'source': 2, 'target': 4, 'weight': 1}\n",
+ "breakpoint 6\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "accent_edges = deduplicateDict(accent_edges)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 7},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1},\n",
+ " 14: {'source': 4, 'target': 2, 'weight': 1},\n",
+ " 15: {'source': 2, 'target': 39, 'weight': 1}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(type(accent_edges))\n",
+ "pp.pprint(accent_edges)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "10\n"
+ ]
+ }
+ ],
+ "source": [
+ "# check to see how many duplicates were removed - here 6 of the 16 edges were merged into others\n",
+ "# so it's worth calculating a 'value' for each edge to signify its weight\n",
+ "print(len(accent_edges))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Remove bidirectional edges\n",
+ "\n",
+ "Now, we want to deduplicate the **edges** because the graph we want to draw is not a _directed graph_. \n",
+ "\n",
+ "That is, the direction of links between nodes is not relevant for the analysis. \n",
+ "\n",
+ "To do this, we compare the `source` and the `target` of each of the edges, and if the `source` and `target` match the `target` and `source` of the edge being compared, we flag that edge for deletion. We then delete those edges flagged for deletion. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "match\n",
+ "(2, 4)\n",
+ "(2, 4)\n",
+ "added [14, 6] to deletion_list\n",
+ "match\n",
+ "(4, 2)\n",
+ "(4, 2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "deletion_list = [] # list to keep track of the dict keys that should be deleted \n",
+ "\n",
+ "for edge in accent_edges.items(): \n",
+ " #print (node) \n",
+ " \n",
+ " for inner_edge in accent_edges.items(): \n",
+ " #pp.pprint(edge)\n",
+ " #pp.pprint(inner_edge)\n",
+ " \n",
+ " # create values to compare on \n",
+ " edge_source_target = (edge[1]['source'], edge[1]['target'])\n",
+ " edge_target_source = (edge[1]['target'], edge[1]['source'])\n",
+ " inner_edge_target_source = (inner_edge[1]['target'], inner_edge[1]['source'])\n",
+ " inner_edge_source_target = (inner_edge[1]['source'], inner_edge[1]['target'])\n",
+ " \n",
+ " \n",
+ "\n",
+ " if edge_source_target == inner_edge_target_source: # match, remove it \n",
+ " \n",
+ " print ('match')\n",
+ " print(edge_source_target)\n",
+ " print(inner_edge_target_source)\n",
+ " \n",
+ " # we need to check that the outer edge is not already on the deletion_list \n",
+ " # otherwise we end up removing *all* the edges, not just the duplicates \n",
+ " \n",
+ " # we need to also check that the transverse of the outer edge is not already on the deletion_list\n",
+ " # otherwise we will end up deleting *both* of the edges\n",
+ " # not just one of them \n",
+ " \n",
+ " if (([inner_edge[0], edge[0]]) not in deletion_list) \\\n",
+ " and (([edge[0], inner_edge[0]]) not in deletion_list) : \n",
+ " deletion_list.append([inner_edge[0], edge[0]])\n",
+ " print ('added ', ([inner_edge[0], edge[0]]), ' to deletion_list')\n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[[14, 6]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print (deletion_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1\n"
+ ]
+ }
+ ],
+ "source": [
+ "print (len(deletion_list))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'source': 2, 'target': 4, 'weight': 7}\n",
+ "{'source': 4, 'target': 2, 'weight': 1}\n",
+ "now deleting: 14\n",
+ "7\n",
+ "1\n",
+ "now adding weights to: 6\n",
+ "8\n"
+ ]
+ }
+ ],
+ "source": [
+ "# delete the edges in the deletion list, but transfer their weights to the edge that was de-duplicated\n",
+ "\n",
+ "for edge_pair in deletion_list: \n",
+ " print(accent_edges[edge_pair[1]])\n",
+ " print(accent_edges[edge_pair[0]])\n",
+ " \n",
+ " print('now deleting: ', edge_pair[0])\n",
+ " \n",
+ " print(accent_edges[edge_pair[1]]['weight'])\n",
+ " print(accent_edges[edge_pair[0]]['weight'])\n",
+ " \n",
+ " \n",
+ " print ('now adding weights to: ', edge_pair[1])\n",
+ " accent_edges[edge_pair[1]]['weight'] += accent_edges[edge_pair[0]]['weight']\n",
+ " print(accent_edges[edge_pair[1]]['weight'])\n",
+ " del accent_edges[edge_pair[0]]\n",
+ " \n",
+ "\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 156,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "print(len(accent_edges)) # this should be the pre-deletion count, less the number of bidirectional edges removed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: {'source': 2, 'target': 3, 'weight': 1},\n",
+ " 1: {'source': 4, 'target': 5, 'weight': 1},\n",
+ " 2: {'source': 2, 'target': 8, 'weight': 1},\n",
+ " 3: {'source': 2, 'target': 14, 'weight': 1},\n",
+ " 4: {'source': 2, 'target': 15, 'weight': 1},\n",
+ " 5: {'source': 14, 'target': 15, 'weight': 1},\n",
+ " 6: {'source': 2, 'target': 4, 'weight': 8},\n",
+ " 9: {'source': 23, 'target': 4, 'weight': 1},\n",
+ " 15: {'source': 2, 'target': 39, 'weight': 1}}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_edges)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{ 0: [1],\n",
+ " 1: [2, 3],\n",
+ " 2: [4, 5],\n",
+ " 3: [],\n",
+ " 4: [2],\n",
+ " 5: [2, 8],\n",
+ " 6: [5],\n",
+ " 7: [10],\n",
+ " 8: [11],\n",
+ " 9: [12],\n",
+ " 10: [2, 14, 15],\n",
+ " 11: [2, 4],\n",
+ " 12: [2, 4],\n",
+ " 13: [2, 4],\n",
+ " 14: [10],\n",
+ " 15: [23, 4],\n",
+ " 16: [2, 4],\n",
+ " 17: [3],\n",
+ " 18: [2, 4],\n",
+ " 19: [],\n",
+ " 20: [2, 4],\n",
+ " 21: [12],\n",
+ " 22: [2, 4],\n",
+ " 23: [4],\n",
+ " 24: [4, 2],\n",
+ " 25: [2, 39]}\n"
+ ]
+ }
+ ],
+ "source": [
+ "pp.pprint(accent_nodes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "---\n",
+ "there are 32 lines, excellent\n",
+ "---\n",
+ "---\n",
+ "there are 10 lines, excellent\n",
+ "---\n",
+ "---\n",
+ "there are 16 lines, excellent\n",
+ "---\n"
+ ]
+ }
+ ],
+ "source": [
+ "# let's do some sanity checking to make sure these are correct \n",
+ "\n",
+ "# 0: {'source': 1, 'target': 2, 'weight': 32}\n",
+ "# this is equivalent to the occurrence [1, 2] in accent_nodes \n",
+ "# plus the occurrences of [2, 1] because we have removed bidirectional edges\n",
+ "# plus any occurrences where 1 or 2 occur in a list, such as [1, 5, 17, 2]\n",
+ "# and represents a relationship between \n",
+ "# 'England English' - accent id 1 - and 'United States English' - accent id 2\n",
+ "\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if (1 in node and 2 in node) :\n",
+ " print ('match')\n",
+ "\n",
+ "print ('---')\n",
+ "print ('there are 32 lines, excellent')\n",
+ "print ('---')\n",
+ "\n",
+ "\n",
+ "# 5: {'source': 2, 'target': 18, 'weight': 10},\n",
+ "\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if (18 in node and 2 in node) :\n",
+ " print ('match')\n",
+ "\n",
+ "print ('---')\n",
+ "print ('there are 10 lines, excellent')\n",
+ "print ('---')\n",
+ "\n",
+ "# 64: {'source': 2, 'target': 1325, 'weight': 16}\n",
+ "\n",
+ "for idx, node in accent_nodes.items(): \n",
+ " if (2 in node and 1325 in node) :\n",
+ " print ('match')\n",
+ "\n",
+ "print ('---')\n",
+ "print ('there are 16 lines, excellent')\n",
+ "print ('---')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# I am now confident that the edges are being represented correctly"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# export the edges to a file \n",
+ "\n",
+ "filePath = links_filename\n",
+ "\n",
+ "with open(filePath, \"w\") as outfile:\n",
+ " json.dump(accent_edges, outfile)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Some miscellaneous reporting for the paper\n",
+ "\n",
+ "I want to get counts by the category of the accent descriptors and the predetermined accents. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n"
+ ]
+ }
+ ],
+ "source": [
+ "reload(cva)\n",
+ "\n",
+ "predetermined = all_accents.reportPredeterminedAccents()\n",
+ "pp.pprint(predetermined)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[ ['First or other language', 3],\n",
+ " ['Subnational region', 3],\n",
+ " ['Country', 2],\n",
+ " ['Register', 2],\n",
+ " ['Supranational region', 1],\n",
+ " ['Accent strength descriptor', 1],\n",
+ " ['Linguistic heritage of speaker', 1]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "reload (cva) \n",
+ "accent_category_counts = all_accents.reportAccentDescriptorCategories()\n",
+ "pp.pprint(accent_category_counts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "13\n"
+ ]
+ }
+ ],
+ "source": [
+ "total = 0\n",
+ "for accent_category_count in accent_category_counts: \n",
+ " total+=accent_category_count[1]\n",
+ " \n",
+ "print (total)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n"
+ ]
+ }
+ ],
+ "source": [
+ "reload (cva) \n",
+ "accent_multi_descriptor_counts = all_accents.reportMultipleAccentDescriptors()\n",
+ "print(accent_multi_descriptor_counts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for accent in accent_multi_descriptor_counts: \n",
+ " print('\\naccent is:', accent[1]._name)\n",
+ " for descriptor in accent[1]._descriptors: \n",
+ " print(descriptor._name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id is 1, name is Good, count is 1, locale is en, descriptors are [], predetermined is False. id is 2, name is Kiswahili accent, count is 13, locale is en, descriptors are [], predetermined is False. id is 3, name is Coastal Swahili, count is 2, locale is en, descriptors are [], predetermined is False. id is 4, name is Fluent, count is 11, locale is en, descriptors are [], predetermined is False. id is 5, name is kiMvita, count is 2, locale is en, descriptors are [], predetermined is False. id is 8, name is Strong, count is 1, locale is en, descriptors are [], predetermined is False. id is 10, name is Kenyan, count is 2, locale is en, descriptors are [], predetermined is False. id is 11, name is Lived in area, count is 1, locale is en, descriptors are [], predetermined is False. id is 12, name is native, count is 2, locale is en, descriptors are [], predetermined is False. id is 14, name is Tanzania, count is 1, locale is en, descriptors are [], predetermined is False. id is 15, name is academic, count is 1, locale is en, descriptors are [], predetermined is False. id is 23, name is Eloquent, count is 1, locale is en, descriptors are [], predetermined is False. id is 39, name is Arusha, count is 1, locale is en, descriptors are [], predetermined is False.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(all_accents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "kathy-cvaccents",
+ "language": "python",
+ "name": "kathy-cvaccents"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/README.pdf b/README.pdf
new file mode 100644
index 0000000..25dbcf2
Binary files /dev/null and b/README.pdf differ
diff --git a/cvaccents-v13.ipynb b/cvaccents-v13.ipynb
index 44327c3..3f0c65a 100644
--- a/cvaccents-v13.ipynb
+++ b/cvaccents-v13.ipynb
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -103,7 +103,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -129,7 +129,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -167,7 +167,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -184,7 +184,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -195,7 +195,7 @@
" dtype='object')"
]
},
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -206,7 +206,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -215,7 +215,7 @@
"Index(['client_id', 'age', 'gender', 'accents', 'variant'], dtype='object')"
]
},
- "execution_count": 6,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -231,7 +231,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -240,7 +240,7 @@
"1689234"
]
},
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -251,7 +251,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {
"scrolled": true
},
@@ -262,7 +262,7 @@
"907238"
]
},
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -274,7 +274,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -439,7 +439,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -448,7 +448,7 @@
"907238"
]
},
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -466,7 +466,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -475,7 +475,7 @@
"15341"
]
},
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -487,7 +487,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -496,7 +496,7 @@
"15341"
]
},
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -537,7 +537,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -557,7 +557,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -618,7 +618,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -635,7 +635,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -13525,7 +13525,13 @@
"12880 ['England English']\n",
"12881 ['England English']\n",
"12882 ['United States English']\n",
- "12883 ['England English']\n",
+ "12883 ['England English']\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"12884 ['United States English']\n",
"12885 ['United States English']\n",
"12886 ['United States English']\n",
@@ -16074,7 +16080,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -16125,7 +16131,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -16186,15 +16192,15 @@
"processed mostly American with some British and Australian inflections to be ['United States English', 'British English', 'Australian English', 'mostly'] and the old accent list is: ['United States English', 'British English', 'Australian English', 'mostly'] and the new accent list is: ['United States English', 'British English', 'Australian English', 'mostly']\n",
"processed Born and lived in eastern VA for 8 years. Then lived in southern CA for 13 years. Lived in MD to be ['Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area'] and the old accent list is: ['United States English', 'NC', 'WA', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area'] and the new accent list is: ['United States English', 'NC', 'WA', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area']\n",
"processed NC to be [''] and the old accent list is: ['United States English', 'WA', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', ''] and the new accent list is: ['United States English', 'WA', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', '']\n",
- "processed WA to be [''] and the old accent list is: ['United States English', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', ''] and the new accent list is: ['United States English', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', '']\n"
+ "processed WA to be [''] and the old accent list is: ['United States English', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', ''] and the new accent list is: ['United States English', 'HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', '']\n",
+ "processed HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area to be [''] and the old accent list is: ['United States English', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', ''] and the new accent list is: ['United States English', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', '']\n",
+ "processed United States-West Coast-Alaska to be ['United States English', 'West Coast United States', 'Alaska'] and the old accent list is: ['United States English', 'United States-Midwestern', 'United States English', 'West Coast United States', 'Alaska'] and the new accent list is: ['United States English', 'United States-Midwestern', 'United States English', 'West Coast United States', 'Alaska']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "processed HI for 1-3 years each. Spent 30 years in Washington DC area and 17 years in Northern KY/Cincinnati OH area to be [''] and the old accent list is: ['United States English', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', ''] and the new accent list is: ['United States English', 'Eastern Virginia', 'Southern California', 'Maryland', 'Northern Carolina', 'Washington State', \"Hawai'i\", 'Washington DC', 'Northern Kentucky', 'Cincinatti', 'Ohio', 'Born in area', 'Lived in area', '']\n",
- "processed United States-West Coast-Alaska to be ['United States English', 'West Coast United States', 'Alaska'] and the old accent list is: ['United States English', 'United States-Midwestern', 'United States English', 'West Coast United States', 'Alaska'] and the new accent list is: ['United States English', 'United States-Midwestern', 'United States English', 'West Coast United States', 'Alaska']\n",
"processed Slight Latino accent to be ['slight', 'Latino'] and the old accent list is: ['United States English', 'immigrant', 'slight', 'Latino'] and the new accent list is: ['United States English', 'immigrant', 'slight', 'Latino']\n",
"processed Slight Dutch accent to be ['slight', 'Dutch'] and the old accent list is: ['United States English', 'slight', 'Dutch'] and the new accent list is: ['United States English', 'slight', 'Dutch']\n",
"processed Slavic English to be ['Slavic'] and the old accent list is: ['England English', 'United States English', 'English with Polish accent', 'Slavic'] and the new accent list is: ['England English', 'United States English', 'English with Polish accent', 'Slavic']\n",
@@ -16591,7 +16597,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
@@ -16675,14 +16681,14 @@
"processed British English to be ['British'] and the old accent list is: ['United States English', 'Australian English', 'mostly', 'British'] and the new accent list is: ['United States English', 'Australian English', 'mostly', 'British']\n",
"processed U.K. English to be ['British'] and the old accent list is: ['British'] and the new accent list is: ['British']\n",
"processed English (UK) to be ['British'] and the old accent list is: ['British'] and the new accent list is: ['British']\n",
- "processed British accent to be ['British'] and the old accent list is: ['British'] and the new accent list is: ['British']\n",
- "processed england to be ['England English'] and the old accent list is: ['london', 'academic', 'England English'] and the new accent list is: ['london', 'academic', 'England English']\n"
+ "processed British accent to be ['British'] and the old accent list is: ['British'] and the new accent list is: ['British']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
+ "processed england to be ['England English'] and the old accent list is: ['london', 'academic', 'England English'] and the new accent list is: ['london', 'academic', 'England English']\n",
"processed England to be ['England English'] and the old accent list is: ['England English', 'Lancashire', 'England English'] and the new accent list is: ['England English', 'Lancashire', 'England English']\n",
"processed English to be ['England English'] and the old accent list is: ['Irish English', 'England English'] and the new accent list is: ['Irish English', 'England English']\n",
"processed Cambridge English to be ['Cambridge'] and the old accent list is: ['England English', 'Bedford English', 'Cambridge'] and the new accent list is: ['England English', 'Bedford English', 'Cambridge']\n",
@@ -16736,17 +16742,17 @@
"processed french accent to be ['French'] and the old accent list is: ['French'] and the new accent list is: ['French']\n",
"processed french accent to be ['French'] and the old accent list is: ['French'] and the new accent list is: ['French']\n",
"processed french accent to be ['French'] and the old accent list is: ['French'] and the new accent list is: ['French']\n",
- "processed french accent to be ['French'] and the old accent list is: ['French', 'mid level', 'French'] and the new accent list is: ['French', 'mid level', 'French']\n",
- "processed french english to be ['French'] and the old accent list is: ['French'] and the new accent list is: ['French']\n",
- "processed Swiss English to be ['Swiss'] and the old accent list is: ['Swiss'] and the new accent list is: ['Swiss']\n",
- "processed European English to be ['European'] and the old accent list is: ['French', 'Romanian', 'European'] and the new accent list is: ['French', 'Romanian', 'European']\n",
- "processed European English to be ['European'] and the old accent list is: ['United States English', 'European'] and the new accent list is: ['United States English', 'European']\n"
+ "processed french accent to be ['French'] and the old accent list is: ['French', 'mid level', 'French'] and the new accent list is: ['French', 'mid level', 'French']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
+ "processed french english to be ['French'] and the old accent list is: ['French'] and the new accent list is: ['French']\n",
+ "processed Swiss English to be ['Swiss'] and the old accent list is: ['Swiss'] and the new accent list is: ['Swiss']\n",
+ "processed European English to be ['European'] and the old accent list is: ['French', 'Romanian', 'European'] and the new accent list is: ['French', 'Romanian', 'European']\n",
+ "processed European English to be ['European'] and the old accent list is: ['United States English', 'European'] and the new accent list is: ['United States English', 'European']\n",
"processed European English to be ['European'] and the old accent list is: ['European'] and the new accent list is: ['European']\n",
"processed Generic European to be ['European'] and the old accent list is: ['European'] and the new accent list is: ['European']\n",
"processed eastern europe to be ['Eastern European'] and the old accent list is: ['European', \"pronounced r's\", 'heavy consonants', 'Eastern European'] and the new accent list is: ['European', \"pronounced r's\", 'heavy consonants', 'Eastern European']\n",
@@ -16792,20 +16798,20 @@
"processed Latino to be ['Hispanic / Latino'] and the old accent list is: ['United States English', 'second language', 'little', 'Hispanic / Latino'] and the new accent list is: ['United States English', 'second language', 'little', 'Hispanic / Latino']\n",
"processed Colombian Accent to be ['Colombian'] and the old accent list is: ['Colombian'] and the new accent list is: ['Colombian']\n",
"processed Colombia to be ['Colombian'] and the old accent list is: ['United States English', 'Colombian'] and the new accent list is: ['United States English', 'Colombian']\n",
- "processed Brazillian Accent to be ['Brazilian'] and the old accent list is: ['Brazilian'] and the new accent list is: ['Brazilian']\n",
- "processed Argentinian English to be ['Argentinian'] and the old accent list is: ['Argentinian'] and the new accent list is: ['Argentinian']\n",
- "processed Chinese English to be ['Chinese'] and the old accent list is: ['Chinese'] and the new accent list is: ['Chinese']\n",
- "processed Thai English to be ['Thai'] and the old accent list is: ['Singaporean English', 'Thai'] and the new accent list is: ['Singaporean English', 'Thai']\n",
- "processed Japanese English to be ['Japanese'] and the old accent list is: ['Japanese'] and the new accent list is: ['Japanese']\n",
- "processed Japanese English to be ['Japanese'] and the old accent list is: ['Japanese'] and the new accent list is: ['Japanese']\n",
- "processed Indonesian English to be ['Indonesian'] and the old accent list is: ['Javanese', 'Malaysian English', 'Indonesian'] and the new accent list is: ['Javanese', 'Malaysian English', 'Indonesian']\n",
- "processed Asian-American English to be ['Asian-American'] and the old accent list is: ['United States English', 'Asian-American'] and the new accent list is: ['United States English', 'Asian-American']\n"
+ "processed Brazillian Accent to be ['Brazilian'] and the old accent list is: ['Brazilian'] and the new accent list is: ['Brazilian']\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
+ "processed Argentinian English to be ['Argentinian'] and the old accent list is: ['Argentinian'] and the new accent list is: ['Argentinian']\n",
+ "processed Chinese English to be ['Chinese'] and the old accent list is: ['Chinese'] and the new accent list is: ['Chinese']\n",
+ "processed Thai English to be ['Thai'] and the old accent list is: ['Singaporean English', 'Thai'] and the new accent list is: ['Singaporean English', 'Thai']\n",
+ "processed Japanese English to be ['Japanese'] and the old accent list is: ['Japanese'] and the new accent list is: ['Japanese']\n",
+ "processed Japanese English to be ['Japanese'] and the old accent list is: ['Japanese'] and the new accent list is: ['Japanese']\n",
+ "processed Indonesian English to be ['Indonesian'] and the old accent list is: ['Javanese', 'Malaysian English', 'Indonesian'] and the new accent list is: ['Javanese', 'Malaysian English', 'Indonesian']\n",
+ "processed Asian-American English to be ['Asian-American'] and the old accent list is: ['United States English', 'Asian-American'] and the new accent list is: ['United States English', 'Asian-American']\n",
"processed bangladesh to be ['Bangladeshi'] and the old accent list is: ['India and South Asia (India, Pakistan, Sri Lanka)', 'Bangladeshi'] and the new accent list is: ['India and South Asia (India, Pakistan, Sri Lanka)', 'Bangladeshi']\n",
"processed Bangladesh English to be ['Bangladeshi'] and the old accent list is: ['Bangladeshi', 'Bangladeshi'] and the new accent list is: ['Bangladeshi', 'Bangladeshi']\n",
"processed Bangladeshi English to be ['Bangladeshi'] and the old accent list is: ['Bangladeshi'] and the new accent list is: ['Bangladeshi']\n",
@@ -16840,7 +16846,13 @@
"processed some time spent in location to be ['time spent in location'] and the old accent list is: ['Spanish', 'Polish', 'Southern United States', 'time spent in location', 'German', 'some', 'Midwest United States', 'Mix of accents', 'time spent in location'] and the new accent list is: ['Spanish', 'Polish', 'Southern United States', 'time spent in location', 'German', 'some', 'Midwest United States', 'Mix of accents', 'time spent in location']\n",
"processed some time spent in location to be ['time spent in location'] and the old accent list is: ['United States English', 'mostly affecting inflection', 'I think', 'Scottish English', 'time spent in location'] and the new accent list is: ['United States English', 'mostly affecting inflection', 'I think', 'Scottish English', 'time spent in location']\n",
"processed little bit to be ['little'] and the old accent list is: ['United States English', 'classy', 'sassy', 'city', 'little'] and the new accent list is: ['United States English', 'classy', 'sassy', 'city', 'little']\n",
- "processed just practicing to be and the old accent list is: ['New Zealand English', 'Non-native speaker'] and the new accent list is: ['New Zealand English', 'Non-native speaker']\n",
+ "processed just practicing to be and the old accent list is: ['New Zealand English', 'Non-native speaker'] and the new accent list is: ['New Zealand English', 'Non-native speaker']\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
"processed Yes Please. to be and the old accent list is: ['India and South Asia (India, Pakistan, Sri Lanka)'] and the new accent list is: ['India and South Asia (India, Pakistan, Sri Lanka)']\n",
"processed I have none that I can tell. to be and the old accent list is: [] and the new accent list is: []\n"
]
@@ -17535,7 +17547,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -24211,7 +24223,13 @@
" ['Canadian English'],\n",
" ['United States English', 'Australian English', 'mostly', 'British'],\n",
" ['England English'],\n",
- " ['United States English'],\n",
+ " ['United States English'],\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
" ['United States English'],\n",
" ['United States English'],\n",
" ['United States English'],\n",
@@ -33018,7 +33036,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -33087,7 +33105,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -33099,7 +33117,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
@@ -33116,7 +33134,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -33125,7 +33143,7 @@
"'id is 1, name is United States English, count is 7800, locale is en, descriptors are None, predetermined is False. id is 2, name is Midwest United States, count is 22, locale is en, descriptors are None, predetermined is False. id is 3, name is Hong Kong English, count is 136, locale is en, descriptors are None, predetermined is False. id is 4, name is Filipino, count is 134, locale is en, descriptors are None, predetermined is False. id is 7, name is Wolof, count is 1, locale is en, descriptors are None, predetermined is False. id is 8, name is England English, count is 2415, locale is en, descriptors are None, predetermined is False. id is 9, name is Australian English, count is 681, locale is en, descriptors are None, predetermined is False. id is 13, name is Southern African (South Africa, Zimbabwe, Namibia), count is 271, locale is en, descriptors are None, predetermined is False. id is 16, name is India and South Asia (India, Pakistan, Sri Lanka), count is 2089, locale is en, descriptors are None, predetermined is False. id is 31, name is Puerto Rican, count is 1, locale is en, descriptors are None, predetermined is False. id is 32, name is Florida, count is 1, locale is en, descriptors are None, predetermined is False. id is 33, name is New York, count is 2, locale is en, descriptors are None, predetermined is False. id is 34, name is Long Island, count is 1, locale is en, descriptors are None, predetermined is False. id is 35, name is Savannah, count is 1, locale is en, descriptors are None, predetermined is False. id is 36, name is Georgia, count is 1, locale is en, descriptors are None, predetermined is False. id is 37, name is Hispanic / Latino, count is 10, locale is en, descriptors are None, predetermined is False. id is 40, name is Canadian English, count is 933, locale is en, descriptors are None, predetermined is False. id is 55, name is Polish, count is 9, locale is en, descriptors are None, predetermined is False. 
id is 69, name is Scottish English, count is 175, locale is en, descriptors are None, predetermined is False. id is 74, name is Swedish, count is 5, locale is en, descriptors are None, predetermined is False. id is 85, name is German, count is 27, locale is en, descriptors are None, predetermined is False. id is 86, name is Denglish, count is 1, locale is en, descriptors are None, predetermined is False. id is 101, name is Upstate New York, count is 1, locale is en, descriptors are None, predetermined is False. id is 132, name is Singaporean English, count is 81, locale is en, descriptors are None, predetermined is False. id is 155, name is Russian, count is 9, locale is en, descriptors are None, predetermined is False. id is 156, name is Chichester, count is 1, locale is en, descriptors are None, predetermined is False. id is 159, name is Gay, count is 2, locale is en, descriptors are None, predetermined is False. id is 161, name is Welsh English, count is 71, locale is en, descriptors are None, predetermined is False. id is 168, name is West Indies and Bermuda (Bahamas, Bermuda, Jamaica, Trinidad), count is 52, locale is en, descriptors are None, predetermined is False. id is 182, name is Non-native speaker, count is 11, locale is en, descriptors are None, predetermined is False. id is 220, name is Irish English, count is 193, locale is en, descriptors are None, predetermined is False. id is 245, name is New Zealand English, count is 166, locale is en, descriptors are None, predetermined is False. id is 251, name is Malaysian English, count is 100, locale is en, descriptors are None, predetermined is False. id is 272, name is Thai, count is 3, locale is en, descriptors are None, predetermined is False. id is 321, name is West Coast United States, count is 3, locale is en, descriptors are None, predetermined is False. id is 326, name is Brooklyn, count is 2, locale is en, descriptors are None, predetermined is False. 
id is 331, name is Midlands English, count is 1, locale is en, descriptors are None, predetermined is False. id is 340, name is Southern New England, count is 1, locale is en, descriptors are None, predetermined is False. id is 341, name is Boston, count is 2, locale is en, descriptors are None, predetermined is False. id is 342, name is Worcester, count is 1, locale is en, descriptors are None, predetermined is False. id is 343, name is Lowell, count is 1, locale is en, descriptors are None, predetermined is False. id is 469, name is Greek, count is 2, locale is en, descriptors are None, predetermined is False. id is 475, name is Dutch, count is 6, locale is en, descriptors are None, predetermined is False. id is 476, name is Catalan, count is 1, locale is en, descriptors are None, predetermined is False. id is 520, name is California, count is 5, locale is en, descriptors are None, predetermined is False. id is 533, name is African American Vernacular, count is 1, locale is en, descriptors are None, predetermined is False. id is 534, name is Mid-atlantic United States, count is 5, locale is en, descriptors are None, predetermined is False. id is 547, name is French, count is 16, locale is en, descriptors are None, predetermined is False. id is 549, name is Eastern Virginia, count is 1, locale is en, descriptors are None, predetermined is False. id is 550, name is Southern California, count is 2, locale is en, descriptors are None, predetermined is False. id is 551, name is Maryland, count is 1, locale is en, descriptors are None, predetermined is False. id is 552, name is Northern Carolina, count is 1, locale is en, descriptors are None, predetermined is False. id is 553, name is Washington State, count is 1, locale is en, descriptors are None, predetermined is False. id is 554, name is Hawai\\'i, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 555, name is Washington DC, count is 1, locale is en, descriptors are None, predetermined is False. id is 556, name is Northern Kentucky, count is 1, locale is en, descriptors are None, predetermined is False. id is 557, name is Cincinatti, count is 1, locale is en, descriptors are None, predetermined is False. id is 558, name is Ohio, count is 2, locale is en, descriptors are None, predetermined is False. id is 559, name is Born in area, count is 2, locale is en, descriptors are None, predetermined is False. id is 560, name is Lived in area, count is 2, locale is en, descriptors are None, predetermined is False. id is 566, name is Northeastern United States, count is 2, locale is en, descriptors are None, predetermined is False. id is 613, name is Glaswegian, count is 1, locale is en, descriptors are None, predetermined is False. id is 618, name is Cambridge, count is 1, locale is en, descriptors are None, predetermined is False. id is 619, name is Bedford, count is 1, locale is en, descriptors are None, predetermined is False. id is 636, name is Alaska, count is 1, locale is en, descriptors are None, predetermined is False. id is 643, name is Neutral, count is 1, locale is en, descriptors are None, predetermined is False. id is 644, name is slow, count is 1, locale is en, descriptors are None, predetermined is False. id is 724, name is Argentinian, count is 1, locale is en, descriptors are None, predetermined is False. id is 801, name is Bisaya, count is 1, locale is en, descriptors are None, predetermined is False. id is 814, name is Haitian Creole, count is 1, locale is en, descriptors are None, predetermined is False. id is 848, name is British, count is 6, locale is en, descriptors are None, predetermined is False. id is 851, name is slight, count is 5, locale is en, descriptors are None, predetermined is False. id is 875, name is Chicago, count is 2, locale is en, descriptors are None, predetermined is False. 
id is 876, name is Gen Z, count is 1, locale is en, descriptors are None, predetermined is False. id is 919, name is Scandinavian, count is 1, locale is en, descriptors are None, predetermined is False. id is 933, name is Western Europe, count is 1, locale is en, descriptors are None, predetermined is False. id is 962, name is Durban, count is 1, locale is en, descriptors are None, predetermined is False. id is 988, name is Second language, count is 7, locale is en, descriptors are None, predetermined is False. id is 1005, name is country, count is 1, locale is en, descriptors are None, predetermined is False. id is 1035, name is slurred, count is 1, locale is en, descriptors are None, predetermined is False. id is 1079, name is South Australia, count is 1, locale is en, descriptors are None, predetermined is False. id is 1081, name is Northern California, count is 1, locale is en, descriptors are None, predetermined is False. id is 1120, name is Received Pronunciation, count is 2, locale is en, descriptors are None, predetermined is False. id is 1160, name is Israeli, count is 4, locale is en, descriptors are None, predetermined is False. id is 1242, name is Italian, count is 2, locale is en, descriptors are None, predetermined is False. id is 1287, name is Standard American English, count is 1, locale is en, descriptors are None, predetermined is False. id is 1297, name is Southern Texas, count is 3, locale is en, descriptors are None, predetermined is False. id is 1311, name is Serbian, count is 2, locale is en, descriptors are None, predetermined is False. id is 1390, name is New York City, count is 1, locale is en, descriptors are None, predetermined is False. id is 1428, name is Nigerian, count is 4, locale is en, descriptors are None, predetermined is False. id is 1483, name is Finnish, count is 1, locale is en, descriptors are None, predetermined is False. id is 1544, name is surfer, count is 2, locale is en, descriptors are None, predetermined is False. 
id is 1546, name is Not bad, count is 1, locale is en, descriptors are None, predetermined is False. id is 1550, name is East Texas, count is 1, locale is en, descriptors are None, predetermined is False. id is 1551, name is Texas, count is 3, locale is en, descriptors are None, predetermined is False. id is 1603, name is Northern, count is 1, locale is en, descriptors are None, predetermined is False. id is 1604, name is Urban, count is 1, locale is en, descriptors are None, predetermined is False. id is 1774, name is Chinese, count is 3, locale is en, descriptors are None, predetermined is False. id is 1941, name is Middle class, count is 1, locale is en, descriptors are None, predetermined is False. id is 1942, name is London, count is 3, locale is en, descriptors are None, predetermined is False. id is 1943, name is West Country, count is 1, locale is en, descriptors are None, predetermined is False. id is 1953, name is Durham, count is 1, locale is en, descriptors are None, predetermined is False. id is 1969, name is Japanese, count is 3, locale is en, descriptors are None, predetermined is False. id is 1981, name is Ukrainian, count is 2, locale is en, descriptors are None, predetermined is False. id is 1982, name is pronounced, count is 1, locale is en, descriptors are None, predetermined is False. id is 1984, name is South Atlantic (Falkland Islands, Saint Helena), count is 5, locale is en, descriptors are None, predetermined is False. id is 2002, name is Southern United States, count is 7, locale is en, descriptors are None, predetermined is False. id is 2006, name is Educated, count is 2, locale is en, descriptors are None, predetermined is False. id is 2007, name is non regional, count is 1, locale is en, descriptors are None, predetermined is False. id is 2010, name is Rhode Island, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 2011, name is Massachusetts, count is 1, locale is en, descriptors are None, predetermined is False. id is 2102, name is Pacific Northwest United States, count is 1, locale is en, descriptors are None, predetermined is False. id is 2129, name is plain, count is 1, locale is en, descriptors are None, predetermined is False. id is 2132, name is Southern drawl, count is 2, locale is en, descriptors are None, predetermined is False. id is 2249, name is Basic, count is 1, locale is en, descriptors are None, predetermined is False. id is 2268, name is Hunglish, count is 1, locale is en, descriptors are None, predetermined is False. id is 2269, name is East London, count is 1, locale is en, descriptors are None, predetermined is False. id is 2275, name is Colombian, count is 2, locale is en, descriptors are None, predetermined is False. id is 2406, name is A\\'lo, count is 1, locale is en, descriptors are None, predetermined is False. id is 2502, name is Variable, count is 1, locale is en, descriptors are None, predetermined is False. id is 2521, name is Silicon Valley, count is 1, locale is en, descriptors are None, predetermined is False. id is 2522, name is Native speaker, count is 1, locale is en, descriptors are None, predetermined is False. id is 2537, name is slang, count is 1, locale is en, descriptors are None, predetermined is False. id is 2645, name is Brazilian, count is 3, locale is en, descriptors are None, predetermined is False. id is 2683, name is touch, count is 1, locale is en, descriptors are None, predetermined is False. id is 2697, name is Slavic, count is 7, locale is en, descriptors are None, predetermined is False. id is 2742, name is Northern Irish, count is 5, locale is en, descriptors are None, predetermined is False. id is 2818, name is New Jersey, count is 1, locale is en, descriptors are None, predetermined is False. id is 2946, name is Mild, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 2947, name is Northern England, count is 5, locale is en, descriptors are None, predetermined is False. id is 2973, name is Kenyan, count is 5, locale is en, descriptors are None, predetermined is False. id is 2997, name is Romanian, count is 2, locale is en, descriptors are None, predetermined is False. id is 2998, name is European, count is 8, locale is en, descriptors are None, predetermined is False. id is 3045, name is Southern Appalachian English, count is 1, locale is en, descriptors are None, predetermined is False. id is 3076, name is Pacific Northwest, count is 2, locale is en, descriptors are None, predetermined is False. id is 3151, name is formal, count is 1, locale is en, descriptors are None, predetermined is False. id is 3152, name is sultry, count is 1, locale is en, descriptors are None, predetermined is False. id is 3182, name is academic, count is 4, locale is en, descriptors are None, predetermined is False. id is 3579, name is Yorkshire, count is 2, locale is en, descriptors are None, predetermined is False. id is 3589, name is changes due to oral surgery, count is 1, locale is en, descriptors are None, predetermined is False. id is 3849, name is classy, count is 1, locale is en, descriptors are None, predetermined is False. id is 3850, name is sassy, count is 1, locale is en, descriptors are None, predetermined is False. id is 3851, name is city, count is 1, locale is en, descriptors are None, predetermined is False. id is 3852, name is little, count is 2, locale is en, descriptors are None, predetermined is False. id is 3904, name is Minnesotan, count is 2, locale is en, descriptors are None, predetermined is False. id is 4263, name is Norwegian, count is 3, locale is en, descriptors are None, predetermined is False. id is 4280, name is Patois, count is 1, locale is en, descriptors are None, predetermined is False. id is 4468, name is Swiss German, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 4603, name is East Indian, count is 1, locale is en, descriptors are None, predetermined is False. id is 4606, name is pin/pen merger, count is 1, locale is en, descriptors are None, predetermined is False. id is 5065, name is East African, count is 1, locale is en, descriptors are None, predetermined is False. id is 5070, name is East Ukrainian, count is 1, locale is en, descriptors are None, predetermined is False. id is 5114, name is Spanish, count is 3, locale is en, descriptors are None, predetermined is False. id is 5115, name is Bilingual, count is 1, locale is en, descriptors are None, predetermined is False. id is 5214, name is Czech, count is 1, locale is en, descriptors are None, predetermined is False. id is 5340, name is Afrikaans English, count is 1, locale is en, descriptors are None, predetermined is False. id is 5459, name is Bangladeshi, count is 3, locale is en, descriptors are None, predetermined is False. id is 5512, name is Mexican, count is 1, locale is en, descriptors are None, predetermined is False. id is 5546, name is West Indian, count is 1, locale is en, descriptors are None, predetermined is False. id is 5577, name is Liverpool, count is 2, locale is en, descriptors are None, predetermined is False. id is 5602, name is International English, count is 4, locale is en, descriptors are None, predetermined is False. id is 5684, name is Swiss, count is 2, locale is en, descriptors are None, predetermined is False. id is 5701, name is Adjustable, count is 1, locale is en, descriptors are None, predetermined is False. id is 5867, name is Slovak, count is 1, locale is en, descriptors are None, predetermined is False. id is 6085, name is East Anglia, count is 1, locale is en, descriptors are None, predetermined is False. id is 6183, name is Northern European, count is 1, locale is en, descriptors are None, predetermined is False. id is 6315, name is Georgian English, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 6329, name is Mix of accents, count is 4, locale is en, descriptors are None, predetermined is False. id is 6499, name is pronounced r\\'s, count is 1, locale is en, descriptors are None, predetermined is False. id is 6500, name is heavy consonants, count is 1, locale is en, descriptors are None, predetermined is False. id is 6501, name is Eastern European, count is 5, locale is en, descriptors are None, predetermined is False. id is 6517, name is tinge, count is 1, locale is en, descriptors are None, predetermined is False. id is 6548, name is Pittsburgh, count is 1, locale is en, descriptors are None, predetermined is False. id is 6549, name is Pennsylvania, count is 3, locale is en, descriptors are None, predetermined is False. id is 6619, name is time spent in location, count is 2, locale is en, descriptors are None, predetermined is False. id is 6621, name is some, count is 2, locale is en, descriptors are None, predetermined is False. id is 6821, name is mostly, count is 1, locale is en, descriptors are None, predetermined is False. id is 7900, name is Sydney, count is 1, locale is en, descriptors are None, predetermined is False. id is 7901, name is Middle eastern seaboard Australian, count is 1, locale is en, descriptors are None, predetermined is False. id is 7916, name is Central American, count is 1, locale is en, descriptors are None, predetermined is False. id is 7971, name is Chicano English, count is 1, locale is en, descriptors are None, predetermined is False. id is 7972, name is \"Valley Girl\" English, count is 1, locale is en, descriptors are None, predetermined is False. id is 8052, name is Southern England, count is 3, locale is en, descriptors are None, predetermined is False. id is 8137, name is South Eastern United States, count is 1, locale is en, descriptors are None, predetermined is False. id is 8138, name is Georgia (United States), count is 1, locale is en, descriptors are None, predetermined is False. 
id is 8153, name is cot-caught merger, count is 1, locale is en, descriptors are None, predetermined is False. id is 8418, name is fluent, count is 1, locale is en, descriptors are None, predetermined is False. id is 8473, name is West African, count is 1, locale is en, descriptors are None, predetermined is False. id is 8507, name is Lancashire, count is 2, locale is en, descriptors are None, predetermined is False. id is 8608, name is West Coast England, count is 1, locale is en, descriptors are None, predetermined is False. id is 8687, name is Hmong, count is 1, locale is en, descriptors are None, predetermined is False. id is 8916, name is Kiwi, count is 1, locale is en, descriptors are None, predetermined is False. id is 9227, name is Cool, count is 1, locale is en, descriptors are None, predetermined is False. id is 9341, name is Asian-American, count is 1, locale is en, descriptors are None, predetermined is False. id is 9386, name is South German, count is 2, locale is en, descriptors are None, predetermined is False. id is 9440, name is Okie, count is 1, locale is en, descriptors are None, predetermined is False. id is 9675, name is New England, count is 1, locale is en, descriptors are None, predetermined is False. id is 9676, name is East coast United States, count is 1, locale is en, descriptors are None, predetermined is False. id is 10368, name is Spoke language when a child, count is 1, locale is en, descriptors are None, predetermined is False. id is 10370, name is 90%, count is 1, locale is en, descriptors are None, predetermined is False. id is 10371, name is 10%, count is 1, locale is en, descriptors are None, predetermined is False. id is 10677, name is Upper Midwestern, count is 1, locale is en, descriptors are None, predetermined is False. id is 10902, name is South London, count is 2, locale is en, descriptors are None, predetermined is False. id is 10903, name is Essex, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 11136, name is heavy, count is 1, locale is en, descriptors are None, predetermined is False. id is 11137, name is Cantonese, count is 1, locale is en, descriptors are None, predetermined is False. id is 11423, name is Kazakhstan, count is 1, locale is en, descriptors are None, predetermined is False. id is 11547, name is Transatlantic English, count is 2, locale is en, descriptors are None, predetermined is False. id is 11620, name is minor, count is 1, locale is en, descriptors are None, predetermined is False. id is 11853, name is mid level, count is 1, locale is en, descriptors are None, predetermined is False. id is 11873, name is Indonesian, count is 2, locale is en, descriptors are None, predetermined is False. id is 12245, name is Javanese, count is 1, locale is en, descriptors are None, predetermined is False. id is 12626, name is Michigan, count is 1, locale is en, descriptors are None, predetermined is False. id is 12857, name is mostly affecting inflection, count is 1, locale is en, descriptors are None, predetermined is False. id is 12858, name is I think, count is 1, locale is en, descriptors are None, predetermined is False. id is 13433, name is Austrian, count is 2, locale is en, descriptors are None, predetermined is False. id is 13612, name is Bulgarian, count is 1, locale is en, descriptors are None, predetermined is False. id is 13656, name is Slightly effeminate, count is 1, locale is en, descriptors are None, predetermined is False. id is 13657, name is Conversational, count is 1, locale is en, descriptors are None, predetermined is False. id is 13669, name is Alemannic German Accent, count is 1, locale is en, descriptors are None, predetermined is False. id is 13672, name is South West German, count is 1, locale is en, descriptors are None, predetermined is False. id is 13743, name is try to maintain originality, count is 1, locale is en, descriptors are None, predetermined is False. 
id is 13899, name is Slight lisp, count is 1, locale is en, descriptors are None, predetermined is False. id is 13910, name is Foreign, count is 2, locale is en, descriptors are None, predetermined is False. id is 13917, name is Ontario, count is 1, locale is en, descriptors are None, predetermined is False. id is 14128, name is Sussex, count is 1, locale is en, descriptors are None, predetermined is False. id is 14397, name is Culchie, count is 1, locale is en, descriptors are None, predetermined is False. id is 14472, name is Philadelphia, count is 1, locale is en, descriptors are None, predetermined is False. id is 14852, name is Northumbrian British English, count is 1, locale is en, descriptors are None, predetermined is False. id is 15116, name is Southwestern United States English, count is 1, locale is en, descriptors are None, predetermined is False. id is 15246, name is East African Khoja, count is 1, locale is en, descriptors are None, predetermined is False. id is 15303, name is Nepali, count is 1, locale is en, descriptors are None, predetermined is False. id is 15369, name is New Orleans, count is 1, locale is en, descriptors are None, predetermined is False. id is 15661, name is Low, count is 1, locale is en, descriptors are None, predetermined is False. id is 15662, name is Demure, count is 1, locale is en, descriptors are None, predetermined is False.'"
]
},
- "execution_count": 24,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -33136,7 +33154,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -33436,7 +33454,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -33470,498 +33488,498 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "(1, )\n",
+ "(1, )\n",
"1\n",
"changed id is 1, name is United States English, count is 7800, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(2, )\n",
+ "(2, )\n",
"2\n",
- "(3, )\n",
+ "(3, )\n",
"3\n",
"changed id is 3, name is Hong Kong English, count is 136, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(4, )\n",
+ "(4, )\n",
"4\n",
"changed id is 4, name is Filipino, count is 134, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(7, )\n",
+ "(7, )\n",
"7\n",
- "(8, )\n",
+ "(8, )\n",
"8\n",
"changed id is 8, name is England English, count is 2415, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(9, )\n",
+ "(9, )\n",
"9\n",
"changed id is 9, name is Australian English, count is 681, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(13, )\n",
+ "(13, )\n",
"13\n",
"changed id is 13, name is Southern African (South Africa, Zimbabwe, Namibia), count is 271, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(16, )\n",
+ "(16, )\n",
"16\n",
"changed id is 16, name is India and South Asia (India, Pakistan, Sri Lanka), count is 2089, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(31, )\n",
+ "(31, )\n",
"31\n",
- "(32, )\n",
+ "(32, )\n",
"32\n",
- "(33, )\n",
+ "(33, )\n",
"33\n",
- "(34, )\n",
+ "(34, )\n",
"34\n",
- "(35, )\n",
+ "(35, )\n",
"35\n",
- "(36, )\n",
+ "(36, )\n",
"36\n",
- "(37, )\n",
+ "(37, )\n",
"37\n",
- "(40, )\n",
+ "(40, )\n",
"40\n",
"changed id is 40, name is Canadian English, count is 933, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(55, )\n",
+ "(55, )\n",
"55\n",
- "(69, )\n",
+ "(69, )\n",
"69\n",
"changed id is 69, name is Scottish English, count is 175, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(74, )\n",
+ "(74, )\n",
"74\n",
- "(85, )\n",
+ "(85, )\n",
"85\n",
- "(86, )\n",
+ "(86, )\n",
"86\n",
- "(101, )\n",
+ "(101, )\n",
"101\n",
- "(132, )\n",
+ "(132, )\n",
"132\n",
"changed id is 132, name is Singaporean English, count is 81, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(155, )\n",
+ "(155, )\n",
"155\n",
- "(156, )\n",
+ "(156, )\n",
"156\n",
- "(159, )\n",
+ "(159, )\n",
"159\n",
- "(161, )\n",
+ "(161, )\n",
"161\n",
"changed id is 161, name is Welsh English, count is 71, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(168, )\n",
+ "(168, )\n",
"168\n",
"changed id is 168, name is West Indies and Bermuda (Bahamas, Bermuda, Jamaica, Trinidad), count is 52, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(182, )\n",
+ "(182, )\n",
"182\n",
- "(220, )\n",
+ "(220, )\n",
"220\n",
"changed id is 220, name is Irish English, count is 193, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(245, )\n",
+ "(245, )\n",
"245\n",
"changed id is 245, name is New Zealand English, count is 166, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(251, )\n",
+ "(251, )\n",
"251\n",
"changed id is 251, name is Malaysian English, count is 100, locale is en, descriptors are None, predetermined is True. status to True\n",
- "(272, )\n",
+ "(272, )\n",
"272\n",
- "(321, )\n",
+ "(321, )\n",
"321\n",
- "(326, )\n",
+ "(326, )\n",
"326\n",
- "(331, )\n",
+ "(331, )\n",
"331\n",
- "(340, )\n",
+ "(340, )\n",
"340\n",
- "(341, )\n",
+ "(341, )\n",
"341\n",
- "(342, )\n",
+ "(342, )\n",
"342\n",
- "(343, )\n",
+ "(343, )\n",
"343\n",
- "(469, )\n",
+ "(469, )\n",
"469\n",
- "(475, )\n",
+ "(475, )\n",
"475\n",
- "(476, )\n",
+ "(476, )\n",
"476\n",
- "(520, )\n",
+ "(520, )\n",
"520\n",
- "(533, )\n",
+ "(533, )\n",
"533\n",
- "(534, )\n",
+ "(534, )\n",
"534\n",
- "(547, )\n",
+ "(547, )\n",
"547\n",
- "(549, )\n",
+ "(549, )\n",
"549\n",
- "(550, )\n",
+ "(550, )\n",
"550\n",
- "(551, )\n",
+ "(551, )\n",
"551\n",
- "(552, )\n",
+ "(552, )\n",
"552\n",
- "(553, )\n",
+ "(553, )\n",
"553\n",
- "(554, )\n",
+ "(554, )\n",
"554\n",
- "(555, )\n",
+ "(555, )\n",
"555\n",
- "(556, )\n",
+ "(556, )\n",
"556\n",
- "(557, )\n",
+ "(557, )\n",
"557\n",
- "(558, )\n",
+ "(558, )\n",
"558\n",
- "(559, )\n",
+ "(559, )\n",
"559\n",
- "(560, )\n",
+ "(560, )\n",
"560\n",
- "(566, )\n",
+ "(566, )\n",
"566\n",
- "(613, )\n",
+ "(613, )\n",
"613\n",
- "(618, )\n",
+ "(618, )\n",
"618\n",
- "(619, )\n",
+ "(619, )\n",
"619\n",
- "(636, )\n",
+ "(636, )\n",
"636\n",
- "(643, )\n",
+ "(643, )\n",
"643\n",
- "(644, )\n",
+ "(644, )\n",
"644\n",
- "(724, )\n",
+ "(724, )\n",
"724\n",
- "(801, )\n",
+ "(801, )\n",
"801\n",
- "(814, )\n",
+ "(814, )\n",
"814\n",
- "(848, )\n",
+ "(848, )\n",
"848\n",
- "(851, )\n",
+ "(851, )\n",
"851\n",
- "(875, )\n",
+ "(875, )\n",
"875\n",
- "(876, )\n",
+ "(876, )\n",
"876\n",
- "(919, )\n",
+ "(919, )\n",
"919\n",
- "(933, )\n",
+ "(933, )\n",
"933\n",
- "(962, )\n",
+ "(962, )\n",
"962\n",
- "(988, )\n",
+ "(988, )\n",
"988\n",
- "(1005, )\n",
+ "(1005, )\n",
"1005\n",
- "(1035, )\n",
+ "(1035, )\n",
"1035\n",
- "(1079, )\n",
+ "(1079, )\n",
"1079\n",
- "(1081, )\n",
+ "(1081, )\n",
"1081\n",
- "(1120, )\n",
+ "(1120, )\n",
"1120\n",
- "(1160, )\n",
+ "(1160, )\n",
"1160\n",
- "(1242, )\n",
+ "(1242, )\n",
"1242\n",
- "(1287, )\n",
+ "(1287, )\n",
"1287\n",
- "(1297, )\n",
+ "(1297, )\n",
"1297\n",
- "(1311, )\n",
+ "(1311, )\n",
"1311\n",
- "(1390, )\n",
+ "(1390, )\n",
"1390\n",
- "(1428, )\n",
+ "(1428, )\n",
"1428\n",
- "(1483, )\n",
+ "(1483, )\n",
"1483\n",
- "(1544, )\n",
+ "(1544, )\n",
"1544\n",
- "(1546, )\n",
+ "(1546, )\n",
"1546\n",
- "(1550, )\n",
+ "(1550, )\n",
"1550\n",
- "(1551, )\n",
+ "(1551, )\n",
"1551\n",
- "(1603, )\n",
+ "(1603, )\n",
"1603\n",
- "(1604, )\n",
+ "(1604, )\n",
"1604\n",
- "(1774, )\n",
+ "(1774, )\n",
"1774\n",
- "(1941, )\n",
+ "(1941, )\n",
"1941\n",
- "(1942,