Implemented suggestions from review
tgurbich committed Aug 3, 2023
1 parent 001a2ab commit cde8397
88 changes: 45 additions & 43 deletions src/notebooks/Python Examples/GSC23 MGnify Workshop.ipynb
@@ -123,7 +123,7 @@
"source": [
"#### Task 1: Browse the recently analysed studies and save them to a file\n",
"\n",
"Let's start by loading 50 most recently analysed studies and saving them to a file called ```studies.csv``` (it would take too long to load all studies for this example so we limit the number at 50):"
"Let's start by loading the 50 most recently analysed studies and saving them to a file called ```studies.csv``` (it would take too long to load all studies for this example so we limit the number at 50):"
]
},
{
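For context, the cell this hunk edits boils down to the following sketch. It reuses the APISession / mgnify.iterate pattern visible in the genome-fetching hunk further down in this diff; the jsonapi_client import is an assumption, since the cell that defines APISession is not part of this diff:

```python
from itertools import islice

import pandas as pd
from jsonapi_client import Session as APISession  # assumed import; not shown in this diff

# Fetch the 50 most recently analysed studies and save them to studies.csv.
with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # iterate() pages through the endpoint; islice stops it after 50 records
    studies = [s.json for s in islice(mgnify.iterate("studies"), 50)]

studies_df = pd.json_normalize(studies)
studies_df.to_csv("studies.csv")
```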
@@ -354,49 +354,49 @@
{
"cell_type": "code",
"execution_count": null,
"id": "0fee9320-4354-47ce-afbc-e44b2ef09c50",
"id": "cda3bcef-00fa-4473-a3c1-2751c47e160b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Save all possible JSON keys into a list \"all_keys\"\n",
"all_keys = [\n",
" 'Submitted nucleotide sequences',\n",
" 'Nucleotide sequences after format-specific filtering',\n",
" 'Nucleotide sequences after length filtering',\n",
" 'Nucleotide sequences after undetermined bases filtering',\n",
" 'Predicted SSU sequences',\n",
" 'Predicted LSU sequences',\n",
" 'Reads with predicted CDS',\n",
" 'Reads with predicted RNA',\n",
" 'Reads with InterProScan match',\n",
" 'Predicted CDS',\n",
" 'Predicted CDS with InterProScan match',\n",
" 'Total InterProScan matches'\n",
"]\n",
"# First we define a function which we will call \"analysis_summary_to_df\" that will \n",
"# take the attributes.analysis-summary column, convert it into a dataframe and then \n",
"# return it as a Pandas pivot table, which is a table that summarises the dataframe\n",
"def analysis_summary_to_df(analysis):\n",
" summary_df = pd.json_normalize(analysis['attributes.analysis-summary'])\n",
" if summary_df.size:\n",
" return summary_df.pivot_table(index=None, columns=\"key\", values=\"value\", aggfunc=\"first\")\n",
" return summary_df\n",
"\n",
"# Iterate over each row of the 'attributes.analysis-summary' column, extract information \n",
"# and save to a temporary dictionary. If an analysis doesn't have a value for any of the keys\n",
"# from the all_keys list, insert \"N/A\"\n",
"extracted_data = dict()\n",
"for row in analyses['attributes.analysis-summary']:\n",
" result_dict = {item['key']: item['value'] for item in row}\n",
" for key in all_keys:\n",
" add_value = result_dict[key] if key in result_dict else \"N/A\"\n",
" extracted_data.setdefault(key, list()).append(add_value)\n",
" \n",
"# Convert the dictionary to a DataFrame\n",
"extracted_df = pd.DataFrame(extracted_data)\n",
"\n",
"# Concatenate the two dataframes into a new one called transformed_analyses \n",
"transformed_analyses = pd.concat([analyses, extracted_df], axis=1)\n",
"# Add a column \"summary_df\" to the dataframe \"analyses\". The column is generated using\n",
"# the function \"analysis_summary_to_df\" that we defined above and will hold the data\n",
"# from the JSONs that we are converting here.\n",
"analyses['summary_df'] = analyses.apply(analysis_summary_to_df, axis=1)\n",
"\n",
"# Now we will generate a new dataframe called analyses_summary_df\n",
"analyses_summary_df = None\n",
"\n",
"# Iterate through the rows of the analyses dataframe\n",
"for idx, row in analyses.iterrows():\n",
" # Set the index of the summary_df dataframe to the index of the line we are iterating through\n",
" row.summary_df.index = [idx]\n",
" # Add lines from summary_df to analyses_summary_df\n",
" if analyses_summary_df is None:\n",
" analyses_summary_df = row.summary_df\n",
" else:\n",
" analyses_summary_df = pd.concat([analyses_summary_df, row.summary_df])\n",
"\n",
"# Remove the attributes.analysis-summary column from the new dataframe\n",
"transformed_analyses.drop('attributes.analysis-summary', axis=1, inplace=True)\n",
"# Concatenate the new analyses_summary_df dataframe (which has the data from JSON in column form)\n",
"# and our original analyses dataframe and save them to a new dataframe called transformed_analyses\n",
"transformed_analyses = pd.concat([analyses, analyses_summary_df], axis=1)\n",
"\n",
"# Display the first few rows of the resulting dataframe\n",
"transformed_analyses.head()"
"# Remove the attributes.analysis-summary and summary_df columns, we no longer need them\n",
"transformed_analyses.drop(['attributes.analysis-summary', 'summary_df'], axis=1, inplace=True)\n",
"\n",
"# View the first few lines of the updated dataframe\n",
"transformed_analyses.head()\n"
]
},
{
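The core of the new approach in this hunk is pd.json_normalize followed by a pivot_table with no index column. A minimal, self-contained sketch of that transformation, with made-up key/value pairs:

```python
import pandas as pd

# One analysis-summary as it arrives from the API: a list of {key, value} pairs
# (values are strings). The pairs below are invented for illustration.
summary = [
    {"key": "Predicted CDS", "value": "1042"},
    {"key": "Predicted LSU sequences", "value": "17"},
]

summary_df = pd.json_normalize(summary)  # two columns: key, value
wide = summary_df.pivot_table(index=None, columns="key", values="value", aggfunc="first")
print(wide)
# key   Predicted CDS Predicted LSU sequences
# value          1042                      17
```

The pivot yields a one-row frame, which is why the loop in the hunk can simply re-index each summary_df to its parent row's index before concatenating them all together.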
@@ -420,11 +420,13 @@
"source": [
"# Create a new dataframe called filtered_analyses which will include all lines from the\n",
"# transformed_analyses dataframe except for the ones where the value in the \"Predicted LSU sequences\"\n",
"# column is \"0\" or \"N/A\"\n",
"# column is \"0\" or no value (NaN)\n",
"filtered_analyses = transformed_analyses[\n",
" ~(transformed_analyses['Predicted LSU sequences'].isin(['0', 'N/A']))\n",
" ~(transformed_analyses['Predicted LSU sequences'].isin(['0']))\n",
"]\n",
"\n",
"filtered_analyses = filtered_analyses.dropna(subset=[\"Predicted LSU sequences\"])\n",
"\n",
"# print the filtered dataframe\n",
"filtered_analyses"
]
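The two-step filter in this hunk (string match on '0', then dropna for missing summaries) can be checked on a toy frame; the data below is invented:

```python
import numpy as np
import pandas as pd

demo = pd.DataFrame({"Predicted LSU sequences": ["17", "0", np.nan]})

# Step 1: drop rows whose value is the string "0" (the counts are strings, not ints)
kept = demo[~demo["Predicted LSU sequences"].isin(["0"])]
# Step 2: drop rows that had no analysis summary at all (NaN)
kept = kept.dropna(subset=["Predicted LSU sequences"])

print(kept)  # only the "17" row survives
```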
@@ -977,7 +977,7 @@
"id": "cbae69d4-594e-414d-be2a-8a5383d091a7",
"metadata": {},
"source": [
"For this exercise we will be working with pre-fetched data for all genomes. This is because fetching it takes a while given the number of genomes but if you were fetching it yourself, the code for that is below (expand by clicking on the three dots but please don't execute the code)."
"For this exercise we will be working with pre-fetched data for all genomes. This is because fetching it takes a while given the number of genomes but if you were fetching it yourself, the code for that is below (expand by clicking on the three dots but please don't uncomment and execute the code)."
]
},
{
@@ -992,11 +992,11 @@
},
"outputs": [],
"source": [
"endpoint = \"genomes\"\n",
"with APISession(\"https://www.ebi.ac.uk/metagenomics/api/v1\") as mgnify:\n",
" resources_all = map(lambda r: r.json, mgnify.iterate(endpoint))\n",
"resources_all_df = pd.json_normalize(resources_all)\n",
"resources_all_df.to_parquet('all_genome_resources_Aug2023.parquet')"
"# endpoint = \"genomes\"\n",
"# with APISession(\"https://www.ebi.ac.uk/metagenomics/api/v1\") as mgnify:\n",
"# resources_all = map(lambda r: r.json, mgnify.iterate(endpoint))\n",
"# resources_all_df = pd.json_normalize(resources_all)\n",
"# resources_all_df.to_parquet('all_genome_resources_Aug2023.parquet')"
]
},
{
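With the fetch commented out, the workshop presumably reads the pre-fetched file instead; a sketch of that load, assuming all_genome_resources_Aug2023.parquet sits in the working directory (and a parquet engine such as pyarrow is installed):

```python
import pandas as pd

# Load the genome table that the commented-out cell above would have produced.
resources_all_df = pd.read_parquet("all_genome_resources_Aug2023.parquet")
resources_all_df.head()
```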
