Implemented suggestions from review
tgurbich committed Aug 3, 2023
1 parent 001a2ab commit cde8397
88 changes: 45 additions & 43 deletions src/notebooks/Python Examples/GSC23 MGnify Workshop.ipynb
@@ -123,7 +123,7 @@
"source": [
"#### Task 1: Browse the recently analysed studies and save them to a file\n",
"\n",
"Let's start by loading 50 most recently analysed studies and saving them to a file called ```studies.csv``` (it would take too long to load all studies for this example so we limit the number at 50):"
"Let's start by loading the 50 most recently analysed studies and saving them to a file called ```studies.csv``` (it would take too long to load all studies for this example so we limit the number at 50):"
]
},
{
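For context, the cell this hunk edits boils down to the following sketch. It reuses the APISession / mgnify.iterate pattern visible in the genome-fetching hunk further down in this diff; the jsonapi_client import is an assumption, since the cell that defines APISession is not part of this diff:

```python
from itertools import islice

import pandas as pd
from jsonapi_client import Session as APISession  # assumed import; not shown in this diff

# Fetch the 50 most recently analysed studies and save them to studies.csv.
with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # iterate() pages through the endpoint; islice stops it after 50 records
    studies = [s.json for s in islice(mgnify.iterate("studies"), 50)]

studies_df = pd.json_normalize(studies)
studies_df.to_csv("studies.csv")
```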
@@ -354,49 +354,49 @@
{
"cell_type": "code",
"execution_count": null,
"id": "0fee9320-4354-47ce-afbc-e44b2ef09c50",
"id": "cda3bcef-00fa-4473-a3c1-2751c47e160b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Save all possible JSON keys into a list \"all_keys\"\n",
"all_keys = [\n",
" 'Submitted nucleotide sequences',\n",
" 'Nucleotide sequences after format-specific filtering',\n",
" 'Nucleotide sequences after length filtering',\n",
" 'Nucleotide sequences after undetermined bases filtering',\n",
" 'Predicted SSU sequences',\n",
" 'Predicted LSU sequences',\n",
" 'Reads with predicted CDS',\n",
" 'Reads with predicted RNA',\n",
" 'Reads with InterProScan match',\n",
" 'Predicted CDS',\n",
" 'Predicted CDS with InterProScan match',\n",
" 'Total InterProScan matches'\n",
"]\n",
"# First we define a function which we will call \"analysis_summary_to_df\" that will \n",
"# take the attributes.analysis-summary column, convert it into a dataframe and then \n",
"# return it as a Pandas pivot table, which is a table that summarises the dataframe\n",
"def analysis_summary_to_df(analysis):\n",
" summary_df = pd.json_normalize(analysis['attributes.analysis-summary'])\n",
" if summary_df.size:\n",
" return summary_df.pivot_table(index=None, columns=\"key\", values=\"value\", aggfunc=\"first\")\n",
" return summary_df\n",
"\n",
"# Iterate over each row of the 'attributes.analysis-summary' column, extract information \n",
"# and save to a temporary dictionary. If an analysis doesn't have a value for any of the keys\n",
"# from the all_keys list, insert \"N/A\"\n",
"extracted_data = dict()\n",
"for row in analyses['attributes.analysis-summary']:\n",
" result_dict = {item['key']: item['value'] for item in row}\n",
" for key in all_keys:\n",
" add_value = result_dict[key] if key in result_dict else \"N/A\"\n",
" extracted_data.setdefault(key, list()).append(add_value)\n",
" \n",
"# Convert the dictionary to a DataFrame\n",
"extracted_df = pd.DataFrame(extracted_data)\n",
"\n",
"# Concatenate the two dataframes into a new one called transformed_analyses \n",
"transformed_analyses = pd.concat([analyses, extracted_df], axis=1)\n",
"# Add a column \"summary_df\" to the dataframe \"analyses\". The column is generated using\n",
"# the function \"analysis_summary_to_df\" that we defined above and will hold the data\n",
"# from the JSONs that we are converting here.\n",
"analyses['summary_df'] = analyses.apply(analysis_summary_to_df, axis=1)\n",
"\n",
"# Now we will generate a new dataframe called analyses_summary_df\n",
"analyses_summary_df = None\n",
"\n",
"# Iterate through the rows of the analyses dataframe\n",
"for idx, row in analyses.iterrows():\n",
" # Set the index of the summary_df dataframe to the index of the line we are iterating through\n",
" row.summary_df.index = [idx]\n",
" # Add lines from summary_df to analyses_summary_df\n",
" if analyses_summary_df is None:\n",
" analyses_summary_df = row.summary_df\n",
" else:\n",
" analyses_summary_df = pd.concat([analyses_summary_df, row.summary_df])\n",
"\n",
"# Remove the attributes.analysis-summary column from the new dataframe\n",
"transformed_analyses.drop('attributes.analysis-summary', axis=1, inplace=True)\n",
"# Concatenate the new analyses_summary_df dataframe (which has the data from JSON in column form)\n",
"# and our original analyses dataframe and save them to a new dataframe called transformed_analyses\n",
"transformed_analyses = pd.concat([analyses, analyses_summary_df], axis=1)\n",
"\n",
"# Display the first few rows of the resulting dataframe\n",
"transformed_analyses.head()"
"# Remove the attributes.analysis-summary and summary_df columns, we no longer need them\n",
"transformed_analyses.drop(['attributes.analysis-summary', 'summary_df'], axis=1, inplace=True)\n",
"\n",
"# View the first few lines of the updated dataframe\n",
"transformed_analyses.head()\n"
]
},
{
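The core of the new approach in this hunk is pd.json_normalize followed by a pivot_table with no index column. A minimal, self-contained sketch of that transformation, with made-up key/value pairs:

```python
import pandas as pd

# One analysis-summary as it arrives from the API: a list of {key, value} pairs
# (values are strings). The pairs below are invented for illustration.
summary = [
    {"key": "Predicted CDS", "value": "1042"},
    {"key": "Predicted LSU sequences", "value": "17"},
]

summary_df = pd.json_normalize(summary)  # two columns: key, value
wide = summary_df.pivot_table(index=None, columns="key", values="value", aggfunc="first")
print(wide)
# key   Predicted CDS Predicted LSU sequences
# value          1042                      17
```

The pivot yields a one-row frame, which is why the loop in the hunk can simply re-index each summary_df to its parent row's index before concatenating them all together.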
@@ -420,11 +420,13 @@
"source": [
"# Create a new dataframe called filtered_analyses which will include all lines from the\n",
"# transformed_analyses dataframe except for the ones where the value in the \"Predicted LSU sequences\"\n",
"# column is \"0\" or \"N/A\"\n",
"# column is \"0\" or no value (NaN)\n",
"filtered_analyses = transformed_analyses[\n",
" ~(transformed_analyses['Predicted LSU sequences'].isin(['0', 'N/A']))\n",
" ~(transformed_analyses['Predicted LSU sequences'].isin(['0']))\n",
"]\n",
"\n",
"filtered_analyses = filtered_analyses.dropna(subset=[\"Predicted LSU sequences\"])\n",
"\n",
"# print the filtered dataframe\n",
"filtered_analyses"
]
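The two-step filter in this hunk (string match on '0', then dropna for missing summaries) can be checked on a toy frame; the data below is invented:

```python
import numpy as np
import pandas as pd

demo = pd.DataFrame({"Predicted LSU sequences": ["17", "0", np.nan]})

# Step 1: drop rows whose value is the string "0" (the counts are strings, not ints)
kept = demo[~demo["Predicted LSU sequences"].isin(["0"])]
# Step 2: drop rows that had no analysis summary at all (NaN)
kept = kept.dropna(subset=["Predicted LSU sequences"])

print(kept)  # only the "17" row survives
```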
@@ -977,7 +977,7 @@
"id": "cbae69d4-594e-414d-be2a-8a5383d091a7",
"metadata": {},
"source": [
"For this exercise we will be working with pre-fetched data for all genomes. This is because fetching it takes a while given the number of genomes but if you were fetching it yourself, the code for that is below (expand by clicking on the three dots but please don't execute the code)."
"For this exercise we will be working with pre-fetched data for all genomes. This is because fetching it takes a while given the number of genomes but if you were fetching it yourself, the code for that is below (expand by clicking on the three dots but please don't uncomment and execute the code)."
]
},
{
@@ -992,11 +992,11 @@
},
"outputs": [],
"source": [
"endpoint = \"genomes\"\n",
"with APISession(\"https://www.ebi.ac.uk/metagenomics/api/v1\") as mgnify:\n",
" resources_all = map(lambda r: r.json, mgnify.iterate(endpoint))\n",
"resources_all_df = pd.json_normalize(resources_all)\n",
"resources_all_df.to_parquet('all_genome_resources_Aug2023.parquet')"
"# endpoint = \"genomes\"\n",
"# with APISession(\"https://www.ebi.ac.uk/metagenomics/api/v1\") as mgnify:\n",
"# resources_all = map(lambda r: r.json, mgnify.iterate(endpoint))\n",
"# resources_all_df = pd.json_normalize(resources_all)\n",
"# resources_all_df.to_parquet('all_genome_resources_Aug2023.parquet')"
]
},
{
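With the fetch commented out, the workshop presumably reads the pre-fetched file instead; a sketch of that load, assuming all_genome_resources_Aug2023.parquet sits in the working directory (and a parquet engine such as pyarrow is installed):

```python
import pandas as pd

# Load the genome table that the commented-out cell above would have produced.
resources_all_df = pd.read_parquet("all_genome_resources_Aug2023.parquet")
resources_all_df.head()
```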
