diff --git a/dependencies/mgnify-cache.tgz b/dependencies/mgnify-cache.tgz index 50f1bda..c7f3e2a 100644 Binary files a/dependencies/mgnify-cache.tgz and b/dependencies/mgnify-cache.tgz differ diff --git a/dependencies/populate-mgnifyr-cache.R b/dependencies/populate-mgnifyr-cache.R index 7fc30ca..7e33892 100644 --- a/dependencies/populate-mgnifyr-cache.R +++ b/dependencies/populate-mgnifyr-cache.R @@ -3,7 +3,10 @@ library(MGnifyR) mg <- mgnify_client(usecache = T, cache_dir = '/home/jovyan/.mgnify_cache') -# For the "Comparative Metagenomics 1" notebook +# For the "Comparative Metagenomics" notebook tara_all = mgnify_analyses_from_studies(mg, 'MGYS00002008') metadata = mgnify_get_analyses_metadata(mg, tara_all) +# To generate phyloseq object +clean_acc=c('MGYA00590456','MGYA00590543','MGYA00593110','MGYA00590477','MGYA00593125','MGYA00590448','MGYA00590508','MGYA00589025','MGYA00593139','MGYA00593220','MGYA00590525','MGYA00590534','MGYA00593112','MGYA00590498','MGYA00590535','MGYA00593223','MGYA00590480','MGYA00590496','MGYA00590523','MGYA00590444','MGYA00590517','MGYA00590575','MGYA00589039','MGYA00590574','MGYA00590474','MGYA00590554','MGYA00590469','MGYA00590471','MGYA00590522','MGYA00593141','MGYA00589049','MGYA00593123','MGYA00590564','MGYA00589024','MGYA00590572','MGYA00590545','MGYA00590518','MGYA00593126','MGYA00590526','MGYA00590500','MGYA00590570','MGYA00590520','MGYA00590443','MGYA00589013','MGYA00590449','MGYA00589021','MGYA00593130','MGYA00589047','MGYA00589042','MGYA00590577','MGYA00590470','MGYA00590473','MGYA00593216','MGYA00590562','MGYA00590464','MGYA00590484','MGYA00590462','MGYA00590565','MGYA00590439','MGYA00590472','MGYA00590566','MGYA00590552','MGYA00590485','MGYA00593133','MGYA00590544','MGYA00590455','MGYA00590437','MGYA00589044') +ps = mgnify_get_analyses_phyloseq(mg, clean_acc) diff --git a/src/notebooks/R Examples/Comparative Metagenomics.ipynb b/src/notebooks/R Examples/Comparative Metagenomics.ipynb index 814235a..19196ba 100644 --- a/src/notebooks/R Examples/Comparative Metagenomics.ipynb +++ b/src/notebooks/R Examples/Comparative Metagenomics.ipynb @@ -21,7 +21,7 @@ "\n", "## Normalization methods, alpha & beta diversity, and differentially abundant taxa\n", "\n", - "In this notebook we aim to demonstrate how the MGnifyR tool can be used to fetch data and metadata of a MGnify metagenomic analyisis. Then we show how to generate diversity metrics for comparative metagenomics using taxonomic profiles. Finally, we will use `SIAMCAT` to detect differentially abundant taxa.\n", + "In this notebook we aim to demonstrate how the MGnifyR tool can be used to fetch data and metadata of a MGnify metagenomic analyisis. Then we show how to generate diversity metrics for comparative metagenomics using taxonomic profiles. Finally, we will use `SIAMCAT` to detect differentially abundant taxa and to build a ML classification model.\n", "\n", "[MGnifyR](http://github.com/beadyallen/mgnifyr) is a library that provides a set of tools for easily accessing and processing MGnify data in R, making queries to MGnify databases through the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). \n", "The benefit of MGnifyR is that data can either be fetched in tsv format or be directly combined in a phyloseq object to run an analysis in a custom workflow.\n", @@ -63,17 +63,14 @@ "source": [ "# Loading libraries:\n", "suppressMessages({\n", - " library(MGnifyR)\n", - " library(stringr)\n", - " library(vegan)\n", " library(ggplot2)\n", - " library(phyloseq)\n", - " library(metagenomeSeq)\n", + " library(IRdisplay) \n", + " library(MGnifyR)\n", " library(microbiomeMarker)\n", " library(plyr)\n", " library(SIAMCAT)\n", " library(tidyverse)\n", - " library(IRdisplay)\n", + " library(vegan)\n", "})\n", " \n", "display_markdown(file = '../_resources/mgnifyr_help.md')" @@ -133,7 +130,6 @@ "source": [ "# Create your session mgnify_client object\n", "mg = mgnify_client(usecache = T, cache_dir = '/home/jovyan/.mgnify_cache')\n", - "\n", "tara_all = mgnify_analyses_from_studies(mg, 'MGYS00002008')" ] }, @@ -152,8 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "metadata = mgnify_get_analyses_metadata(mg, tara_all)\n", - "#head(metadata)" + "metadata = mgnify_get_analyses_metadata(mg, tara_all)" ] }, { @@ -186,7 +181,7 @@ "id": "c2a8d114-4810-4d6c-8d98-c0ce44f7a5bd", "metadata": {}, "source": [ - "We want to keep only **metagenomic** samples (not amplicon) of 'surface water layer ([ENVO:00002042](https://www.ebi.ac.uk/ols/ontologies/envo/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FENVO_00002042))' and 'mesopelagic zone ([ENVO:00000213](https://www.ebi.ac.uk/ols/ontologies/envo/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FENVO_00000213))' to compare. We also want to filter out results generated with old pipeline versions (<[v5.0](https://www.ebi.ac.uk/metagenomics/pipelines/5.0)). In the following steps we will filter out other samples before exporting to the phyloseq object. Let's first explore the metadata we fetched:" + "We want to keep only **metagenomic** samples (not amplicon) of 'surface' and 'mesopelagic zone' to compare. We also want to filter out results generated with old pipeline versions (<[v5.0](https://www.ebi.ac.uk/metagenomics/pipelines/5.0)). In the following steps we will filter out other samples before exporting to the phyloseq object. Let's first explore the metadata we fetched:" ] }, { @@ -270,12 +265,18 @@ "id": "080aa6d8-6906-48da-ae18-36feb6265660", "metadata": {}, "source": [ - "Let's keep only samples having [ENVO:00002042](https://www.ebi.ac.uk/ols/ontologies/envo/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FENVO_00002042) or [ENVO:00000213](https://www.ebi.ac.uk/ols/ontologies/envo/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FENVO_00000213) in the `sample_environment-feature` column. We want to create a new dataframe containing the relevant samples.\n", - "\n", - "We are also going to create a clean label for the environment feature.\n", + "5. Filtering out noisy samples.\n", "\n", + "We want to create a new dataframe containing the relevant samples. In this step, we will also create a clean label for the environment feature. Additionally, we noticed that some samples have extreme non-sense values in the metadata. We will filter out those samples to reduce the noise on the analysis." + ] + }, + { + "cell_type": "markdown", + "id": "e52de90b-b671-49eb-a8ab-d340f9a25a7c", + "metadata": {}, + "source": [ "
\n", - "Note: To speed up the following analysis, we are going to keep only 25 samples per group (by randomly subsampling the accessions)\n", + "Note: To speed up the following analysis, we are going to keep only 34 samples per group (by randomly subsampling the table)\n", "
" ] }, @@ -286,19 +287,61 @@ "metadata": {}, "outputs": [], "source": [ - "# Saving the list of relevant samples in a dataframe\n", - "sub1 = v5_metadata[str_detect(v5_metadata$'sample_environment-feature', \"ENVO:00002042\"), ]\n", - "set.seed(345)\n", - "acc_s1 = sample(sub1$'analysis_accession', size=25, replace = FALSE)\n", + "# Saving the list of Surface water samples in a dataframe\n", + "sub1 = v5_metadata[str_detect(v5_metadata$'sample_environment-feature', \"surface\"), ]\n", + "\n", + "# Reducing the metadata table to keep relevant fields only\n", + "variables_to_keep = c('sample_temperature','sample_depth','sample_salinity','sample_chlorophyll sensor','sample_nitrate sensor','sample_oxygen sensor')\n", + "reduced_sub1 = sub1[variables_to_keep]\n", + "\n", + "# Removing rows with extreme values\n", + "reduced_sub1[reduced_sub1 == \"99999\"] <- NA\n", + "reduced_sub1[reduced_sub1 == \"99999.0\"] <- NA\n", + "reduced_sub1[reduced_sub1 == \"0\"] <- NA\n", + "\n", + "clean1=na.omit(reduced_sub1)\n", "\n", - "sub2 = v5_metadata[str_detect(v5_metadata$'sample_environment-feature', \"ENVO:00000213\"), ]\n", + "# Subsampling to 34\n", "set.seed(345)\n", - "acc_s2 = sample(sub2$'analysis_accession', size=25, replace = FALSE)\n", + "random_1 = clean1[sample(nrow(clean1), 34), ]\n", + "random_1$'env_label'=c(rep('Surface', times=length(rownames(random_1))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f1394d-935b-4d91-9012-42be61c4359a", + "metadata": {}, + "outputs": [], + "source": [ + "# Saving the list of Mesopelagic zone samples in a dataframe\n", + "sub2 = v5_metadata[str_detect(v5_metadata$'sample_environment-feature', \"mesopelagic\"), ]\n", + "\n", + "# Reducing the metadata table to keep relevant fields only\n", + "reduced_sub2 = sub2[variables_to_keep]\n", + "\n", + "# Removing rows with extreme values\n", + "reduced_sub2[reduced_sub2 == \"99999\"] <- NA\n", + "reduced_sub2[reduced_sub2 == \"99999.0\"] <- NA\n", + "reduced_sub2[reduced_sub2 == \"0\"] <- NA\n", "\n", - "filtered_samples = c(acc_s1,acc_s2)\n", + "clean2=na.omit(reduced_sub2)\n", "\n", - "# To create the environment feature label:\n", - "env_label = c(rep('Surface', times=25), rep('Mesopelagic', times=25))\n" + "# Subsampling to 34\n", + "set.seed(345)\n", + "random_2 = clean2[sample(nrow(clean2), 34), ]\n", + "random_2$'env_label'=c(rep('Mesopelagic', times=length(rownames(random_2))))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0749670-9aa3-4c72-8bc8-4d20aedae8b3", + "metadata": {}, + "outputs": [], + "source": [ + "clean_metadata=rbind(random_1,random_2)\n", + "clean_acc=rownames(clean_metadata)" ] }, { @@ -314,7 +357,7 @@ "id": "3dc92881-75bc-4931-90a5-96c6a4cf24ff", "metadata": {}, "source": [ - "1. Now that we have a new dataframe with 50 samples from either surface or mesopelagic zone water, we are going to create the phyloseq object. Consider that this step takes around 6 min to complete." + "1. Now that we have a new dataframe with samples from either surface or mesopelagic zone water, we are going to create the phyloseq object." ] }, { @@ -324,7 +367,7 @@ "metadata": {}, "outputs": [], "source": [ - "ps = mgnify_get_analyses_phyloseq(mg, filtered_samples)" + "ps = mgnify_get_analyses_phyloseq(mg, clean_acc)" ] }, { @@ -332,7 +375,7 @@ "id": "b1a7166e-b287-4668-98e6-47a08841a725", "metadata": {}, "source": [ - "2. Keep only relevant columns in metadata table and transform numeric variables from characters to numbers. Add the environment label as well." + "2. Keep only relevant columns in the phyloseq metadata table and transform numeric variables from characters to numbers. Add the environment label as well." ] }, { @@ -342,22 +385,16 @@ "metadata": {}, "outputs": [], "source": [ - "#sample_variables(ps)\n", - "\n", - "# Keeping relevant metadata only\n", + "# Keeping relevant metadata in the phyloseq object\n", "variables_to_keep = c('sample_temperature','sample_depth','sample_salinity','sample_chlorophyll.sensor','sample_nitrate.sensor','sample_oxygen.sensor')\n", - "\n", "df = data.frame(sample_data(ps))[variables_to_keep]\n", "\n", "# Transforming character to nummeric variables\n", "df[] = lapply(df, function(x) as.numeric(as.character(x)))\n", - "\n", "sample_data(ps) = df\n", "\n", "# Adding the env label \n", - "sample_data(ps)$'env_feature' = env_label\n", - "\n", - "#sample_data(ps)" + "sample_data(ps)$'env_label' = clean_metadata$env_label\n" ] }, { @@ -393,8 +430,6 @@ "metadata": {}, "outputs": [], "source": [ - "ps\n", - "\n", "options(repr.plot.width=4, repr.plot.height=4)\n", "hist(log10(sample_sums(ps)), breaks=50, main=\"Sample size distribution\", xlab=\"Sample size (log10)\", ylab=\"Frequency\", col=\"#007c80\")" ] @@ -416,7 +451,6 @@ "outputs": [], "source": [ "ps_good = subset_samples(ps, sample_sums(ps) > 32)\n", - "\n", "hist(log10(sample_sums(ps_good)), breaks=50, main=\"Sample size distribution\", xlab=\"Sample size (log10)\", ylab=\"Frequency\", col=\"#007c80\")" ] }, @@ -435,8 +469,7 @@ "metadata": {}, "outputs": [], "source": [ - "ps_final = filter_taxa(ps_good, function(x) sum(x) > 1, prune=TRUE)\n", - "ps_final" + "ps_final = filter_taxa(ps_good, function(x) sum(x) > 1, prune=TRUE)" ] }, { @@ -579,7 +612,7 @@ "metadata": {}, "outputs": [], "source": [ - "ps_rare = rarefy_even_depth(ps_final, sample.size=23, replace=FALSE, rngseed=123, verbose=FALSE)" + "ps_rare = rarefy_even_depth(ps_final, sample.size=56, replace=FALSE, rngseed=123, verbose=FALSE)" ] }, { @@ -691,21 +724,21 @@ "options(repr.plot.width=12, repr.plot.height=3)\n", "\n", "### No normalization\n", - "plot_richness(ps_final, x=\"env_feature\", color=\"env_feature\", title=\"No normalization\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", + "plot_richness(ps_final, x=\"env_label\", color=\"env_label\", title=\"No normalization\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", " geom_boxplot() + \n", " theme_bw() + \n", " scale_color_manual(values=c(\"#0a5032\", \"#a1be1f\")) + \n", " labs(x='', color = \"Environment\")\n", "\n", "### Rarefaction\n", - "plot_richness(ps_rare, x=\"env_feature\", color=\"env_feature\", title=\"Rarefaction\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", + "plot_richness(ps_rare, x=\"env_label\", color=\"env_label\", title=\"Rarefaction\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", " geom_boxplot() + \n", " theme_bw() + \n", " scale_color_manual(values=c(\"#0a5032\", \"#a1be1f\")) + \n", " labs(x='', color = \"Environment\")\n", "\n", "### Cumulative sum scaling\n", - "plot_richness(ps_CSS, x=\"env_feature\", color=\"env_feature\", title=\"Cumulative sum scaling\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", + "plot_richness(ps_CSS, x=\"env_label\", color=\"env_label\", title=\"Cumulative sum scaling\", measures=c(\"Observed\", \"Chao1\", \"ACE\", \"Shannon\", \"Simpson\")) + \n", " geom_boxplot() + \n", " theme_bw() + \n", " scale_color_manual(values=c(\"#0a5032\", \"#a1be1f\")) + \n", @@ -849,7 +882,7 @@ " # Don't carry over previous plot (if error, p will be blank)\n", " p = NULL\n", " # Create plot, store as temp variable, p\n", - " p = plot_ordination(ps_CSS, iMDS, color=\"env_feature\")\n", + " p = plot_ordination(ps_CSS, iMDS, color=\"env_label\")\n", " # Add title to each plot\n", " p = p + ggtitle(paste(\"MDS using distance method \", i, sep=\"\"))\n", " # Save the graphic to file.\n", @@ -859,11 +892,11 @@ "# Create a combined plot\n", "df = ldply(plist, function(x) x$data)\n", "names(df)[1] = \"distance\"\n", - "p = ggplot(df, aes(Axis.1, Axis.2, color=env_feature))\n", + "p = ggplot(df, aes(Axis.1, Axis.2, color=env_label))\n", "p = p + geom_point(size=3, alpha=0.5)\n", "p = p + facet_wrap(~distance, scales=\"free\")\n", "p = p + ggtitle(\"MDS on various distance metrics for TARA ocean dataset\") + \n", - " stat_ellipse(level=0.95, type=\"norm\", geom=\"polygon\", alpha=0, aes(color=env_feature)) + \n", + " stat_ellipse(level=0.95, type=\"norm\", geom=\"polygon\", alpha=0, aes(color=env_label)) + \n", " theme_bw() + \n", " scale_color_manual(values=c(\"#0a5032\", \"#a1be1f\")) + \n", " labs(color = \"Environment\") \n", @@ -888,8 +921,8 @@ "outputs": [], "source": [ "metadata = data.frame(sample_data(ps_CSS))\n", - "css_beta = distance(ps_CSS, method=\"canberra\")\n", - "adonis2(css_beta ~ env_feature, data = metadata, perm=1e3)" + "css_beta = distance(ps_CSS, method=\"mountford\")\n", + "adonis2(css_beta ~ env_label, data = metadata, perm=1e3)" ] }, { @@ -917,7 +950,7 @@ "metadata": {}, "outputs": [], "source": [ - "bd = betadisper(css_beta, metadata$'env_feature')\n", + "bd = betadisper(css_beta, metadata$'env_label')\n", "anova(bd)" ] }, @@ -926,7 +959,7 @@ "id": "b89f180e-ee44-4455-9e6b-81aa01164550", "metadata": {}, "source": [ - "ANOVA's p-value not significant (p>0.05) means that group dispersions are homogenous (acept the null hypothesis of no difference in dispersion between groups). The general conclusion of `adonis` and `betadisper` is that our groups of samples (Surface and Mesopelagic water layers) are homogeneous on group dispersions (compositions vary similarly within the group) while having significantly different compositions between groups (according to `adonis` p<0.05)." + "ANOVA's p-value not significant (p>0.05) means that group dispersions are homogenous (acept the null hypothesis of no difference in dispersion between groups). The general conclusion of `adonis` and `betadisper` is that our groups of samples (Surface and Mesopelagic zone water) are homogeneous on group dispersions (compositions vary similarly within the group) while having significantly different compositions between groups (according to `adonis` p<0.05)." ] }, { @@ -986,7 +1019,7 @@ "id": "b68c76e0-2307-47a4-8963-b76203560d0e", "metadata": {}, "source": [ - "2. Creating the `SIAMCAT` object. We first need to create a label to set which group will be used as control for comparisons. We selected arbitrary Mesopelagic as case group and Surface as control." + "2. Creating the `SIAMCAT` object. We first need to create a label to set which group will be used as control for comparisons. We selected arbitrary Surface water as a case group and Mesopelagic layer as a control." ] }, { @@ -997,7 +1030,7 @@ "outputs": [], "source": [ "# Creating a siamcat label\n", - "sc_label = create.label(meta=sample_data(psglom), label='env_feature', case='Mesopelagic')\n", + "sc_label = create.label(meta=sample_data(psglom), label='env_label', case='Surface')\n", "\n", "# Creating the siamcat object\n", "siamcat_obj = siamcat(phyloseq=psglom, label=sc_label)" @@ -1037,7 +1070,7 @@ "metadata": {}, "outputs": [], "source": [ - "options(repr.plot.width=10, repr.plot.height=6)\n", + "options(repr.plot.width=12, repr.plot.height=7)\n", "siamcat_obj = check.associations(siamcat_obj)\n", "association.plot(siamcat_obj, panels=c(\"fc\", \"prevalence\"), prompt = FALSE, verbose = 0)" ] @@ -1050,7 +1083,7 @@ "
\n", "Tip: For significantly associated microbial features, the plot shows:\n", "