From 3aa477e658503edd71acd192a62e190ec0b4fec5 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 9 Aug 2024 16:45:33 +0100 Subject: [PATCH] fix and improve plotting pairwise average fst --- malariagen_data/anoph/fst.py | 59 +++++++---- malariagen_data/anoph/plotly_params.py | 7 +- notebooks/plot_pairwise_average_fst.ipynb | 114 ++++++++++++++++++++-- 3 files changed, 154 insertions(+), 26 deletions(-) diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py index 57511be4e..abf54d1b3 100644 --- a/malariagen_data/anoph/fst.py +++ b/malariagen_data/anoph/fst.py @@ -485,32 +485,49 @@ def plot_pairwise_average_fst( zmax: Optional[plotly_params.zmax] = None, text_auto: plotly_params.text_auto = ".3f", color_continuous_scale: plotly_params.color_continuous_scale = "gray_r", - width: plotly_params.fig_width = 700, - height: plotly_params.fig_height = 600, + width: plotly_params.fig_width = None, + height: plotly_params.fig_height = None, + row_height: plotly_params.height = 40, + col_width: plotly_params.width = 50, show: plotly_params.show = True, renderer: plotly_params.renderer = None, **kwargs, ): - # setup df - cohort_list = np.unique(fst_df[["cohort1", "cohort2"]].values) - # df to fill - fig_df = pd.DataFrame(columns=cohort_list, index=cohort_list) - # fill df from fst_df - for index_key in range(len(fst_df)): - index = fst_df.iloc[index_key]["cohort1"] - col = fst_df.iloc[index_key]["cohort2"] - fst = fst_df.iloc[index_key]["fst"] - fig_df[index][col] = fst + # Obtain a list of all cohorts analysed. N.B., preserve the order in + # which the cohorts are provided in the input dataframe. + cohorts = pd.unique(fst_df[["cohort1", "cohort2"]].values.flatten()) + + # Create a dataframe to visualise as a heatmap. + fig_df = pd.DataFrame(columns=cohorts, index=cohorts) + + # Set up plot title. + title = "FST" + if annotation is not None: + title += " ⧅ " + annotation + + # Fill the figure dataframe from the Fst dataframe. + for cohort1, cohort2, fst, se in fst_df.itertuples(index=False): + fig_df.loc[cohort2, cohort1] = fst if annotation == "standard error": - se = fst_df.iloc[index_key]["se"] - fig_df[col][index] = se + fig_df.loc[cohort1, cohort2] = se elif annotation == "Z score": - zs = fst_df.iloc[index_key]["fst"] / fst_df.iloc[index_key]["se"] - fig_df[col][index] = zs + zs = fst / se + fig_df.loc[cohort1, cohort2] = zs else: - fig_df.loc[index, col] = fst + fig_df.loc[cohort1, cohort2] = fst + + # Don't colour the plot if the upper triangle is SE or Z score, + # as the colouring doesn't really make sense. + if annotation is not None and zmax is None: + zmax = 1e9 - # create plot + # Dynamically size the figure based on number of cohorts. + if height is None: + height = 100 + len(cohorts) * row_height + if width is None: + width = 200 + len(cohorts) * col_width + + # Create plot. with np.errstate(invalid="ignore"): fig = px.imshow( img=fig_df, @@ -523,7 +540,11 @@ def plot_pairwise_average_fst( aspect="auto", **kwargs, ) - fig.update_layout(plot_bgcolor="rgba(0,0,0,0)") + fig.update_layout( + plot_bgcolor="rgba(0,0,0,0)", + coloraxis_showscale=False, + title=title, + ) fig.update_yaxes(showgrid=False, linecolor="black") fig.update_xaxes(showgrid=False, linecolor="black") diff --git a/malariagen_data/anoph/plotly_params.py b/malariagen_data/anoph/plotly_params.py index 4430124ba..e931b78b0 100644 --- a/malariagen_data/anoph/plotly_params.py +++ b/malariagen_data/anoph/plotly_params.py @@ -26,7 +26,12 @@ fig_height: TypeAlias = Annotated[ Optional[int], - "Figure weight in pixels (px).", + "Figure height in pixels (px).", +] + +width: TypeAlias = Annotated[ + int, + "Width in pixels (px).", ] height: TypeAlias = Annotated[ diff --git a/notebooks/plot_pairwise_average_fst.ipynb b/notebooks/plot_pairwise_average_fst.ipynb index 901bf17fa..cefddfe75 100644 --- a/notebooks/plot_pairwise_average_fst.ipynb +++ b/notebooks/plot_pairwise_average_fst.ipynb @@ -24,7 +24,7 @@ "ag3 = malariagen_data.Ag3(\n", " \"simplecache::gs://vo_agam_release\",\n", " simplecache=dict(cache_storage=\"../gcs_cache\"),\n", - " cohorts_analysis=\"20230516\",\n", + " cohorts_analysis=\"20240717\",\n", " results_cache=\"results_cache\",\n", ")\n", "ag3" @@ -87,6 +87,16 @@ "pairwise_fst_df" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "35959f42-732e-40fa-86ad-16d95fb6e740", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_pairwise_average_fst(pairwise_fst_df)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -110,21 +120,23 @@ { "cell_type": "code", "execution_count": null, - "id": "35959f42-732e-40fa-86ad-16d95fb6e740", + "id": "c16035d2-b5d3-4441-87d9-eda953079161", "metadata": {}, "outputs": [], "source": [ - "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1)" + "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1, width=400, height=300)" ] }, { "cell_type": "code", "execution_count": null, - "id": "c16035d2-b5d3-4441-87d9-eda953079161", + "id": "13afd6ed-7962-4935-b1f1-ed12f5e2ceed", "metadata": {}, "outputs": [], "source": [ - "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1, width=400, height=300)" + "region=\"3L:15,000,000-16,000,000\"\n", + "site_mask='arab'\n", + "n_jack=200" ] }, { @@ -133,6 +145,96 @@ "id": "48a52211-2ef9-47da-b715-d0d74a69a4ad", "metadata": {}, "outputs": [], + "source": [ + "pairwise_fst_df = ag3.pairwise_average_fst(\n", + " region=region,\n", + " cohorts=\"cohort_admin1_year\",\n", + " sample_query=\"country == 'Kenya' and taxon == 'arabiensis'\",\n", + " sample_sets=[\"AG1000G-KE\", \"1274-VO-KE-KAMAU-VMF00246\"],\n", + " n_jack=n_jack,\n", + " site_mask=site_mask,\n", + " min_cohort_size=10,\n", + ")\n", + "pairwise_fst_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09cc1824-e6f4-4799-a8f4-cf61e472eaf8", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_pairwise_average_fst(pairwise_fst_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95f112d2-fe76-4091-b73b-d7a17a61cc18", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"standard error\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908b08c8-290f-49d1-b3d3-cb8f9e7d2d21", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"Z score\", zmax=0.03)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "818fdac7-dcb4-4da2-8a52-f16ce51a2c0d", + "metadata": {}, + "outputs": [], + "source": [ + "wild_cohorts = {\n", + " \"Mwea_2007\": \"taxon == 'arabiensis' and location == 'Mwea' and year == 2007\",\n", + " \"Mwea_2014\": \"taxon == 'arabiensis' and location == 'Mwea' and year == 2014\",\n", + " \"Teso_2013\": \"taxon == 'arabiensis' and location == 'Teso'\",\n", + " \"Turkana_2006\": \"taxon == 'arabiensis' and location == 'Turkana'\",\n", + " \"Kilifi_2012\": \"taxon == 'arabiensis' and location == 'Kilifi' and year == 2012\",\n", + "}\n", + "fst_df = ag3.pairwise_average_fst(\n", + " region=\"3L:15,000,000-16,000,000\", \n", + " cohorts=wild_cohorts,\n", + " min_cohort_size=10,\n", + " site_mask=\"arab\",\n", + ")\n", + "fst_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65c1bdd5-f1d2-4814-93e5-ddb1d9c42258", + "metadata": {}, + "outputs": [], + "source": [ + "ag3.plot_pairwise_average_fst(fst_df, annotation=\"Z score\", zmax=0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ec8cfe3-336c-4362-9361-78d40ffa0a8e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d3236a2-842d-4c13-92ec-9a4338750aca", + "metadata": {}, + "outputs": [], "source": [] } ], @@ -152,7 +254,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.18" + "version": "3.10.12" } }, "nbformat": 4,