From 3aa477e658503edd71acd192a62e190ec0b4fec5 Mon Sep 17 00:00:00 2001
From: Alistair Miles <alimanfoo@googlemail.com>
Date: Fri, 9 Aug 2024 16:45:33 +0100
Subject: [PATCH] fix and improve plotting pairwise average fst

---
 malariagen_data/anoph/fst.py              |  59 +++++++----
 malariagen_data/anoph/plotly_params.py    |   7 +-
 notebooks/plot_pairwise_average_fst.ipynb | 114 ++++++++++++++++++++--
 3 files changed, 154 insertions(+), 26 deletions(-)
diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py
index 57511be4e..abf54d1b3 100644
--- a/malariagen_data/anoph/fst.py
+++ b/malariagen_data/anoph/fst.py
@@ -485,32 +485,49 @@ def plot_pairwise_average_fst(
         zmax: Optional[plotly_params.zmax] = None,
         text_auto: plotly_params.text_auto = ".3f",
         color_continuous_scale: plotly_params.color_continuous_scale = "gray_r",
-        width: plotly_params.fig_width = 700,
-        height: plotly_params.fig_height = 600,
+        width: plotly_params.fig_width = None,
+        height: plotly_params.fig_height = None,
+        row_height: plotly_params.height = 40,
+        col_width: plotly_params.width = 50,
         show: plotly_params.show = True,
         renderer: plotly_params.renderer = None,
         **kwargs,
     ):
-        # setup df
-        cohort_list = np.unique(fst_df[["cohort1", "cohort2"]].values)
-        # df to fill
-        fig_df = pd.DataFrame(columns=cohort_list, index=cohort_list)
-        # fill df from fst_df
-        for index_key in range(len(fst_df)):
-            index = fst_df.iloc[index_key]["cohort1"]
-            col = fst_df.iloc[index_key]["cohort2"]
-            fst = fst_df.iloc[index_key]["fst"]
-            fig_df[index][col] = fst
+        # Obtain a list of all cohorts analysed. N.B., preserve the order in
+        # which the cohorts are provided in the input dataframe.
+        cohorts = pd.unique(fst_df[["cohort1", "cohort2"]].values.flatten())
+
+        # Create a dataframe to visualise as a heatmap.
+        fig_df = pd.DataFrame(columns=cohorts, index=cohorts)
+
+        # Set up plot title.
+        title = "<i>F</i><sub>ST</sub>"
+        if annotation is not None:
+            title += " ⧅ " + annotation
+
+        # Fill the figure dataframe from the Fst dataframe.
+        for cohort1, cohort2, fst, se in fst_df.itertuples(index=False):
+            fig_df.loc[cohort2, cohort1] = fst
             if annotation == "standard error":
-                se = fst_df.iloc[index_key]["se"]
-                fig_df[col][index] = se
+                fig_df.loc[cohort1, cohort2] = se
             elif annotation == "Z score":
-                zs = fst_df.iloc[index_key]["fst"] / fst_df.iloc[index_key]["se"]
-                fig_df[col][index] = zs
+                zs = fst / se
+                fig_df.loc[cohort1, cohort2] = zs
             else:
-                fig_df.loc[index, col] = fst
+                fig_df.loc[cohort1, cohort2] = fst
+
+        # Don't colour the plot if the upper triangle is SE or Z score,
+        # as the colouring doesn't really make sense.
+        if annotation is not None and zmax is None:
+            zmax = 1e9
 
-        # create plot
+        # Dynamically size the figure based on number of cohorts.
+        if height is None:
+            height = 100 + len(cohorts) * row_height
+        if width is None:
+            width = 200 + len(cohorts) * col_width
+
+        # Create plot.
         with np.errstate(invalid="ignore"):
             fig = px.imshow(
                 img=fig_df,
@@ -523,7 +540,11 @@ def plot_pairwise_average_fst(
                 aspect="auto",
                 **kwargs,
             )
-        fig.update_layout(plot_bgcolor="rgba(0,0,0,0)")
+        fig.update_layout(
+            plot_bgcolor="rgba(0,0,0,0)",
+            coloraxis_showscale=False,
+            title=title,
+        )
         fig.update_yaxes(showgrid=False, linecolor="black")
         fig.update_xaxes(showgrid=False, linecolor="black")
 
diff --git a/malariagen_data/anoph/plotly_params.py b/malariagen_data/anoph/plotly_params.py
index 4430124ba..e931b78b0 100644
--- a/malariagen_data/anoph/plotly_params.py
+++ b/malariagen_data/anoph/plotly_params.py
@@ -26,7 +26,12 @@
 
 fig_height: TypeAlias = Annotated[
     Optional[int],
-    "Figure weight in pixels (px).",
+    "Figure height in pixels (px).",
+]
+
+width: TypeAlias = Annotated[
+    int,
+    "Width in pixels (px).",
 ]
 
 height: TypeAlias = Annotated[
diff --git a/notebooks/plot_pairwise_average_fst.ipynb b/notebooks/plot_pairwise_average_fst.ipynb
index 901bf17fa..cefddfe75 100644
--- a/notebooks/plot_pairwise_average_fst.ipynb
+++ b/notebooks/plot_pairwise_average_fst.ipynb
@@ -24,7 +24,7 @@
     "ag3 = malariagen_data.Ag3(\n",
     "    \"simplecache::gs://vo_agam_release\",\n",
     "    simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
-    "    cohorts_analysis=\"20230516\",\n",
+    "    cohorts_analysis=\"20240717\",\n",
     "    results_cache=\"results_cache\",\n",
     ")\n",
     "ag3"
@@ -87,6 +87,16 @@
     "pairwise_fst_df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35959f42-732e-40fa-86ad-16d95fb6e740",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -110,21 +120,23 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "35959f42-732e-40fa-86ad-16d95fb6e740",
+   "id": "c16035d2-b5d3-4441-87d9-eda953079161",
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1)"
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1, width=400, height=300)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c16035d2-b5d3-4441-87d9-eda953079161",
+   "id": "13afd6ed-7962-4935-b1f1-ed12f5e2ceed",
    "metadata": {},
    "outputs": [],
    "source": [
-    "ag3.plot_pairwise_average_fst(pairwise_fst_df, zmax=0.1, width=400, height=300)"
+    "region=\"3L:15,000,000-16,000,000\"\n",
+    "site_mask='arab'\n",
+    "n_jack=200"
    ]
   },
   {
@@ -133,6 +145,96 @@
    "id": "48a52211-2ef9-47da-b715-d0d74a69a4ad",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "pairwise_fst_df = ag3.pairwise_average_fst(\n",
+    "    region=region,\n",
+    "    cohorts=\"cohort_admin1_year\",\n",
+    "    sample_query=\"country == 'Kenya' and taxon == 'arabiensis'\",\n",
+    "    sample_sets=[\"AG1000G-KE\", \"1274-VO-KE-KAMAU-VMF00246\"],\n",
+    "    n_jack=n_jack,\n",
+    "    site_mask=site_mask,\n",
+    "    min_cohort_size=10,\n",
+    ")\n",
+    "pairwise_fst_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09cc1824-e6f4-4799-a8f4-cf61e472eaf8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95f112d2-fe76-4091-b73b-d7a17a61cc18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"standard error\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "908b08c8-290f-49d1-b3d3-cb8f9e7d2d21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_pairwise_average_fst(pairwise_fst_df, annotation=\"Z score\", zmax=0.03)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "818fdac7-dcb4-4da2-8a52-f16ce51a2c0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "wild_cohorts = {\n",
+    "    \"Mwea_2007\": \"taxon == 'arabiensis' and location == 'Mwea' and year == 2007\",\n",
+    "    \"Mwea_2014\": \"taxon == 'arabiensis' and location == 'Mwea' and year == 2014\",\n",
+    "    \"Teso_2013\": \"taxon == 'arabiensis' and location == 'Teso'\",\n",
+    "    \"Turkana_2006\": \"taxon == 'arabiensis' and location == 'Turkana'\",\n",
+    "    \"Kilifi_2012\": \"taxon == 'arabiensis' and location == 'Kilifi' and year == 2012\",\n",
+    "}\n",
+    "fst_df = ag3.pairwise_average_fst(\n",
+    "    region=\"3L:15,000,000-16,000,000\", \n",
+    "    cohorts=wild_cohorts,\n",
+    "    min_cohort_size=10,\n",
+    "    site_mask=\"arab\",\n",
+    ")\n",
+    "fst_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65c1bdd5-f1d2-4814-93e5-ddb1d9c42258",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_pairwise_average_fst(fst_df, annotation=\"Z score\", zmax=0.4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ec8cfe3-336c-4362-9361-78d40ffa0a8e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d3236a2-842d-4c13-92ec-9a4338750aca",
+   "metadata": {},
+   "outputs": [],
    "source": []
   }
  ],
@@ -152,7 +254,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.18"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,