formatting

haniffalab · Sep 12, 2024 · e2b126d · e2b126d
1 parent cb26dc2
commit e2b126d
Show file tree

Hide file tree

Showing 9 changed files with 81 additions and 59 deletions.
diff --git a/bin/cmdbase/__init__.py b/bin/cmdbase/__init__.py
@@ -1,2 +1,2 @@
-from . import alignment, irods, rna, basic
+from . import alignment, basic, irods, rna
 from .helpers import *
diff --git a/bin/cmdbase/alignment/cellranger.py b/bin/cmdbase/alignment/cellranger.py
@@ -10,8 +10,13 @@
 
 @click.command("cellranger")
 @click.option("--samplefile", required=True, help="Sample file text file")
-@click.option("--includebam", is_flag=True, default=False, help="Include BAM files (removes --no-bam from cellranger)")
-#@click.option('--includebam', required=False, show_default=True, default=False, help="Pull Bam files")
+@click.option(
+    "--includebam",
+    is_flag=True,
+    default=False,
+    help="Include BAM files (removes --no-bam from cellranger)",
+)
+# @click.option('--includebam', required=False, show_default=True, default=False, help="Pull Bam files")
 def cellranger(samplefile, includebam):
     """
     Cellranger aligns sc-rna seq reads... \n
@@ -31,6 +36,8 @@ def cellranger(samplefile, includebam):
     includebam = str(includebam * 1)
     includebam_str = "1" if includebam else "0"
     result_CR = subprocess.run(
-        [shell_cellranger_script, samplefile, includebam], capture_output=True, text=True
+        [shell_cellranger_script, samplefile, includebam],
+        capture_output=True,
+        text=True,
     )
     click.echo(result_CR.stdout)
diff --git a/bin/cmdbase/alignment/starsolo.py b/bin/cmdbase/alignment/starsolo.py
@@ -9,7 +9,7 @@
 
 
 @click.command()
-@click.option('--samplefile', required=True, help="Sample file text file")
+@click.option("--samplefile", required=True, help="Sample file text file")
 def starsolo(samplefile):
     """
     STARsolo aligns sc-rna seq reads...

diff --git a/bin/cmdbase/basic/history.py b/bin/cmdbase/basic/history.py
@@ -1,32 +1,32 @@
 #!/usr/bin/env python3
 
-import click
 import os
-from tabulate import tabulate
-import pandas as pd
 
-@click.command("history")
+import click
+import pandas as pd
+from tabulate import tabulate
 
-@click.option("--last", required=False,  help="Retrieve last n commands",
-              default=10, type = int)
 
+@click.command("history")
+@click.option(
+    "--last", required=False, help="Retrieve last n commands", default=10, type=int
+)
 @click.option(
     "--all",
     default=False,
     is_flag=True,
     required=False,
     help="Retrieve all history",
 )
-
 def history(last, all):
-    CWD = os.environ['CWD']
+    CWD = os.environ["CWD"]
     hist_file = os.path.join(CWD, ".pap/") + "hist"
     if not os.path.exists(hist_file):
         click.echo("No history file present")
         return 0
-    hist = pd.read_csv(hist_file, index_col = 0)
+    hist = pd.read_csv(hist_file, index_col=0)
     if all:
-        print(tabulate(hist, headers='keys', tablefmt='plain'))
+        print(tabulate(hist, headers="keys", tablefmt="plain"))
     else:
-        print(tabulate(hist.tail(last), headers='keys', tablefmt='plain'))
+        print(tabulate(hist.tail(last), headers="keys", tablefmt="plain"))
     return 0
diff --git a/bin/cmdbase/irods/pull_fastqs.py b/bin/cmdbase/irods/pull_fastqs.py
@@ -9,7 +9,7 @@
 
 
 @click.command("pull-fastqs")
-@click.option('--samplefile', required=True, help="Sample file text file")
+@click.option("--samplefile", required=True, help="Sample file text file")
 def pull_fastqs(samplefile):
     """
     Downloads processed irods data or any folder from irods
@@ -19,6 +19,8 @@ def pull_fastqs(samplefile):
     print("Using irods to download data")
     print("If you have a large set of files, this command will take a while to run")
     shell_script_fq = os.path.join(SHELL_SCRIPT_BASE, "irods..fastqs")
-    result_fq = subprocess.run([shell_script_fq, samplefile], capture_output=True, text=True)
+    result_fq = subprocess.run(
+        [shell_script_fq, samplefile], capture_output=True, text=True
+    )
     click.echo(result_fq.stdout)
     click.echo(result_fq.stderr)
diff --git a/bin/cmdbase/rna/cellbender.py b/bin/cmdbase/rna/cellbender.py
@@ -7,9 +7,11 @@
 
 SHELL_SCRIPT_BASE = os.environ["SHELL_SCRIPT_BASE"]
 
+
 @click.command("cellbender")
 @click.option("--samplefile", required=True, help="Sample file text file")
-@click.option("--total_droplets_included", required=True, help="total_droplets_included"
+@click.option(
+    "--total_droplets_included", required=True, help="total_droplets_included"
 )
 def cellbender(samplefile, total_droplets_included, **kwargs):
     """

diff --git a/bin/cmdbase/rna/scanpy_basic.py b/bin/cmdbase/rna/scanpy_basic.py
@@ -5,12 +5,17 @@
 import click
 
 SHELL_SCRIPT_BASE = os.environ["SHELL_SCRIPT_BASE"]
-HL_IRODS_DOWNLOAD=os.environ["HL_IRODS_DOWNLOAD"]
+HL_IRODS_DOWNLOAD = os.environ["HL_IRODS_DOWNLOAD"]
+
 
 @click.command("scanpy")
 @click.option("--samplefile", required=True, help="Sample file text file")
-@click.option("--sample_basedir", required=False, default = HL_IRODS_DOWNLOAD,
-              help="sample database folder")
+@click.option(
+    "--sample_basedir",
+    required=False,
+    default=HL_IRODS_DOWNLOAD,
+    help="sample database folder",
+)
 def scanpyrun(samplefile, sample_basedir):
     """
     Basic scanpy run

diff --git a/bin/nb/sc_base1.ipynb b/bin/nb/sc_base1.ipynb
@@ -40,8 +40,9 @@
     "    )\n",
     "    return outlier\n",
     "\n",
+    "\n",
     "def print_file(filename):\n",
-    "    with open(filename, 'r') as f:\n",
+    "    with open(filename, \"r\") as f:\n",
     "        print(f.readlines())"
    ]
   },
@@ -52,7 +53,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ['HLBI_PIPE_BASEDIR'] = '/lustre/scratch126/cellgen/team298/vm11/PROJECTS/PIPELINE/Haniffa-utils/'"
+    "os.environ[\n",
+    "    \"HLBI_PIPE_BASEDIR\"\n",
+    "] = \"/lustre/scratch126/cellgen/team298/vm11/PROJECTS/PIPELINE/Haniffa-utils/\""
    ]
   },
   {
@@ -73,7 +76,7 @@
     }
    ],
    "source": [
-    " !cat scanpy_rna_config.py"
+    "!cat scanpy_rna_config.py"
    ]
   },
   {
@@ -92,11 +95,11 @@
    ],
    "source": [
     "# Importing config file\n",
-    "#%reload_ext scanpy_rna_config\n",
-    "#if os.path.exists(\"scanpy_rna_config.py\"):\n",
+    "# %reload_ext scanpy_rna_config\n",
+    "# if os.path.exists(\"scanpy_rna_config.py\"):\n",
     "#    print(\"Reading user config file\")\n",
     "#    from scanpy_rna_config import *\n",
-    "#else:\n",
+    "# else:\n",
     "#    print(\"Reading global config file\")\n",
     "#    sys.path.append(os.path.join(os.environ['HLBI_PIPE_BASEDIR'], 'bin', 'nb'))\n",
     "#    from scanpy_rna_config import *"
@@ -123,12 +126,12 @@
     "min_genes_cutoff = 100\n",
     "sim_doublet_ratio = 5\n",
     "target_sum = 10000\n",
-    "n_top_genes=2000\n",
-    "n_comps = 50 # pca\n",
-    "n_pcs = 50 # kNN\n",
-    "resolution = 1 # leiden\n",
-    "min_dist = 0.5 # UMAP\n",
-    "spread = 1 # UMAP"
+    "n_top_genes = 2000\n",
+    "n_comps = 50  # pca\n",
+    "n_pcs = 50  # kNN\n",
+    "resolution = 1  # leiden\n",
+    "min_dist = 0.5  # UMAP\n",
+    "spread = 1  # UMAP"
    ]
   },
   {
@@ -143,8 +146,8 @@
    "outputs": [],
    "source": [
     "samples_database = \"/lustre/scratch126/cellgen/team298/sample_data/\"\n",
-    "#sample_n = \"Apr24_chimeroid_d97_03A-BFP_HCA_SkO15052460\"\n",
-    "#sample_folder=\"/lustre/scratch126/cellgen/team298/sample_data/BK23-SKI-27-FT-1b_mG_rBCN14655446/processed_sanger\"\n",
+    "# sample_n = \"Apr24_chimeroid_d97_03A-BFP_HCA_SkO15052460\"\n",
+    "# sample_folder=\"/lustre/scratch126/cellgen/team298/sample_data/BK23-SKI-27-FT-1b_mG_rBCN14655446/processed_sanger\"\n",
     "sample_name = \"BK23-SKI-27-FT-1b_mG_rBCN14655446\""
    ]
   },
@@ -204,17 +207,18 @@
    "source": [
     "# Read file\n",
     "outpt_folder = os.path.join(samples_database, sample_name, \"rna_scanpy\")\n",
-    "#outpt_folder =  os.path.join(\"test\", sample_name)\n",
+    "# outpt_folder =  os.path.join(\"test\", sample_name)\n",
     "os.makedirs(outpt_folder, exist_ok=True)\n",
     "\n",
-    "x = list(Path(os.path.join(samples_database, sample_name, 'processed_sanger')).rglob('filtered_feature_bc_matrix.h5'))\n",
+    "x = list(\n",
+    "    Path(os.path.join(samples_database, sample_name, \"processed_sanger\")).rglob(\n",
+    "        \"filtered_feature_bc_matrix.h5\"\n",
+    "    )\n",
+    ")\n",
     "print(x)\n",
     "readfilename = x[0]\n",
     "\n",
-    "adata = sc.read_10x_h5(\n",
-    "    readfilename,\n",
-    "    gex_only=gex_only\n",
-    ")\n",
+    "adata = sc.read_10x_h5(readfilename, gex_only=gex_only)\n",
     "\n",
     "adata.var_names_make_unique()\n",
     "adata"
@@ -267,7 +271,8 @@
    "outputs": [],
    "source": [
     "min_cells = np.round(adata.shape[0] * 0.005)\n",
-    "if min_cells > min_cells_cutoff: min_cells = min_cells_cutoff\n",
+    "if min_cells > min_cells_cutoff:\n",
+    "    min_cells = min_cells_cutoff\n",
     "min_counts = 1"
    ]
   },
@@ -279,7 +284,8 @@
    "outputs": [],
    "source": [
     "min_genes = np.round(adata.shape[0] * 0.005)\n",
-    "if min_cells > min_cells_cutoff: min_cells = min_cells_cutoff\n",
+    "if min_cells > min_cells_cutoff:\n",
+    "    min_cells = min_cells_cutoff\n",
     "min_counts = 1"
    ]
   },
@@ -526,10 +532,10 @@
     }
    ],
    "source": [
-    "print(\"Sim doublet ratio:\", sim_doublet_ratio )\n",
-    "threshold=0.7\n",
-    "sc.external.pp.scrublet(adata, sim_doublet_ratio=sim_doublet_ratio, threshold = threshold)\n",
-    "#sc.pp.scrublet(adata)"
+    "print(\"Sim doublet ratio:\", sim_doublet_ratio)\n",
+    "threshold = 0.7\n",
+    "sc.external.pp.scrublet(adata, sim_doublet_ratio=sim_doublet_ratio, threshold=threshold)\n",
+    "# sc.pp.scrublet(adata)"
    ]
   },
   {
@@ -587,7 +593,7 @@
     }
    ],
    "source": [
-    "#adata.obs.predicted_doublet.value_counts()"
+    "# adata.obs.predicted_doublet.value_counts()"
    ]
   },
   {
@@ -597,7 +603,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#adata = adata[~adata.obs.predicted_doublet]"
+    "# adata = adata[~adata.obs.predicted_doublet]"
    ]
   },
   {
@@ -647,7 +653,7 @@
     }
    ],
    "source": [
-    "adata.layers['counts'] = adata.X.copy()\n",
+    "adata.layers[\"counts\"] = adata.X.copy()\n",
     "sc.pp.normalize_total(adata, target_sum=target_sum)"
    ]
   },
@@ -669,7 +675,7 @@
    "outputs": [],
    "source": [
     "adata.raw = adata\n",
-    "adata.layers['logcounts'] = adata.X.copy()"
+    "adata.layers[\"logcounts\"] = adata.X.copy()"
    ]
   },
   {
@@ -701,8 +707,8 @@
     "# scaling\n",
     "# classicially you scale the data\n",
     "# this is a time consuming step, so is not run now.\n",
-    "# \n",
-    "#sc.pp.scale(adata)"
+    "#\n",
+    "# sc.pp.scale(adata)"
    ]
   },
   {
@@ -810,7 +816,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sc.tl.umap(adata, min_dist = min_dist, spread = spread)"
+    "sc.tl.umap(adata, min_dist=min_dist, spread=spread)"
    ]
   },
   {
@@ -820,7 +826,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "reqCols.extend(['leiden'])"
+    "reqCols.extend([\"leiden\"])"
    ]
   },
   {
@@ -877,7 +883,7 @@
     }
    ],
    "source": [
-    "sc.pl.umap(adata, color = reqCols, ncols=3)"
+    "sc.pl.umap(adata, color=reqCols, ncols=3)"
    ]
   },
   {
@@ -887,7 +893,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "adata.write_h5ad(os.path.join(outpt_folder, sample_name+\".h5ad\"))"
+    "adata.write_h5ad(os.path.join(outpt_folder, sample_name + \".h5ad\"))"
    ]
   }
  ],

diff --git a/bin/nb/scanpy_rna_config.py b/bin/nb/scanpy_rna_config.py
@@ -1,3 +1,3 @@
 gex_only = True
 sim_doublet_ratio = 5
-min_cells_cutoff = 50 # min number of cells a gene is present for it to be realised
+min_cells_cutoff = 50  # min number of cells a gene is present for it to be realised