Merge pull request #8 from haniffalab/scanpy

Added full scanpy support
haniffalab · Sep 11, 2024 · 18e2ddf · 18e2ddf
2 parents 168f6c1 + 2efc998
commit 18e2ddf
Show file tree

Hide file tree

Showing 7 changed files with 1,087 additions and 8 deletions.
diff --git a/bin/cmdbase/rna/__init__.py b/bin/cmdbase/rna/__init__.py
@@ -1 +1,3 @@
 from .cellbender import *
+from .scanpy_basic import *
+
diff --git a/bin/cmdbase/rna/scanpy_basic.py b/bin/cmdbase/rna/scanpy_basic.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import os
+import subprocess
+
+import click
+
+# Both values are read with os.environ[...] at import time, so importing this
+# module raises KeyError when either variable is missing from the environment.
+# SHELL_SCRIPT_BASE: directory joined with the workflow script name below.
+SHELL_SCRIPT_BASE = os.environ["SHELL_SCRIPT_BASE"]
+# HL_IRODS_DOWNLOAD: used as the default value of the --sample_basedir option.
+HL_IRODS_DOWNLOAD=os.environ["HL_IRODS_DOWNLOAD"]
+
+@click.command("scanpy")
+@click.option("--samplefile", required=True, help="Sample file text file")
+@click.option("--sample_basedir", required=False, default = HL_IRODS_DOWNLOAD,
+              help="sample database folder")
+def scanpyrun(samplefile, sample_basedir):
+    """
+    Basic scanpy run
+
+    Example: /lustre/scratch126/cellgen/team298/soft/bin/examples/irods_download.txt
+    Input file should have 3 mandatory columns
+    1st column: sanger_id
+    2nd column: sample_name
+    LAST column: irods path
+    You can have any column in between
+
+    pBCN14844712 BK31_1 /seq/illumina/runs/49/..../cellranger710multi....
+    pBCN14844713 BK31_2 /seq/illumina/runs/49/..../cellranger710multi....
+    pBCN14844714 BK31_3 /seq/illumina/runs/49/..../cellranger710multi....
+    pBCN14844715 BK31_4 /seq/illumina/runs/49/..../cellranger710multi....
+
+    ----------------------
+    Use the same sample file you used for irods/pull-processed
+    """
+    shell_script = os.path.join(SHELL_SCRIPT_BASE, "rna..scanpy")
+    result = subprocess.run(
+        [shell_script, sample_basedir, samplefile], capture_output=True, text=True
+    )
+    click.echo(result.stdout)
+    click.echo(result.stderr)
diff --git a/bin/hl..piperv100 b/bin/hl..piperv100
@@ -126,6 +126,7 @@ irods.add_command(cmdbase.irods.pull_fastqs)
 
 ################# scrna seq analysis commands ##
 rna.add_command(cmdbase.rna.cellbender)
+rna.add_command(cmdbase.rna.scanpyrun)
 
 ################# alignment commands ###########
 alignment.add_command(cmdbase.alignment.cellranger)

diff --git a/bin/nb/sc_base1.ipynb b/bin/nb/sc_base1.ipynb
diff --git a/bin/workflows/_rna..scanpy b/bin/workflows/_rna..scanpy
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Variant of the scanpy submission workflow. For every row of the sample
+# sheet that does not match the header filter it appends one papermill
+# command to a .cmds file, then writes an LSF array-job script in which each
+# array element runs the line of that file selected by LSB_JOBINDEX.
+# The bsub call at the bottom is commented out, so this script only
+# generates the .cmds and .bsub files.
+
+conda_env="/software/cellgen/team298/shared/envs/hl-conda/hl_scanpy_v0.2.0"
+# Exactly two positional arguments are expected; otherwise print the usage.
+# NOTE(review): the usage path exits with status 0 - confirm callers do not
+# need a failure code here.
+if [ $# -ne 2 ]; then
+	echo "$0 samples_database sample_sheet.tsv"
+	echo "This is a follow up of irods/pull-processed. If you have not run it, do so"
+	echo "samples_database: Folder where you have all sample cellranger data. Ideally - /lustre/scratch126/cellgen/team298/sample_data/"
+	echo "sample_name: Folder name of sample that contains the processed_sanger folder"
+	exit 0
+fi
+
+samples_database=$1; shift
+sample_tsv=$1; shift
+
+
+
+# NOTE(review): target_dir, cwd and the counter i are assigned below but
+# never read anywhere in this script.
+HL_HIST_FOLDER=".pap"
+mem=10000
+target_dir=$HL_IRODS_DOWNLOAD # This is obtained by module load hl
+cwd=`pwd`
+run_token=$RUN_TOKEN
+ofile="rna_scanpy_$run_token.cmds"
+# Start from a clean command file so old commands are not re-submitted.
+rm -f $ofile
+declare -i i=0
+while read line
+do
+        i+=1
+    # Skip any row containing "sample" (case-insensitive) - presumably the
+    # header row of the sheet.
+    if [ `echo $line | grep -c -i Sample` -ne 1 ]; then
+        # Column 1 is the sanger id, column 2 the sample id.
+        sanger_id=`echo $line | awk ' { print $1 } '`
+        sample_id=`echo $line | awk ' { print $2 } '`
+	sample_name="${sample_id}_${sanger_id}"
+	#sample_folder="$samples_database/${sample_id}_${sanger_id}/processed_sanger/"
+	# NOTE(review): the second assignment overrides the first, so the
+	# output notebooks go to the relative folder "test" - looks like a
+	# debugging leftover; confirm.
+	outpt_folder="$samples_database//rna_scanpy/"
+	outpt_folder="test"
+	cmd="papermill sc_base1.ipynb $outpt_folder/$sample_name.ipynb  -p samples_database '${samples_database}' -p sample_name $sample_name -k python3"
+	echo $cmd >> $ofile
+    fi
+done < $sample_tsv
+
+# $ofile only exists when at least one command was appended above.
+if [ ! -f $ofile ]; then
+        echo "Looks like nothing needs to be done"
+        echo "Exiting cleanly..."
+        exit 0
+fi
+
+# Write the LSF array-job script: one array element per command line.
+# NOTE(review): "eval $conda_env" evaluates the environment path itself as a
+# command; presumably an activation step was intended - confirm.
+# NOTE(review): $HL_HIST_FOLDER/lsf is not created by this script - confirm
+# that it exists before the job is submitted.
+total_jobs=$(cat $ofile | wc -l)
+bsub_id="rna_scanpy_${run_token}"
+cat > $bsub_id.bsub <<EOF
+#!/bin/bash
+#BSUB -J ${bsub_id}_[1-$total_jobs]%20
+#BSUB -o $HL_HIST_FOLDER/lsf/${bsub_id}_%I.out
+#BSUB -e $HL_HIST_FOLDER/lsf/${bsub_id}_%I.err
+#BSUB -M $mem
+#BSUB -R "select[mem>$mem] rusage[mem=$mem]"
+eval $conda_env
+COMMAND=\$(sed -n "\${LSB_JOBINDEX}p" $ofile) 
+eval \$COMMAND
+EOF
+
+# Submission is currently disabled; the alternatives below are kept for
+# reference only.
+#bsub -J irods_dl -o irods_dl_%J.log -e irods_dl_%J.log -q normal -n 1  -M4000 -R"select[mem>4000] rusage[mem=4000]" bash $ofile
+#bash $ofile
+#bsub < ${bsub_id}.bsub
+
diff --git a/bin/workflows/irods..download_processed b/bin/workflows/irods..download_processed
@@ -48,20 +48,21 @@ do
         sanger_id=`echo $line | awk ' { print $1 } '`
         sample_id=`echo $line | awk ' { print $2 } '`
         irods_path=`echo $line | awk ' { print $NF } '`
-        mkdir -p $target_dir/${sample_id}_${sanger_id}
-        folder_name="$target_dir/${sample_id}_${sanger_id}/processed_sanger/"
+        folder_name="$target_dir/${sample_id}_${sanger_id}"
+        target_name="$folder_name/processed_sanger/"
+        # Create only the per-sample parent folder. Creating $target_name here
+        # would make the "already exists" check below succeed on every run, so
+        # nothing would be downloaded unless the overwrite option is set.
+        mkdir -p $folder_name
 	if [ $overwrite -eq 1 ]; then
-		rm -rf $folder_name
+		rm -rf $target_name
 	fi
-	if [ -d $folder_name ]; then
-		echo "[Warn] Target folder already exists. Not downloading. Try overwrite option if you want to download. Irods::$irods_path --> Folder::$folder_name "
+	if [ -d $target_name ]; then
+		echo "[Warn] Target folder already exists. Not downloading. Try overwrite option if you want to download. Irods::$irods_path --> Folder::$target_name "
 	else
-		echo "($i)[Info] Irods::$irods_path --> Folder::$folder_name"
+		echo "($i)[Info] Irods::$irods_path --> Folder::$target_name"
 		if [ $retain_bam == 1 ]; then
 			#echo "sleep 5; $bsub_command $irods_command $irods_path $folder_name ; chmod -R 774 $folder_name ;  ln -s $folder_name $cwd/" >>  $ofile
-			echo "$irods_command $irods_path $folder_name ; chmod -R 774 $folder_name ;  ln -s $folder_name $cwd/" >>  $ofile
+			echo "$irods_command $irods_path $target_name ; chmod -R 774 $folder_name ;  ln -s $folder_name $cwd/" >>  $ofile
 		else
-			echo "$irods_command $irods_path $folder_name ; chmod -R 774 $folder_name ; rm -rf $folder_name/gex_possorted_bam.bam ; find $folder_name -name 'possorted_genome_bam.bam' -exec rm -rf {} \; ; ln -s $folder_name $cwd/" >>  $ofile
+			echo "$irods_command $irods_path $target_name ; chmod -R 774 $folder_name ; rm -rf $target_name/gex_possorted_bam.bam ; find $target_name -name 'possorted_genome_bam.bam' -exec rm -rf {} \; ; ln -s $folder_name $cwd/" >>  $ofile
 		fi
 
 	fi

diff --git a/bin/workflows/rna..scanpy b/bin/workflows/rna..scanpy
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+conda_env="/software/cellgen/team298/shared/envs/hl-conda/hl_scanpy_v0.2.0"
+if [ $# -ne 2 ]; then
+	echo "$0 samples_database sample_sheet.tsv"
+	echo "This is a follow up of irods/pull-processed. If you have not run it, do so"
+	echo "samples_database: Folder where you have all sample cellranger data. Ideally - /lustre/scratch126/cellgen/team298/sample_data/"
+	echo "sample_name: Folder name of sample that contains the processed_sanger folder"
+	exit 0
+fi
+
+samples_database=$1; shift
+sample_tsv=$1; shift
+
+mkdir -p pap
+
+HL_HIST_FOLDER=".pap"
+mem=10000
+target_dir=$HL_IRODS_DOWNLOAD # This is obtained by module load hl
+cwd=`pwd`
+run_token=$RUN_TOKEN
+ofile="rna_scanpy_$run_token.cmds"
+rm -f $ofile
+declare -i i=0
+while read line
+do
+        i+=1
+    if [ `echo $line | grep -c -i Sample` -ne 1 ]; then
+        sanger_id=`echo $line | awk ' { print $1 } '`
+        sample_id=`echo $line | awk ' { print $2 } '`
+	sample_name="${sample_id}_${sanger_id}"
+	#sample_folder="$samples_database/${sample_id}_${sanger_id}/processed_sanger/"
+	outpt_folder="$samples_database/${sample_name}/rna_scanpy/"
+	mkdir -p $outpt_folder
+	cmd="papermill $HL_PIPE_BASEDIR/bin/nb/sc_base1.ipynb $outpt_folder/$sample_name.ipynb  -p samples_database '${samples_database}' -p sample_name $sample_name -k python3;ln -s $samples_database/${sample_name} ."
+	echo $cmd >> $ofile
+    fi
+done < $sample_tsv
+
+if [ ! -f $ofile ]; then
+        echo "Looks like nothing needs to be done"
+        echo "Exiting cleanly..."
+        exit 0
+fi
+
+total_jobs=$(cat $ofile | wc -l)
+bsub_id="rna_scanpy_${run_token}"
+cat > $bsub_id.bsub <<EOF
+#!/bin/bash
+#BSUB -J ${bsub_id}_[1-$total_jobs]%20
+#BSUB -o $HL_HIST_FOLDER/lsf/${bsub_id}_%I.out
+#BSUB -e $HL_HIST_FOLDER/lsf/${bsub_id}_%I.err
+#BSUB -M $mem
+#BSUB -R "select[mem>$mem] rusage[mem=$mem]"
+eval $conda_env
+COMMAND=\$(sed -n "\${LSB_JOBINDEX}p" $ofile) 
+eval \$COMMAND
+EOF
+
+bsub < ${bsub_id}.bsub
+
Original file line number	Diff line number	Diff line change
		@@ -1 +1,3 @@
		from .cellbender import *
		from .scanpy_basic import *