nf-core · ggabernet · Jun 22, 2023 · Jun 20, 2023 · Jun 20, 2023 · Jun 20, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,7 +48,8 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
+        profile:
+          ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
       fail-fast: false
     steps:
       - name: Check out pipeline code

diff --git a/.github/workflows/ci_immcantation.yml b/.github/workflows/ci_immcantation.yml
@@ -25,7 +25,12 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_assembled_immcantation_devel_hs", "test_assembled_immcantation_devel_mm", "test_raw_immcantation_devel"]
+        profile:
+          [
+            "test_assembled_immcantation_devel_hs",
+            "test_assembled_immcantation_devel_mm",
+            "test_raw_immcantation_devel",
+          ]
       fail-fast: false
     steps:
       - name: Check out pipeline code

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,7 +3,20 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [3.1] - 2023-06-05 "Protego"
+## [3.2.0dev] -
+
+### `Added`
+
+- [#268](https://github.com/nf-core/airrflow/pull/268) Added parameters for FindThreshold in `modules.config`.
+- [#268](https://github.com/nf-core/airrflow/pull/268) Validate samplesheet also for `assembled` samplesheet.
+
+### `Fixed`
+
+- [#268](https://github.com/nf-core/airrflow/pull/268) Allows for uppercase and lowercase locus in samplesheet `pcr_target_locus`.
+
+### `Dependencies`
+
+## [3.1.0] - 2023-06-05 "Protego"
 
 ### `Added`
 

diff --git a/assets/repertoire_comparison.Rmd b/assets/repertoire_comparison.Rmd
@@ -55,21 +55,10 @@ datadir <- "."
 Number of reads for each of the samples and number of sequences left after performing sequence assembly and alignment to reference data.
 The full table can be found under [Table_sequences_assembly](repertoire_comparison/Sequence_numbers_summary/Table_sequences_assembly.tsv).
 
-```{r seq_numbers, echo=FALSE, warning=FALSE, results='asis'}
-read_table <- function(tab_file){
-        tab_seqs <- read.table(tab_file, header=TRUE, sep="\t", check.names = FALSE)
-        write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
-    }
-tryCatch( {read_table("./Table_sequences.tsv")} ,
-  error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
-)
-
-```
-
-
 ```{r seq_numbers_plot, echo=FALSE, warning=FALSE, results='asis'}
 tryCatch( {
     tab_seqs <- read.table("./Table_sequences.tsv", header=TRUE, sep="\t", check.names = FALSE)
+    write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
 
     plot_table <- tidyr::pivot_longer(tab_seqs,
                                       cols=Sequences_R1:Igblast,
@@ -89,6 +78,8 @@ tryCatch( {
                   theme(axis.text.x= element_text(angle = 45))
 
     ggplotly(seqs_plot)
+
+
   },
   error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
 )
@@ -145,11 +136,7 @@ ggplotly(seqs_plot_assembled)
 # in the current folder
 all_files <- system(paste0("find '", datadir, "' -name '*clone-pass.tsv'"), intern=T)
 
-diversity_dir <- paste(outdir, "Diversity", sep="/")
-abundance_dir <- paste(outdir, "Abundance", sep="/")
 vfamily_dir <- paste(outdir, "V_family", sep="/")
-dir.create(diversity_dir)
-dir.create(abundance_dir)
 dir.create(vfamily_dir)
 
 # Generate one big dataframe from all patient dataframes
@@ -160,29 +147,26 @@ col_select <- c(
   "junction",
   "pcr_target_locus"
 )
-i <- 0
-df_all <- dplyr::bind_rows(lapply(all_files,
-  function(x) {
-    i <<- i +1
-    message("Loading file #",i,":",x)
-    read_rearrangement(x, col_select=col_select)
-  })
-)
+df_all <- dplyr::bind_rows(lapply(all_files, read_rearrangement, col_select=col_select))
+
 
-# Remove underscores in these columns
-df_all$subject_id <- sapply(df_all$subject_id, function(x) str_replace(as.character(x), "_", ""))
-df_all$sample_id <- sapply(df_all$sample_id, function(x) str_replace(as.character(x), "_", ""))
+# Remove underscores in these columns (only needed if including clonal abundance and diversity)
+df_all$subject_id <- stringr::str_replace_all(df_all$subject_id, "_", "")
+df_all$sample_id <- stringr::str_replace_all(df_all$sample_id , "_", "")
 
 # Annotate sample and samplepop (sample + population) by add ing all the conditions
 df_all$subj_locus <- as.factor(paste(df_all$sample_id, df_all$subject_id, df_all$pcr_target_locus, sep="_"))
 
-# Write table to file
-write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
+# Uncomment to save a table with all the sequencess across samples together
+# write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
 
 # Set number of bootrstraps
 nboot <- 200
 ```
 
+
+<!-- Uncomment to include Clonal abundance and clonal diversity in the repertoire comparison report
+
 # Clonal abundance
 
 For plotting the clonal abundance, the clones were ordered by size from bigger clones to smaller clones (x-axis, Rank).
@@ -196,7 +180,15 @@ range of the bootstrap samples.
 
 All clonal abundance plots and tables with abundance values can be found under `repertoire_analysis/Abundance`.
 
-```{r clonal_abundance, echo=FALSE}
+-->
+
+```{r clonal_abundance, echo=FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal abundance
+diversity_dir <- paste(outdir, "Diversity", sep="/")
+abundance_dir <- paste(outdir, "Abundance", sep="/")
+dir.create(diversity_dir)
+dir.create(abundance_dir)
+
 abund <- estimateAbundance(df_all, group = "subj_locus", ci=0.95, nboot=nboot)
 abund@abundance$sample_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[1])
 abund@abundance$subject_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[2])
@@ -220,12 +212,14 @@ p_ca
 
 ```
 
-```{r plot_abundance, include = FALSE}
+```{r plot_abundance, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal abundance
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.pdf"), device="pdf", width = 25, height = 10, units="cm")
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.png"), device="png", width = 25, height = 10, units="cm")
 write.table(abund@abundance, file = paste0(abundance_dir, "/Clonal_abundance_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+<!-- Uncomment to include Clonal diversity and clonal diversity in the repertoire comparison report
 
 # Clonal diversity
 
@@ -264,9 +258,10 @@ To correct for the different number of sequences in each of the samples, the Boo
 in which `r nboot` random bootstrap samples were taken, with size the number of sequences in the sample with less sequences (N).
 The solid line shows the mean Diversity of the bootstrap samples, whereas the transparent area shows the full Diversity
 range of the bootstrap samples.
+-->
 
-
-```{r clonal_diversity, echo = FALSE}
+```{r clonal_diversity, echo = FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal diversity
 sample_div <- alphaDiversity(abund, group="subj_locus", min_q=0, max_q=4, step_q=0.05,
                             ci=0.95, nboot=nboot)
 sample_main <- paste0("Sample diversity (N=", sample_div@n[1], ")")
@@ -285,12 +280,14 @@ div_p <- ggplot(sample_div@diversity, aes(x = q, y = d, group=sample_id)) +
 
 div_p
 ```
-```{r plot_diversity, include = FALSE}
+```{r plot_diversity, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal diversity
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.png"), device="png", width = 25, height = 10, units="cm")
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.pdf"), device="pdf", width = 25, height = 10, units="cm")
 write.table(sample_div@diversity, file = paste0(diversity_dir, "/Clonal_diversity_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+
 # V gene usage
 
 ## V gene family usage

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -15,7 +15,8 @@ def parse_args(args=None):
     Epilog = "Example usage: python check_samplesheet.py <FILE_IN>"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("file_in", help="Input samplesheet file.")
+    parser.add_argument("-a", "--assembled", help="Input samplesheet type", action="store_true", default=False)
     return parser.parse_args(args)
 
 
@@ -38,22 +39,22 @@ def print_error(error, context="Line", context_str=""):
     sys.exit(1)
 
 
-def check_samplesheet(file_in):
+def check_samplesheet(file_in, assembled):
     """
     This function checks that the samplesheet:
 
     - contains the compulsory fields: sample_id, filename_R1, filename_R2, subject_id, pcr_target_locus, species, single_cell
     - sample ids are unique
     - samples from the same subject come from the same species
-    - pcr_target_locus is "IG" or "TR"
+    - pcr_target_locus is "IG"/"ig" or "TR"/"tr"
     - species is "human" or "mouse"
     """
 
     sample_run_dict = {}
     with open(file_in, "r") as fin:
-        ## Check that required columns are present
+        # Defining minimum columns and required columns
         min_cols = 7
-        required_columns = [
+        required_columns_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -66,7 +67,19 @@ def check_samplesheet(file_in):
             "biomaterial_provider",
             "age",
         ]
-        no_whitespaces = [
+        required_columns_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "single_cell",
+            "sex",
+            "tissue",
+            "biomaterial_provider",
+            "age",
+        ]
+        no_whitespaces_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -75,13 +88,52 @@ def check_samplesheet(file_in):
             "pcr_target_locus",
             "tissue",
         ]
+        no_whitespaces_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "tissue",
+        ]
+
+        ## Read header
         header = [x.strip('"') for x in fin.readline().strip().split("\t")]
-        for col in required_columns:
-            if col not in header:
-                print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
-                print("Header is missing column {}".format(col))
-                print("Header must contain columns {}".format("\t".join(required_columns)))
-                raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+        ## Read tab
+        tab = pd.read_csv(file_in, sep="\t", header=0)
+
+        # Check that all required columns for assembled and raw samplesheets are there, and do not contain whitespaces
+        if assembled:
+            for col in required_columns_assembled:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+            for col in no_whitespaces_assembled:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces_assembled
+                        )
+                    )
+
+        else:
+            for col in required_columns_raw:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+            for col in no_whitespaces_raw:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces_raw
+                        )
+                    )
 
         ## Check that rows have the same fields as header, and at least the compulsory ones are provided
         for line_num, line in enumerate(fin):
@@ -103,15 +155,14 @@ def check_samplesheet(file_in):
                 )
 
         ## Check that sample ids are unique
-        tab = pd.read_csv(file_in, sep="\t", header=0)
         if len(tab["sample_id"]) != len(set(tab["sample_id"])):
             print_error(
                 "Sample IDs are not unique! The sample IDs in the input samplesheet should be unique for each sample."
             )
 
         ## Check that pcr_target_locus is IG or TR
         for val in tab["pcr_target_locus"]:
-            if val not in ["IG", "TR"]:
+            if val.upper() not in ["IG", "TR"]:
                 print_error("pcr_target_locus must be one of: IG, TR.")
 
         ## Check that species is human or mouse
@@ -129,20 +180,10 @@ def check_samplesheet(file_in):
                     "The same subject_id cannot belong to different species! Check input file columns 'subject_id' and 'species'."
                 )
 
-        ## Check that values do not contain spaces in the no whitespaces columns
-        for col in no_whitespaces:
-            values = tab[col].tolist()
-            if any([re.search(r"\s+", s) for s in values]):
-                print_error(
-                    "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
-                        col, no_whitespaces
-                    )
-                )
-
 
 def main(args=None):
     args = parse_args(args)
-    check_samplesheet(args.FILE_IN)
+    check_samplesheet(args.file_in, args.assembled)
 
 
 if __name__ == "__main__":

diff --git a/conf/modules.config b/conf/modules.config
@@ -35,6 +35,16 @@ process {
         ]
     }
 
+    // Validate input assembled
+    withName: SAMPLESHEET_CHECK_ASSEMBLED {
+        publishDir = [
+            path: { "${params.outdir}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+        ext.args = '--assembled'
+    }
+
     withName: 'FASTP' {
             publishDir = [
                 [
@@ -398,6 +408,11 @@ process {
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
+        ext.args = ['findthreshold_method':'gmm',
+            'findthreshold_model':'gamma-norm',
+            'findthreshold_edge':0.9,
+            'findthreshold_cutoff':'user',
+            'findthreshold_spc':0.995]
     }
 
     withName: REPORT_THRESHOLD {
@@ -428,7 +443,7 @@ process {
         ]
         ext.args = ['outname':'', 'model':'hierarchical',
                     'method':'nt', 'linkage':'single',
-                    'skip_convergence':true,
+                    'skip_convergence':false,
                     'outputby':'sample_id', 'min_n':30]
     }
 

diff --git a/conf/test_assembled_mm.config b/conf/test_assembled_mm.config
@@ -9,7 +9,7 @@
 
 params {
     config_profile_name        = 'Test assembled mode'
-    config_profile_description = 'Minimal test dataset to test assembled mode'
+    config_profile_description = 'Minimal mouse test dataset to test assembled mode'
 
     // Limit resources so that this can run on GitHub Actions
     max_cpus   = 2