From be281bdcffa87c9e3c00e80e86d230e6820f4004 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 20 Jun 2023 13:42:14 -0400
Subject: [PATCH 01/12] fix report use read rearrangement

---
 assets/repertoire_comparison.Rmd | 58 +++++++++++++++++---------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/assets/repertoire_comparison.Rmd b/assets/repertoire_comparison.Rmd
index 16de9bd6..6f6f758d 100644
--- a/assets/repertoire_comparison.Rmd
+++ b/assets/repertoire_comparison.Rmd
@@ -31,6 +31,7 @@ library(alakazam)
 library(shazam)
 library(stringr)
 library(plotly)
+library(airr)
 
 theme_set(theme_bw(base_family = "ArialMT") +
             theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text = element_text(family="ArialMT")))
@@ -54,21 +55,10 @@ datadir <- "."
 Number of reads for each of the samples and number of sequences left after performing sequence assembly and alignment to reference data.
 The full table can be found under [Table_sequences_assembly](repertoire_comparison/Sequence_numbers_summary/Table_sequences_assembly.tsv).
 
-```{r seq_numbers, echo=FALSE, warning=FALSE, results='asis'}
-read_table <- function(tab_file){
-        tab_seqs <- read.table(tab_file, header=TRUE, sep="\t", check.names = FALSE)
-        write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
-    }
-tryCatch( {read_table("./Table_sequences.tsv")} ,
-  error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
-)
-
-```
-
-
 ```{r seq_numbers_plot, echo=FALSE, warning=FALSE, results='asis'}
 tryCatch( {
     tab_seqs <- read.table("./Table_sequences.tsv", header=TRUE, sep="\t", check.names = FALSE)
+    write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
 
     plot_table <- tidyr::pivot_longer(tab_seqs,
                                       cols=Sequences_R1:Igblast,
@@ -88,6 +78,8 @@ tryCatch( {
                   theme(axis.text.x= element_text(angle = 45))
 
     ggplotly(seqs_plot)
+
+
   },
   error=function(e){message("No sequence numbers are available if starting with assembled reads.")}
 )
@@ -144,33 +136,32 @@ ggplotly(seqs_plot_assembled)
 # in the current folder
 all_files <- system(paste0("find '", datadir, "' -name '*clone-pass.tsv'"), intern=T)
 
-diversity_dir <- paste(outdir, "Diversity", sep="/")
-abundance_dir <- paste(outdir, "Abundance", sep="/")
 vfamily_dir <- paste(outdir, "V_family", sep="/")
-dir.create(diversity_dir)
-dir.create(abundance_dir)
 dir.create(vfamily_dir)
 
 # Generate one big dataframe from all patient dataframes
 
-df_list = lapply(all_files, read.csv, sep="\t")
+df_list = lapply(all_files, read_rearrangement)
 
 df_all <- dplyr::bind_rows(df_list)
 
-# Remove underscores in these columns
-df_all$subject_id <- sapply(df_all$subject_id, function(x) str_replace(as.character(x), "_", ""))
-df_all$sample_id <- sapply(df_all$sample_id, function(x) str_replace(as.character(x), "_", ""))
+# Remove underscores in these columns (only needed if including clonal abundance and diversity)
+df_all$subject_id <- stringr::str_replace_all(df_all$subject_id, "_", "")
+df_all$sample_id <- stringr::str_replace_all(df_all$sample_id , "_", "")
 
 # Annotate sample and samplepop (sample + population) by add ing all the conditions
 df_all$subj_locus <- as.factor(paste(df_all$sample_id, df_all$subject_id, df_all$pcr_target_locus, sep="_"))
 
-# Write table to file
-write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
+# Uncomment to save a table with all the sequences across samples together
+# write.table(df_all, paste0(outdir,"/all_data.tsv"), sep = "\t", quote=F, row.names = F, col.names = T)
 
 # Set number of bootrstraps
 nboot = 200
 ```
 
+
+<!-- Uncomment to include Clonal abundance and clonal diversity in the repertoire comparison report
+
 # Clonal abundance
 
 For plotting the clonal abundance, the clones were ordered by size from bigger clones to smaller clones (x-axis, Rank).
@@ -184,7 +175,15 @@ range of the bootstrap samples.
 
 All clonal abundance plots and tables with abundance values can be found under `repertoire_analysis/Abundance`.
 
-```{r clonal_abundance, echo=FALSE}
+-->
+
+```{r clonal_abundance, echo=FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal abundance
+diversity_dir <- paste(outdir, "Diversity", sep="/")
+abundance_dir <- paste(outdir, "Abundance", sep="/")
+dir.create(diversity_dir)
+dir.create(abundance_dir)
+
 abund <- estimateAbundance(df_all, group = "subj_locus", ci=0.95, nboot=nboot)
 abund@abundance$sample_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[1])
 abund@abundance$subject_id <- sapply(abund@abundance$subj_locus, function(x) unlist(strsplit(as.character(x), "_"))[2])
@@ -208,12 +207,14 @@ p_ca
 
 ```
 
-```{r plot_abundance, include = FALSE}
+```{r plot_abundance, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal abundance
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.pdf"), device="pdf", width = 25, height = 10, units="cm")
 ggsave(plot=p_ca, filename = paste0(abundance_dir,"/Clonal_abundance_subject.png"), device="png", width = 25, height = 10, units="cm")
 write.table(abund@abundance, file = paste0(abundance_dir, "/Clonal_abundance_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+<!-- Uncomment to include clonal diversity in the repertoire comparison report
 
 # Clonal diversity
 
@@ -252,9 +253,10 @@ To correct for the different number of sequences in each of the samples, the Boo
 in which `r nboot` random bootstrap samples were taken, with size the number of sequences in the sample with less sequences (N).
 The solid line shows the mean Diversity of the bootstrap samples, whereas the transparent area shows the full Diversity
 range of the bootstrap samples.
+-->
 
-
-```{r clonal_diversity, echo = FALSE}
+```{r clonal_diversity, echo = FALSE, eval=FALSE}
+# Set line above to eval=TRUE to include clonal diversity (also requires eval=TRUE on the clonal_abundance chunk, which defines `abund` and `diversity_dir`)
 sample_div <- alphaDiversity(abund, group="subj_locus", min_q=0, max_q=4, step_q=0.05,
                             ci=0.95, nboot=nboot)
 sample_main <- paste0("Sample diversity (N=", sample_div@n[1], ")")
@@ -273,12 +275,14 @@ div_p <- ggplot(sample_div@diversity, aes(x = q, y = d, group=sample_id)) +
 
 div_p
 ```
-```{r plot_diversity, include = FALSE}
+```{r plot_diversity, include = FALSE, eval=FALSE}
+# Set to eval=TRUE to include clonal diversity
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.png"), device="png", width = 25, height = 10, units="cm")
 ggsave(plot=div_p, filename=paste0(diversity_dir,"/Diversity_patient_grid.pdf"), device="pdf", width = 25, height = 10, units="cm")
 write.table(sample_div@diversity, file = paste0(diversity_dir, "/Clonal_diversity_data_subject.tsv"), sep="\t", quote = F, row.names = F)
 ```
 
+
 # V gene usage
 
 ## V gene family usage

From 716fc5e3b16f1eb315bd75a2e4e8c2de9826794d Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 20 Jun 2023 14:26:36 -0400
Subject: [PATCH 02/12] add samplesheet check assembled

---
 bin/check_samplesheet.py                    | 90 +++++++++++++++------
 conf/modules.config                         | 10 +++
 modules/local/samplesheet_check.nf          |  3 +-
 subworkflows/local/assembled_input_check.nf |  4 +-
 4 files changed, 81 insertions(+), 26 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index a75fb3c5..478059bc 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -15,7 +15,8 @@ def parse_args(args=None):
     Epilog = "Example usage: python check_samplesheet.py <FILE_IN>"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
-    parser.add_argument("FILE_IN", help="Input samplesheet file.")
+    parser.add_argument("file_in", help="Input samplesheet file.")
+    parser.add_argument("-a", "--assembled", help="Input samplesheet type", action="store_true", default=False)
     return parser.parse_args(args)
 
 
@@ -38,7 +39,7 @@ def print_error(error, context="Line", context_str=""):
     sys.exit(1)
 
 
-def check_samplesheet(file_in):
+def check_samplesheet(file_in, assembled):
     """
     This function checks that the samplesheet:
 
@@ -51,9 +52,10 @@ def check_samplesheet(file_in):
 
     sample_run_dict = {}
     with open(file_in, "r") as fin:
-        ## Check that required columns are present
+
+        # Defining minimum columns and required columns
         min_cols = 7
-        required_columns = [
+        required_columns_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -66,7 +68,19 @@ def check_samplesheet(file_in):
             "biomaterial_provider",
             "age",
         ]
-        no_whitespaces = [
+        required_columns_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "single_cell",
+            "sex",
+            "tissue",
+            "biomaterial_provider",
+            "age",
+        ]
+        no_whitespaces_raw = [
             "sample_id",
             "filename_R1",
             "filename_R2",
@@ -75,13 +89,52 @@ def check_samplesheet(file_in):
             "pcr_target_locus",
             "tissue",
         ]
+        no_whitespaces_assembled = [
+            "sample_id",
+            "filename",
+            "subject_id",
+            "species",
+            "pcr_target_locus",
+            "tissue",
+        ]
+
+        ## Read header
         header = [x.strip('"') for x in fin.readline().strip().split("\t")]
-        for col in required_columns:
-            if col not in header:
-                print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
-                print("Header is missing column {}".format(col))
-                print("Header must contain columns {}".format("\t".join(required_columns)))
-                raise IndexError("Header must contain columns {}".format("\t".join(required_columns)))
+        ## Read tab
+        tab = pd.read_csv(file_in, sep="\t", header=0)
+
+        # Check that all required columns for assembled and raw samplesheets are there, and do not contain whitespaces
+        if assembled:
+            for col in required_columns_assembled:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns_assembled)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns_assembled)))
+            for col in no_whitespaces_assembled:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces
+                        )
+                    )
+
+        else:
+            for col in required_columns_raw:
+                if col not in header:
+                    print("ERROR: Please check samplesheet header: {} ".format(",".join(header)))
+                    print("Header is missing column {}".format(col))
+                    print("Header must contain columns {}".format("\t".join(required_columns_raw)))
+                    raise IndexError("Header must contain columns {}".format("\t".join(required_columns_raw)))
+            for col in no_whitespaces_raw:
+                values = tab[col].tolist()
+                if any([re.search(r"\s+", s) for s in values]):
+                    print_error(
+                        "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
+                            col, no_whitespaces
+                        )
+                    )
 
         ## Check that rows have the same fields as header, and at least the compulsory ones are provided
         for line_num, line in enumerate(fin):
@@ -103,7 +156,6 @@ def check_samplesheet(file_in):
                 )
 
         ## Check that sample ids are unique
-        tab = pd.read_csv(file_in, sep="\t", header=0)
         if len(tab["sample_id"]) != len(set(tab["sample_id"])):
             print_error(
                 "Sample IDs are not unique! The sample IDs in the input samplesheet should be unique for each sample."
@@ -111,7 +163,7 @@ def check_samplesheet(file_in):
 
         ## Check that pcr_target_locus is IG or TR
         for val in tab["pcr_target_locus"]:
-            if val not in ["IG", "TR"]:
+            if str(val).upper() not in ["IG", "TR"]:  # str(): NaN from an empty cell has no .upper()
                 print_error("pcr_target_locus must be one of: IG, TR.")
 
         ## Check that species is human or mouse
@@ -129,20 +181,12 @@ def check_samplesheet(file_in):
                     "The same subject_id cannot belong to different species! Check input file columns 'subject_id' and 'species'."
                 )
 
-        ## Check that values do not contain spaces in the no whitespaces columns
-        for col in no_whitespaces:
-            values = tab[col].tolist()
-            if any([re.search(r"\s+", s) for s in values]):
-                print_error(
-                    "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
-                        col, no_whitespaces
-                    )
-                )
+
 
 
 def main(args=None):
     args = parse_args(args)
-    check_samplesheet(args.FILE_IN)
+    check_samplesheet(args.file_in, args.assembled)
 
 
 if __name__ == "__main__":
diff --git a/conf/modules.config b/conf/modules.config
index d16975a4..2203a65c 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -35,6 +35,16 @@ process {
         ]
     }
 
+    // Validate input assembled
+    withName: SAMPLESHEET_CHECK_ASSEMBLED {
+        publishDir = [
+            path: { "${params.outdir}/pipeline_info" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+        ext.args = '--assembled'
+    }
+
     withName: 'FASTP' {
             publishDir = [
                 [
diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf
index b9593c98..757851a7 100644
--- a/modules/local/samplesheet_check.nf
+++ b/modules/local/samplesheet_check.nf
@@ -18,8 +18,9 @@ process SAMPLESHEET_CHECK {
     task.ext.when == null || task.ext.when
 
     script: // This script is bundled with the pipeline, in nf-core/airrflow/bin/
+    def args = task.ext.args ?: ''
     """
-    check_samplesheet.py $samplesheet
+    check_samplesheet.py $samplesheet $args
     cp $samplesheet samplesheet.valid.tsv
 
     cat <<-END_VERSIONS > versions.yml
diff --git a/subworkflows/local/assembled_input_check.nf b/subworkflows/local/assembled_input_check.nf
index b37b359f..3b519c6f 100644
--- a/subworkflows/local/assembled_input_check.nf
+++ b/subworkflows/local/assembled_input_check.nf
@@ -3,6 +3,7 @@
  */
 
 include { VALIDATE_INPUT } from '../../modules/local/enchantr/validate_input'
+include { SAMPLESHEET_CHECK as SAMPLESHEET_CHECK_ASSEMBLED } from '../../modules/local/samplesheet_check'
 
 workflow ASSEMBLED_INPUT_CHECK {
     take:
@@ -12,8 +13,7 @@ workflow ASSEMBLED_INPUT_CHECK {
     cloneby
 
     main:
-    // TODO: validate input should check that sample_ids are unique
-
+    SAMPLESHEET_CHECK_ASSEMBLED ( samplesheet )
     VALIDATE_INPUT ( samplesheet, miairr, collapseby, cloneby ) //removed reassign
     ch_validated_input = VALIDATE_INPUT.out.validated_input
     ch_validated_input

From b6c89e90f7dbcf2611bcff82029630bc7aee4f60 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 20 Jun 2023 14:41:12 -0400
Subject: [PATCH 03/12] fix black linting

---
 bin/check_samplesheet.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 478059bc..87475e56 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -52,7 +52,6 @@ def check_samplesheet(file_in, assembled):
 
     sample_run_dict = {}
     with open(file_in, "r") as fin:
-
         # Defining minimum columns and required columns
         min_cols = 7
         required_columns_raw = [
@@ -182,8 +181,6 @@ def check_samplesheet(file_in, assembled):
                 )
 
 
-
-
 def main(args=None):
     args = parse_args(args)
     check_samplesheet(args.file_in, args.assembled)

From db676779d77a746d0ef698444d2ea3a2b8c7a0a3 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 20 Jun 2023 14:42:09 -0400
Subject: [PATCH 04/12] back to dev version

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index b25143ba..0f72f9dc 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -310,7 +310,7 @@ manifest {
     description     = """B and T cell repertoire analysis pipeline with the Immcantation framework."""
     mainScript      = 'main.nf'
     nextflowVersion = '!>=22.10.1'
-    version         = '3.1.0'
+    version         = '3.2.0dev'
     doi             = '10.5281/zenodo.2642009'
 }
 

From c42be4f0315a4135ad2448a47e7bf7cc765f4273 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Tue, 20 Jun 2023 14:45:48 -0400
Subject: [PATCH 05/12] fix var name

---
 bin/check_samplesheet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 87475e56..f3b10dc9 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -115,7 +115,7 @@ def check_samplesheet(file_in, assembled):
                 if any([re.search(r"\s+", s) for s in values]):
                     print_error(
                         "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
-                            col, no_whitespaces
+                            col, no_whitespaces_assembled
                         )
                     )
 
@@ -131,7 +131,7 @@ def check_samplesheet(file_in, assembled):
                 if any([re.search(r"\s+", s) for s in values]):
                     print_error(
                         "The column {} contains values with whitespaces. Please ensure that there are no tabs, spaces or any other whitespaces in these columns as well: {}".format(
-                            col, no_whitespaces
+                            col, no_whitespaces_raw
                         )
                     )
 

From 402cb7f4edb6b4e44df73fc3db5f3900b3313544 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Wed, 21 Jun 2023 09:22:56 -0400
Subject: [PATCH 06/12] Add params findthreshold

---
 conf/modules.config | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/conf/modules.config b/conf/modules.config
index 2203a65c..6db8fe20 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -408,6 +408,11 @@ process {
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
+        ext.args = ['findthreshold_method':'gmm',
+            'findthreshold_model':'gamma-norm',
+            'findthreshold_edge':0.9,
+            'findthreshold_cutoff':'user',
+            'findthreshold_spc':0.995]
     }
 
     withName: REPORT_THRESHOLD {

From b53a6a19aaf8cbfa0f3a53f2ee463b9648478057 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Wed, 21 Jun 2023 09:33:47 -0400
Subject: [PATCH 07/12] update changelog

---
 CHANGELOG.md | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e7e1fd26..adda8a37 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,18 @@
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [3.1] - 2023-06-05 "Protego"
+## [3.2.0dev] -
+
+### `Added`
+
+- Added parameters for FindThreshold in `modules.config`.
+
+### `Fixed`
+
+### `Dependencies`
+
+
+## [3.1.0] - 2023-06-05 "Protego"
 
 ### `Added`
 

From 359c4b12d8985cda2e26a44cddb2ba00cc48c045 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Thu, 22 Jun 2023 13:17:17 -0400
Subject: [PATCH 08/12] enable convergence define clones report

---
 conf/modules.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/modules.config b/conf/modules.config
index 6db8fe20..1737a926 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -443,7 +443,7 @@ process {
         ]
         ext.args = ['outname':'', 'model':'hierarchical',
                     'method':'nt', 'linkage':'single',
-                    'skip_convergence':true,
+                    'skip_convergence':false,
                     'outputby':'sample_id', 'min_n':30]
     }
 

From 56e4c8669bc2c707a250e74b740ceac9f4c52ac0 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Thu, 22 Jun 2023 15:00:38 -0400
Subject: [PATCH 09/12] allow for locus lowercase

---
 bin/check_samplesheet.py                        | 2 +-
 docs/usage.md                                   | 7 +++----
 modules/local/changeo/changeo_parsedb_select.nf | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index f3b10dc9..bc686deb 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -46,7 +46,7 @@ def check_samplesheet(file_in, assembled):
     - contains the compulsory fields: sample_id, filename_R1, filename_R2, subject_id, pcr_target_locus, species, single_cell
     - sample ids are unique
     - samples from the same subject come from the same species
-    - pcr_target_locus is "IG" or "TR"
+    - pcr_target_locus is "IG"/"ig" or "TR"/"tr"
     - species is "human" or "mouse"
     """
 
diff --git a/docs/usage.md b/docs/usage.md
index f7f4f931..54c692dd 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -111,15 +111,14 @@ The metadata specified in the input file will then be automatically annotated in
 
 ## Assembled input samplesheet (bulk or single-cell)
 
-The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required.
+The required input file for processing assembled BCR or TCR sequences (bulk or single-cell) is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `pcr_target_locus`, `sex`, `age` and `biomaterial_provider` are required.
 
 An example samplesheet is
 
 | filename                                                 | species | subject_id | sample_id                         | tissue     | sex  | age | biomaterial_provider | pcr_target_locus | single_cell |
 | -------------------------------------------------------- | ------- | ---------- | --------------------------------- | ---------- | ---- | --- | -------------------- | ---------------- | ----------- |
-| sc5p_v2_hs_PBMC_1k_b_airr_rearrangement.tsv              | human   | subject_x  | sc5p_v2_hs_PBMC_1k_5fb            | PBMC       | NA   | NA  | 10x Genomics         | ig               | TRUE        |
-| sc5p_v2_mm_c57bl6_splenocyte_1k_b_airr_rearrangement.tsv | mouse   | mouse_x    | sc5p_v2_mm_c57bl6_splenocyte_1k_b | splenocyte | NA   | NA  | 10x Genomics         | ig               | TRUE        |
-| bulk-Laserson-2014.fasta                                 | human   | PGP1       | PGP1                              | PBMC       | male | NA  | Laserson-2014        | ig               | FALSE       |
+| sc5p_v2_hs_PBMC_1k_b_airr_rearrangement.tsv              | human   | subject_x  | sc5p_v2_hs_PBMC_1k_5fb            | PBMC       | NA   | NA  | 10x Genomics         | IG               | TRUE        |
+| bulk-Laserson-2014.fasta                                 | human   | PGP1       | PGP1                              | PBMC       | male | NA  | Laserson-2014        | IG               | FALSE       |
 
 ## Supported AIRR metadata fields
 
diff --git a/modules/local/changeo/changeo_parsedb_select.nf b/modules/local/changeo/changeo_parsedb_select.nf
index 0bab80ab..9f592e7f 100644
--- a/modules/local/changeo/changeo_parsedb_select.nf
+++ b/modules/local/changeo/changeo_parsedb_select.nf
@@ -20,7 +20,7 @@ process CHANGEO_PARSEDB_SELECT {
     script:
     def args = task.ext.args ?: ''
     def args2 = task.ext.args2 ?: ''
-    if (meta.locus == 'IG'){
+    if (meta.locus.toUpperCase() == 'IG'){
         """
         ParseDb.py select -d $tab $args --outname ${meta.id} > ${meta.id}_select_command_log.txt
 
@@ -30,7 +30,7 @@ process CHANGEO_PARSEDB_SELECT {
             changeo: \$( ParseDb.py --version | awk -F' '  '{print \$2}' )
         END_VERSIONS
         """
-    } else if (meta.locus == 'TR'){
+    } else if (meta.locus.toUpperCase() == 'TR'){
         """
         ParseDb.py select -d $tab $args2 --outname ${meta.id} > "${meta.id}_command_log.txt"
 

From dfaff527741c8f9ef86902e2d55dc96bec589f55 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Thu, 22 Jun 2023 15:04:24 -0400
Subject: [PATCH 10/12] fix linting

---
 .github/workflows/ci.yml              | 3 ++-
 .github/workflows/ci_immcantation.yml | 7 ++++++-
 CHANGELOG.md                          | 1 -
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3e586322..11eab3db 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -48,7 +48,8 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
+        profile:
+          ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
       fail-fast: false
     steps:
       - name: Check out pipeline code
diff --git a/.github/workflows/ci_immcantation.yml b/.github/workflows/ci_immcantation.yml
index c01cacd5..8669923d 100644
--- a/.github/workflows/ci_immcantation.yml
+++ b/.github/workflows/ci_immcantation.yml
@@ -25,7 +25,12 @@ jobs:
         NXF_VER:
           - "22.10.1"
           - "latest-everything"
-        profile: ["test_assembled_immcantation_devel_hs", "test_assembled_immcantation_devel_mm", "test_raw_immcantation_devel"]
+        profile:
+          [
+            "test_assembled_immcantation_devel_hs",
+            "test_assembled_immcantation_devel_mm",
+            "test_raw_immcantation_devel",
+          ]
       fail-fast: false
     steps:
       - name: Check out pipeline code
diff --git a/CHANGELOG.md b/CHANGELOG.md
index adda8a37..c47dc1a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Dependencies`
 
-
 ## [3.1.0] - 2023-06-05 "Protego"
 
 ### `Added`

From 074c4f532baa98d7f0dfb8026be22586580920e6 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Thu, 22 Jun 2023 15:08:17 -0400
Subject: [PATCH 11/12] update changelog

---
 CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c47dc1a8..d34d1fec 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ### `Added`
 
-- Added parameters for FindThreshold in `modules.config`.
+- [#268](https://github.com/nf-core/airrflow/pull/268) Added parameters for FindThreshold in `modules.config`.
+- [#268](https://github.com/nf-core/airrflow/pull/268) Validate samplesheet also for `assembled` samplesheet.
 
 ### `Fixed`
 
+- [#268](https://github.com/nf-core/airrflow/pull/268) Allows for uppercase and lowercase locus in samplesheet `pcr_target_locus`.
+
 ### `Dependencies`
 
 ## [3.1.0] - 2023-06-05 "Protego"

From 9bcef93c5b3d42cdee09a5f7983576a1794beee4 Mon Sep 17 00:00:00 2001
From: Gisela Gabernet <gisela.gabernet@gmail.com>
Date: Thu, 22 Jun 2023 15:09:14 -0400
Subject: [PATCH 12/12] fix linting

---
 docs/usage.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 54c692dd..e54b10f8 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -115,10 +115,10 @@ The required input file for processing raw BCR or TCR bulk targeted sequencing d
 
 An example samplesheet is
 
-| filename                                                 | species | subject_id | sample_id                         | tissue     | sex  | age | biomaterial_provider | pcr_target_locus | single_cell |
-| -------------------------------------------------------- | ------- | ---------- | --------------------------------- | ---------- | ---- | --- | -------------------- | ---------------- | ----------- |
-| sc5p_v2_hs_PBMC_1k_b_airr_rearrangement.tsv              | human   | subject_x  | sc5p_v2_hs_PBMC_1k_5fb            | PBMC       | NA   | NA  | 10x Genomics         | IG               | TRUE        |
-| bulk-Laserson-2014.fasta                                 | human   | PGP1       | PGP1                              | PBMC       | male | NA  | Laserson-2014        | IG               | FALSE       |
+| filename                                    | species | subject_id | sample_id              | tissue | sex  | age | biomaterial_provider | pcr_target_locus | single_cell |
+| ------------------------------------------- | ------- | ---------- | ---------------------- | ------ | ---- | --- | -------------------- | ---------------- | ----------- |
+| sc5p_v2_hs_PBMC_1k_b_airr_rearrangement.tsv | human   | subject_x  | sc5p_v2_hs_PBMC_1k_5fb | PBMC   | NA   | NA  | 10x Genomics         | IG               | TRUE        |
+| bulk-Laserson-2014.fasta                    | human   | PGP1       | PGP1                   | PBMC   | male | NA  | Laserson-2014        | IG               | FALSE       |
 
 ## Supported AIRR metadata fields