diff --git a/docs/source/index.rst b/docs/source/index.rst index f80b1562..c444d231 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,6 +13,7 @@ Then, DROP can be executed in multiple ways (:doc:`pipeline`). installation prepare pipeline + output license help @@ -24,7 +25,7 @@ We recommend using a dedicated conda environment. (installation time: ~ 10min) .. code-block:: bash - mamba install -c conda-forge -c bioconda drop + mamba create -n drop -c conda-forge -c bioconda drop Test installation with demo project diff --git a/docs/source/output.rst b/docs/source/output.rst new file mode 100644 index 00000000..c4b0e2a6 --- /dev/null +++ b/docs/source/output.rst @@ -0,0 +1,110 @@ +Results and Output of DROP +=========================== + +DROP is intended to help researchers use RNA-Seq data in order to detect genes with aberrant expression, +aberrant splicing and mono-allelic expression. By simplifying the workflow process we hope to provide +easy-to-read HTML files and output files. This section explains the results files. The paths of the output +files correspond to the ones from the demo (that can be run with the following code snippet):: + + #install drop + mamba create -n drop_env -c conda-forge -c bioconda drop + conda activate drop_env + + mkdir drop_demo + cd drop_demo + drop demo + + snakemake -c1 + +Aberrant Expression ++++++++++++++++++++ + +HTML file +######### +Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantExpression`` +tab at the top of the screen. The Overview tab contains links to the: + +* Counts Summaries for each aberrant expression group + * number of local and external samples + * Mapped reads and size factors for each sample + * histograms showing the mean count distribution with different conditions + * expressed genes within each sample and as a dataset +* Outrider Summaries for each aberrant expression group + * aberrantly expressed genes per sample + * correlation between samples before and after the autoencoder + * biological coefficient of variation + * aberrant samples + * results table +* Files for each aberrant expression group + * OUTRIDER datasets + * Follow the `OUTRIDER vignette `_ for individual OUTRIDER object file (ods) analysis. + * Results tables + * ``results.tsv`` this text file contains only the significant genes and samples that meet the cutoffs defined in the config file for ``padjCutoff`` and ``zScoreCutoff`` + +Local result files +################## +Additionally the ``aberrantExpression`` module creates the file ``Output/processed_results/aberrant_expression/{annotation}/outrider/{drop_group}/OUTRIDER_results_all.Rds``. This file contains the entire OUTRIDER results table regardless of significance. + +Aberrant Splicing ++++++++++++++++++ + +HTML file +########## +Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``AberrantSplicing`` +tab at the top of the screen. The Overview tab contains links to the: + +* Counting Summaries for each aberrant splicing group + * number of local and external samples + * number introns/splice sites before and after merging + * comparison of local and external mean counts + * histograms showing the junction expression before and after filtering and variability +* FRASER Summaries for each aberrant splicing group + * the number of samples, introns, and splice sites + * correlation between samples before and after the autoencoder + * results table +* Files for each aberrant splicing group + * FRASER datasets (fds) + * Follow the `FRASER vignette `_ for individual FRASER object file (fds) analysis. + * Results tables + * ``results_per_junction.tsv`` this text file contains only significant junctions that meet the cutoffs defined in the config file. + +Local result files +################## +Additionally the ``aberrantSplicing`` module creates the following file ``Output/processed_results/aberrant_splicing/results/{annotation}/fraser/{drop_group}/results.tsv``. +This text file contains only significant junctions that meet the cutoffs defined in the config file, aggregated at the gene level. Any sample/gene pair is represented by only the most significant junction. + +Mono-allelic Expression ++++++++++++++++++++++++ + +HTML file +########## +Looking at the resulting ``Output/html/drop_demo_index.html`` we can see the ``MonoallelicExpression`` +tab at the top of the screen. The Overview tab contains links to the: + +* Results for each mae group + * number of samples, genes, and mono-allelically expressed heterozygous SNVs + * a cascade plot that shows additional filters + * histogram of inner cohort frequency + * summary of the cascade plot and results table +* Files for each mae group + * Allelic counts + * a directory containing the allelic counts of heterozygous variants + * Results data tables of each sample (.Rds) + * Rds objects containing the full results table regardless of MAE status + * Significant MAE results tables + * a link to the results file + * Only contains significant MAE for the alternative allele results and results that pass the config file cutoffs +* Quality Control + * QC Overview + * For each mae group QC checks for DNA/RNA matching + +Local result files +################## +Additionally the ``mae`` module creates the following files: + +* ``Output/processed_results/mae/{drop_group}/MAE_results_all_{annotation}.tsv.gz`` + * this file contains the MAE results of all heterozygous SNVs regardless of significance +* ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}.tsv`` + * this is the file linked in the HTML document and described above +* ``Output/processed_results/mae/{drop_group}/MAE_results_{annotation}_rare.tsv`` + * this file is a subset of ``MAE_results_{annotation}.tsv`` with only the variants that pass the allele frequency cutoffs. If ``add_AF`` is set to ``true`` in config file must meet minimum AF set by ``max_AF``. Additionally, the inner-cohort frequency must meet the ``maxVarFreqCohort`` cutoff diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst index 249b438d..48754a3c 100644 --- a/docs/source/prepare.rst +++ b/docs/source/prepare.rst @@ -46,6 +46,7 @@ When providing a path to a file or directory, please provide the *full system pa Global parameters +++++++++++++++++ +These parameters are applied to multiple modules and as a result should be consistent throughout the data you are analyzing =================== ========== ======================================================================================================================================= ====== Parameter Type Description Default/Examples @@ -55,9 +56,9 @@ htmlOutputPath character Full path of the folder where the HTML files ar indexWithFolderName boolean If true, the basename of the project directory will be used as prefix for the index.html file ``true`` genomeAssembly character Either hg19/hs37d5 or hg38/GRCh38, depending on the genome assembly used for mapping ``/data/project1`` sampleAnnotation character Full path of the sample annotation table ``/data/project1/sample_annotation.tsv`` -root character Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files. ``/data/project1`` +root character Full path of the folder where the sub-directories processed_data and processed_results will be created containing DROP's output files. ``/data/project1`` genome character Full path of a human reference genome fasta file ``/path/to/hg19.fa`` -genome dictionary (Optional) Multiple fasta files can be specified when RNA-seq BAM files belong to different genome assemblies (eg, ncbi, ucsc). ``ncbi: /path/to/hg19_ncbi.fa`` +genome dictionary (Optional) Multiple fasta files can be specified when RNA-seq BAM files belong to different genome. assemblies (eg, ncbi, ucsc). ``ncbi: /path/to/hg19_ncbi.fa`` ``ucsc: /path/to/hg19_ucsc.fa`` geneAnnotation dictionary A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided. ``anno1: /path/to/gtf1.gtf`` @@ -73,6 +74,10 @@ tools dictionary A key-value list of different commands (key) an Export counts dictionary ++++++++++++++++++++++++ +These parameters are directly used by the ``exportCounts`` snakemake command. This section +is used to designate which aberrant expression and aberrant splicing groups should be exported +into datasets that can be shared. To avoid sharing sensitive data, only the canonical annotations +as described by `geneAnnotations` are exported. Only the groups excluded by `excludeGroups` are not exported. =============== ==== ========================================================================================================================== ====== Parameter Type Description Default/Examples @@ -84,6 +89,8 @@ excludeGroups list aberrant expression and aberrant splicing groups whose co Aberrant expression dictionary ++++++++++++++++++++++++++++++ +These parameters are directly used by the ``aberrantExpression`` snakemake command. Aberrant expression groups must have at least ``10`` +samples per group. To use external counts please see the ``Using External Counts`` section. ============================ ========= ================================================================================================================================= ====== Parameter Type Description Default/Examples @@ -102,6 +109,8 @@ maxTestedDimensionProportion numeric An integer that controls the maximum va Aberrant splicing dictionary ++++++++++++++++++++++++++++ +These parameters are directly used by the ``aberrantSplicing`` snakemake command. Aberrant splicing groups must have at least ``10`` +samples per group. To use external counts please see the ``Using External Counts`` section. ============================ ========= ============================================================================================ ====== Parameter Type Description Default/Examples @@ -122,8 +131,10 @@ maxTestedDimensionProportion numeric Same as in aberrant expression. ============================ ========= ============================================================================================ ====== -Mono-allelic expression dictionary +Mono-allelic expression (MAE) dictionary ++++++++++++++++++++++++++++++++++ +These parameters are directly used by the ``mae`` snakemake command. MAE groups are not bound by a minimum number of samples, +but require additional information in the sample annotation table. ===================== ========= ======================================================================================================================== ====== Parameter Type Description Default/Examples @@ -148,86 +159,117 @@ For example, if the AberrantExpression module is set to false, the ``Scripts/Ab Creating the sample annotation table ------------------------------------ - For a detailed explanation of the columns of the sample annotation, please refer to -Box 3 of the `DROP manuscript `_. +Box 3 of the `DROP manuscript `_. Although some information has been updated since puplication, please use this documentation as the preferred syntax/formatting. Each row of the sample annotation table corresponds to a unique pair of RNA and DNA samples derived from the same individual. An RNA assay can belong to one or more DNA assays, and vice-versa. If so, they must be specified in different rows. The required columns are ``RNA_ID``, ``RNA_BAM_FILE`` and ``DROP_GROUP``, plus other module-specific -ones (see DROP manuscript). +ones (see DROP manuscript). The following columns describe the RNA-seq experimental setup: ``PAIRED_END``, ``STRAND``, ``COUNT_MODE`` and ``COUNT_OVERLAPS``. They affect the counting procedures of the aberrant expression and splicing modules. For a detailed explanation, refer to the documentation of `HTSeq `_. -To run the MAE module, the columns ``DNA_ID`` and ``DNA_VCF_FILE`` are needed. - -In case external counts are included, add a new row for each sample from those -files (or a subset if not all samples are needed). Add the columns: ``GENE_COUNTS_FILE``, -``GENE_ANNOTATON``, ``SPLIT_COUNTS_FILE`` and ``NON_SPLIT_COUNTS_FILE``. See examples below. +To run the MAE module, the columns ``DNA_ID`` and ``DNA_VCF_FILE`` are needed. MAE can not be run +in samples using external counts as we need to use the ``RNA_BAM_FILE`` to count reads supporting +each allele of the heterozygous variants found in the ``DNA_VCF_FILE``. In case RNA-seq BAM files belong to different genome assemblies (eg, ncbi, ucsc), multiple reference genome fasta files can be specified. Add a column called `GENOME` that contains, for each sample, the key from the `genome` parameter in the config file that matches its genome assembly (eg, ncbi or ucsc). - The sample annotation file must be saved in the tab-separated values (tsv) format. The column order does not matter. Also, it does not matter where it is stored, as the path is specified in the config file. Here we provide some examples on how to deal with certain situations. For simplicity, we do not include all possible columns in the examples. -Example of RNA replicates -++++++++++++++++++++++++++++++++++ -====== ====== ========== =================== == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE -====== ====== ========== =================== == -S10R_B S10G BLOOD /path/to/S10R_B.BAM /path/to/S10G.vcf.gz -S10R_M S10G MUSCLE /path/to/S10R_M.BAM /path/to/S10G.vcf.gz -====== ====== ========== =================== == - -Example of DNA replicates +Using External Counts ++++++++++++++++++++++++++++++++++ +DROP can utilize external counts for the ``aberrantExpression`` and ``aberrantSplicing`` modules +which can enhance the statistical power of these modules by providing more samples from which we +can build a distribution of counts and detect outliers. However this process introduces some +particular issues that need to be addressed to make sure it is a valuable addition to the experiment. -====== ====== ========== ================= == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE -====== ====== ========== ================= == -S20R S20E WES /path/to/S20R.BAM /path/to/S20E.vcf.gz -S20R S20G WGS /path/to/S20R.BAM /path/to/S20G.vcf.gz -====== ====== ========== ================= == - -Example of a multi-sample vcf file -++++++++++++++++++++++++++++++++++ - -====== ====== ========== ================= == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE -====== ====== ========== ================= == -S10R S10G WGS /path/to/S10R.BAM /path/to/multi_sample.vcf.gz -S20R S20G WGS /path/to/S20R.BAM /path/to/multi_sample.vcf.gz -====== ====== ========== ================= == - -External count matrices +In case external counts are included, add a new row for each sample from those +files (or a subset if not all samples are needed). Add the columns: ``GENE_COUNTS_FILE`` +(for aberrant expression), ``GENE_ANNOTATON``, and ``SPLICE_COUNTS_DIR`` (for aberrant splicing). +These columns should remain empty for samples processed locally (from ``RNA_BAM``). + +Aberrant Expression +#################### +Using external counts for aberrant expression forces you to use the exact same gene annotation for each +external sample as well as using the same gene annotation file specified in the config file +``Global parameters`` section. This is to avoid potential mismatching on counting, 2 different gene +annotations could drastically affect which reads are counted in which region drastically skewing the results. + +The user must also use special consideration when building the sample annotation table. Samples +using external counts need only ``RNA_ID`` which must exactly match the column header in the external count file +``DROP_GROUP``, ``GENE_COUNTS_FILE``, and ``GENE_ANNOTATION`` which must contain the exact key specified in the config. +The other columns should remain empty. + +Using ``exportCounts`` generates the sharable ``GENE_COUNTS_FILE`` file in the appropriate +``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory. + +Aberrant Splicing +################## +Using external counts for aberrant splicing reduces the number of introns processed to only those +that are exactly the same between the local and external junctions. Because rare junctions may be +personally identifiable the ``exportCounts`` command only exports regions canonically mentioned in the gtf file. +As a result, when merging the external counts with the local counts we only match introns that are **exact** between +the 2 sets, this is to ensure that if a region is missing we don't introduce 0 counts into the distribution calculations. + +The user must also use special consideration when building the sample annotation table. Samples +using external counts need only ``RNA_ID`` which must exactly match the column header in the external count file +``DROP_GROUP``, and ``SPLICE_COUNTS_DIR``. ``SPLICE_COUNTS_DIR`` is the directory containing the set of 5 needed count files. +The other columns should remain empty. + +Using ``exportCounts`` generates the necessary files in the appropriate +``ROOT_DIR/Output/processed_results/exported_counts/`` sub-directory + +``SPLICE_COUNTS_DIR`` should contain the following: + +* k_j_counts.tsv.gz +* k_theta_counts.tsv.gz +* n_psi3_counts.tsv.gz +* n_psi5_counts.tsv.gz +* n_theta_counts.tsv.gz + +Publicly available DROP external counts +####################################### +You can find different sets of publicly available external counts to add to your +analysis on our `github page `_ + +If you want to contribute with your own count matrices, please contact us: yepez at in.tum.de (yepez@in.tum.de) + +External count examples +++++++++++++++++++++++ In case counts from external matrices are to be integrated into the analysis, -the file must be specified in the GENE_COUNTS_FILE column. A new row must be -added for each sample from the count matrix that should be included in the -analysis. An RNA_BAM_FILE must not be specified. The DROP_GROUP of the local +the sample annotation must be built in a particular way +A new row must be added for each sample from the count matrix that should be included in the +analysis. The ``RNA_ID`` must match the column header of the external files, +the ``RNA_BAM_FILE`` must not be specified. The ``DROP_GROUP`` of the local and external samples that are to be analyzed together must be the same. -Similarly, the GENE_ANNOTATION of the external counts and the key of the `geneAnnotation` +For aberrant expression, the GENE_ANNOTATION of the external counts and the key of the `geneAnnotation` parameter from the config file must match. -====== ====== ========== ================= ============================== == -RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE GENE_COUNTS_FILE GENE_ANNOTATION -====== ====== ========== ================= ============================== == -S10R S10G BLOOD /path/to/S10R.BAM -EXT-1R BLOOD /path/to/externalCounts.tsv.gz gencode34 -EXT-2R BLOOD /path/to/externalCounts.tsv.gz gencode34 -====== ====== ========== ================= ============================== == +This example will use the ``DROP_GROUP`` BLOOD_AE for the aberrant expression module (containing S10R, EXT-1R, EXT-2R) and +the ``DROP_GROUP`` BLOOD_AS for the aberrant expression module (containing S10R, EXT-3R, EXT-4R) + +====== ====== ========== ================= ============================== =============== ======================== +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE GENE_COUNTS_FILE GENE_ANNOTATION SPLICE_COUNTS_DIR +====== ====== ========== ================= ============================== =============== ======================== +S10R S10G BLOOD_AE /path/to/S10R.BAM +EXT-1R BLOOD_AE /path/to/externalCounts.tsv.gz gencode34 +EXT-2R BLOOD_AE /path/to/externalCounts.tsv.gz gencode34 +EXT-3R BLOOD_AS /path/to/externalCountDir +EXT-4R BLOOD_AS /path/to/externalCountDir +====== ====== ========== ================= ============================== =============== ======================== .. _filesdownload: @@ -248,8 +290,39 @@ Download it and indicate the full path to it in the ``hpoFile`` key. The file is only needed in case HPO terms are specified in the sample annotation. Otherwise, write ``null`` in the ``hpoFile`` key. + .. _advancedoptions: +Example of RNA replicates +++++++++++++++++++++++++++++++++++ + +====== ====== ========== =================== == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE +====== ====== ========== =================== == +S10R_B S10G BLOOD /path/to/S10R_B.BAM /path/to/S10G.vcf.gz +S10R_M S10G MUSCLE /path/to/S10R_M.BAM /path/to/S10G.vcf.gz +====== ====== ========== =================== == + +Example of DNA replicates +++++++++++++++++++++++++++++++++++ + +====== ====== ========== ================= == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE +====== ====== ========== ================= == +S20R S20E WES /path/to/S20R.BAM /path/to/S20E.vcf.gz +S20R S20G WGS /path/to/S20R.BAM /path/to/S20G.vcf.gz +====== ====== ========== ================= == + +Example of a multi-sample vcf file +++++++++++++++++++++++++++++++++++ + +====== ====== ========== ================= == +RNA_ID DNA_ID DROP_GROUP RNA_BAM_FILE DNA_VCF_FILE +====== ====== ========== ================= == +S10R S10G WGS /path/to/S10R.BAM /path/to/multi_sample.vcf.gz +S20R S20G WGS /path/to/S20R.BAM /path/to/multi_sample.vcf.gz +====== ====== ========== ================= == + Advanced options ---------------- @@ -284,4 +357,3 @@ In additon, DROP allows that BAM files from RNA-seq were aligned to one genome assembly (eg ucsc) and the corresponding VCF files from DNA sequencing to another genome assembly (eg ncbi). If so, the assembly of the reference genome fasta file must correspond to the one of the BAM file from RNA-seq. - diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py index 5044d753..f02e7023 100644 --- a/drop/config/ExportCounts.py +++ b/drop/config/ExportCounts.py @@ -27,7 +27,7 @@ def __init__( self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"] self.config_dict = self.setDefaults(dict_, genome.annotation) self.outputRoot = outputRoot / "exported_counts" - self.sa = sampleAnnotation + self.sampleAnnotation = sampleAnnotation self.genomeAssembly = genome.assembly self.geneAnnotations = self.get("geneAnnotations") self.modules = { diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py index 2704cd89..4ea6dbe3 100644 --- a/drop/config/SampleAnnotation.py +++ b/drop/config/SampleAnnotation.py @@ -9,7 +9,7 @@ class SampleAnnotation: - FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE"] + FILE_TYPES = ["RNA_BAM_FILE", "DNA_VCF_FILE", "GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR"] SAMPLE_ANNOTATION_COLUMNS = FILE_TYPES + [ "RNA_ID", "DNA_ID", "DROP_GROUP", "GENE_ANNOTATION", "PAIRED_END", "COUNT_MODE", "COUNT_OVERLAPS", "STRAND", "GENOME" @@ -32,6 +32,7 @@ def __init__(self, file, root, genome): self.dnaIDs = self.createGroupIds(file_type="DNA_VCF_FILE", sep=',') # external counts self.extGeneCountIDs = self.createGroupIds(file_type="GENE_COUNTS_FILE", sep=',') + self.extSpliceCountIDs = self.createGroupIds(file_type="SPLICE_COUNTS_DIR", sep=',') def parse(self, sep='\t'): """ @@ -39,24 +40,25 @@ def parse(self, sep='\t'): clean columns and set types """ data_types = { - "RNA_ID": str, "DNA_ID": str, "DROP_GROUP": str, "GENE_ANNOTATION": str, - "PAIRED_END": bool, "COUNT_MODE": str, "COUNT_OVERLAPS": bool, "STRAND": str, "GENOME": str + "RNA_ID": str, "DNA_ID": str, "DROP_GROUP": str, + "PAIRED_END": bool, "COUNT_MODE": str, "COUNT_OVERLAPS": bool, "STRAND": str, } + optional_columns = {"GENE_COUNTS_FILE", "SPLICE_COUNTS_DIR", "GENE_ANNOTATION", "GENOME"} + sa = pd.read_csv(self.file, sep=sep, index_col=False) missing_cols = [x for x in self.SAMPLE_ANNOTATION_COLUMNS if x not in sa.columns.values] if len(missing_cols) > 0: - if "GENOME" in missing_cols: - # deal with missing columns in data types, remove it to fix checks later - del data_types["GENOME"] - self.SAMPLE_ANNOTATION_COLUMNS.remove("GENOME") - missing_cols.remove("GENOME") - if "GENE_ANNOTATION" in missing_cols and "ANNOTATION" in sa.columns.values: logger.info( "WARNING: GENE_ANNOTATION must be a column in the sample annotation table, ANNOTATION is the old column name and will be deprecated in the future\n") sa["GENE_ANNOTATION"] = sa.pop("ANNOTATION") missing_cols.remove("GENE_ANNOTATION") + for toDel_optional in (set(missing_cols) & optional_columns): + # deal with missing columns in data types, remove it to fix checks later + self.SAMPLE_ANNOTATION_COLUMNS.remove(toDel_optional) + missing_cols.remove(toDel_optional) + if len(missing_cols) > 0: raise ValueError(f"Incorrect columns in sample annotation file. Missing:\n{missing_cols}") @@ -80,10 +82,12 @@ def createSampleFileMapping(self): columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] """ - assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']} + assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE', 'SPLICE_COUNTS_DIR'], 'DNA_ID': ['DNA_VCF_FILE']} assay_subsets = [] for id_, file_types in assay_mapping.items(): for file_type in file_types: + if file_type not in self.annotationTable.columns: + continue df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy() df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True) df['ASSAY'] = id_ @@ -150,13 +154,12 @@ def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','): ### Subsetting - def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True): + def subsetSampleAnnotation(self, column, values, subset=None): """ subset by one or more values of different columns from sample file mapping :param column: valid column in sample annotation :param values: values of column to subset :param subset: subset sample annotation - :param exact_match: whether to match substrings in the sample annotation, false allows substring matching """ sa_cols = set(self.SAMPLE_ANNOTATION_COLUMNS) if subset is None: @@ -168,10 +171,16 @@ def subsetSampleAnnotation(self, column, values, subset=None, exact_match=True): if not sa_cols <= set(subset.columns): # check if mandatory cols not contained raise ValueError(f"Subset columns not the same as {sa_cols}\ngot: {subset.columns}") + + # if you don't want to subset + if values is None: + return subset # check if column is valid - if column not in sa_cols: + elif column not in sa_cols: raise KeyError(f"Column '{column}' not present in sample annotation.") - return utils.subsetBy(subset, column, values, exact_match=exact_match) + #subset column for matching values + else: + return utils.subsetBy(subset, column, values) def subsetFileMapping(self, file_type=None, sample_id=None): """ @@ -230,10 +239,9 @@ def getFilePaths(self, file_type, group=None): return self.getFilePath(sampleIDs, file_type, single_file=False) # build a dictionary from the drop group and column. like getImportCounts with skipping options and dict output - def getGenomes(self, value, group, file_type="RNA_ID", - column="GENOME", group_key="DROP_GROUP",exact_match = True,skip = False): + def getGenomes(self, value, group, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", skip = False): """ - :param value: values to match in the column. Must be an exact match, passed to subsetting sample annotation + :param value: values to match in the column. :param group: a group of the group_key (DROP_GROUP) column. :return: dict file_type to column """ @@ -242,24 +250,30 @@ def getGenomes(self, value, group, file_type="RNA_ID", if skip: subset = None else: - subset = self.subsetSampleAnnotation(column, value,exact_match=True) + subset = self.subsetSampleAnnotation(column, value) # additionally subset for the group_key and the group - subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=exact_match) + subset = self.subsetSampleAnnotation(group_key, group, subset) return {sample_id: value for sample_id in subset[file_type].tolist()} - def getImportCountFiles(self, annotation, group, file_type="GENE_COUNTS_FILE", - annotation_key="GENE_ANNOTATION", group_key="DROP_GROUP",exact_match = True): + def getImportCountFiles(self, annotation, group, file_type: str = "GENE_COUNTS_FILE", + annotation_key: str = "GENE_ANNOTATION", group_key: str = "DROP_GROUP", + asSet: bool = True): """ - :param annotation: annotation name as specified in config and GENE_ANNOTATION column - :param group: a group of the DROP_GROUP column. exact match is passed to subsetter, false allows for substring matching + :param annotation: annotation name as specified in config and GENE_ANNOTATION column. Can be None + :param group: a group of the DROP_GROUP column. :return: set of unique external count file names """ + #subset for the annotation_key in the annotation group and the group_key in the group - subset = self.subsetSampleAnnotation(annotation_key, annotation,exact_match=exact_match) - subset = self.subsetSampleAnnotation(group_key, group, subset,exact_match=False) - return set(subset[file_type].tolist()) + subset = self.subsetSampleAnnotation(annotation_key, annotation) + subset = self.subsetSampleAnnotation(group_key, group, subset) + + ans = subset[file_type].tolist() + if asSet: + ans = set(ans) + return ans def getRow(self, column, value): sa = self.annotationTable @@ -277,15 +291,18 @@ def getGroupedIDs(self, assays): Get group to IDs mapping :param assays: list of or single assay the IDs should be from. Can be file_type or 'RNA'/'DNA' """ - assays = [assays] if isinstance(assays, str) else assays + if isinstance(assays, str): + assays = [assays] groupedIDs = defaultdict(list) for assay in assays: if "RNA" in assay: - groupedIDs.update(self.rnaIDs) + utils.deep_merge_dict(groupedIDs, self.rnaIDs, inplace=True) elif "DNA" in assay: - groupedIDs.update(self.dnaIDs) + utils.deep_merge_dict(groupedIDs, self.dnaIDs, inplace=True) elif "GENE_COUNT" in assay: - groupedIDs.update(self.extGeneCountIDs) + utils.deep_merge_dict(groupedIDs, self.extGeneCountIDs, inplace=True) + elif "SPLICE_COUNT" in assay: + utils.deep_merge_dict(groupedIDs, self.extSpliceCountIDs, inplace=True) else: raise ValueError(f"'{assay}' is not a valid assay name") return groupedIDs diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py index 385c226d..cf7ad7ab 100644 --- a/drop/config/submodules/AberrantExpression.py +++ b/drop/config/submodules/AberrantExpression.py @@ -25,7 +25,7 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD please fix to only have either external count or BAM processing\n") # check number of IDs per group - all_ids = {g: self.rnaIDs[g] + self.extRnaIDs[g] for g in self.groups} + all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "GENE_COUNTS"]) self.checkSubset(all_ids) def setDefaultKeys(self, dict_): @@ -47,6 +47,11 @@ def getCountFiles(self, annotation, group): :param group: DROP group name from wildcard :return: list of files """ + + # if sample annotation table does not contain GENE_COUNTS_FILE column. return no external counts + if("GENE_COUNTS_FILE" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS): + return [] + bam_IDs = self.sampleAnnotation.getIDsByGroup(group, assay="RNA") file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds" count_files = expand(str(file_stump), sampleID=bam_IDs) diff --git a/drop/config/submodules/AberrantSplicing.py b/drop/config/submodules/AberrantSplicing.py index 7a59fb61..228ba775 100644 --- a/drop/config/submodules/AberrantSplicing.py +++ b/drop/config/submodules/AberrantSplicing.py @@ -1,3 +1,6 @@ +import numpy as np +import pandas as pd + from snakemake.io import expand from drop import utils @@ -16,8 +19,16 @@ def __init__(self, config, sampleAnnotation, processedDataDir, processedResultsD # if self.run is false return without doing any config/sa checks for completeness if not self.run: return - self.rnaIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA") - self.checkSubset(self.rnaIDs) + + self.rnaIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="RNA") + self.rnaExIDs = self.sampleAnnotation.subsetGroups(self.groups, assay="SPLICE_COUNT") + for g in self.groups: + if len(set(self.rnaIDs[g]) & set(self.rnaExIDs[g])) > 0: + raise ValueError(f"{set(self.rnaIDs[g]) & set(self.extRnaIDs[g])} has both BAM and external count file \ + please fix in sample annotation table to only have either external count or BAM processing\n") + + all_ids = self.sampleAnnotation.subsetGroups(self.groups, assay=["RNA", "SPLICE_COUNT"]) + self.checkSubset(all_ids) def setDefaultKeys(self, dict_): super().setDefaultKeys(dict_) @@ -44,7 +55,7 @@ def getSplitCountFiles(self, dataset): :return: list of files """ ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA") - file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \ + file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-local-{dataset}" / \ "sample_tmp" / "splitCounts" done_files = str(file_stump / "sample_{sample_id}.done") return expand(done_files, sample_id=ids) @@ -56,7 +67,30 @@ def getNonSplitCountFiles(self, dataset): :return: list of files """ ids = self.sampleAnnotation.getIDsByGroup(dataset, assay="RNA") - file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \ + file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-local-{dataset}" / \ "sample_tmp" / "nonSplitCounts" done_files = str(file_stump / "sample_{sample_id}.done") return expand(done_files, sample_id=ids) + + + def getExternalCounts(self, group: str, fileType: str = "k_j_counts"): + """ + Get externally provided splice count data dir based on the given group. + If a file type is given the corresponding file within the folder is returned. + :param group: DROP group name from wildcard + :param fileType: name of the file without extension which is to be returned + :return: list of directories or files + """ + + # if sample annotation table does not contain SPLICE_COUNTS_DIR column. return no external counts + if("SPLICE_COUNTS_DIR" not in self.sampleAnnotation.SAMPLE_ANNOTATION_COLUMNS): + return [] + + ids = self.sampleAnnotation.getIDsByGroup(group, assay="SPLICE_COUNT") + extCountFiles = self.sampleAnnotation.getImportCountFiles(annotation=None, group=group, + file_type="SPLICE_COUNTS_DIR", asSet=False) + if fileType is not None: + extCountFiles = np.asarray(extCountFiles)[pd.isna(extCountFiles) == False].tolist() + extCountFiles = [x + "/" + fileType + ".tsv.gz" for x in extCountFiles] + return extCountFiles + diff --git a/drop/config/submodules/MonoallelicExpression.py b/drop/config/submodules/MonoallelicExpression.py index a8b45406..2f3173cd 100644 --- a/drop/config/submodules/MonoallelicExpression.py +++ b/drop/config/submodules/MonoallelicExpression.py @@ -51,7 +51,7 @@ def setDefaultKeys(self, dict_): return dict_ def checkConfigSampleannotation(self): - subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups, exact_match=False) + subset = self.sampleAnnotation.subsetSampleAnnotation("DROP_GROUP", self.groups) if len(self.genomeFiles.keys()) > 1: # more than 1 value in config defined genome dictionary if "GENOME" not in subset.columns.values: # GENOME column not defined @@ -149,16 +149,16 @@ def setGenomeDict(self, genomeFiles): if len(genomeFiles) == 1: # globally defined in the config globalGenome = list(genomeFiles.values())[0] - # subset SA by the drop group (not exact match) and skip the filtering by SA-GENOME column + # subset SA by the drop group and skip the filtering by SA-GENOME column genomeDict = self.sampleAnnotation.getGenomes( globalGenome, self.groups, file_type="RNA_ID", column="DROP_GROUP", group_key="DROP_GROUP", - exact_match=False, skip=True + skip=True ) else: - # subset SA by the drop group (not exact match) and filter by SA-GENOME column. Must exactly match config key + # subset SA by the drop group and filter by SA-GENOME column. Must exactly match config key for gf in genomeFiles.keys(): genomeDict.update( self.sampleAnnotation.getGenomes( @@ -166,7 +166,7 @@ def setGenomeDict(self, genomeFiles): self.groups, file_type="RNA_ID", column="GENOME", group_key="DROP_GROUP", - exact_match=False, skip=False + skip=False ) ) diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml index f79852d0..ea0b14b7 100755 --- a/drop/demo/config_relative.yaml +++ b/drop/demo/config_relative.yaml @@ -16,13 +16,14 @@ exportCounts: - v29 excludeGroups: - mae - - import_exp + - outrider_external + - fraser_external aberrantExpression: run: true groups: - outrider - - import_exp + - outrider_external fpkmCutoff: 1 implementation: autoencoder padjCutoff: 1 @@ -36,6 +37,7 @@ aberrantSplicing: run: true groups: - fraser + - fraser_external recount: true longRead: false keepNonStandardChrs: true @@ -53,8 +55,8 @@ mae: groups: - mae gatkIgnoreHeaderCheck: true - padjCutoff: .05 - allelicRatioCutoff: 0.8 + padjCutoff: .5 + allelicRatioCutoff: 0.7 addAF: false maxAF: .001 maxVarFreqCohort: 1 diff --git a/drop/demo/sample_annotation_relative.tsv b/drop/demo/sample_annotation_relative.tsv index 0da07a27..9b720f9b 100755 --- a/drop/demo/sample_annotation_relative.tsv +++ b/drop/demo/sample_annotation_relative.tsv @@ -1,23 +1,25 @@ -RNA_ID RNA_BAM_FILE DNA_VCF_FILE DNA_ID DROP_GROUP PAIRED_END COUNT_MODE COUNT_OVERLAPS STRAND HPO_TERMS GENE_COUNTS_FILE GENE_ANNOTATION GENOME +RNA_ID RNA_BAM_FILE DNA_VCF_FILE DNA_ID DROP_GROUP PAIRED_END COUNT_MODE COUNT_OVERLAPS STRAND HPO_TERMS GENE_COUNTS_FILE GENE_ANNOTATION GENOME SPLICE_COUNTS_DIR HG00096.1.M_111124_6 Data/rna_bam/HG00096.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 outrider,mae TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 HG00103.4.M_120208_3 Data/rna_bam/HG00103.4.M_120208_3_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 outrider,mae TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 -HG00106.4.M_120208_5 Data/rna_bam/HG00106.4.M_120208_5_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 -HG00111.2.M_111215_4 Data/rna_bam/HG00111.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 -HG00116.2.M_120131_1 Data/rna_bam/HG00116.2.M_120131_1_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 -HG00126.1.M_111124_8 Data/rna_bam/HG00126.1.M_111124_8_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 -HG00132.2.M_111215_4 Data/rna_bam/HG00132.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 -HG00149.1.M_111124_6 Data/rna_bam/HG00149.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 -HG00150.4.M_120208_7 Data/rna_bam/HG00150.4.M_120208_7_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 -HG00176.4.M_120208_2 Data/rna_bam/HG00176.4.M_120208_2_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 outrider,import_exp TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 -HG00096.1.M_111124_6_trunc Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 fraser TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 -HG00103.4.M_120208_3_trunc Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 fraser TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 -HG00106.4.M_120208_5_trunc Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 fraser TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 -HG00111.2.M_111215_4_trunc Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 fraser TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 -HG00116.2.M_120131_1_trunc Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 fraser TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 +HG00106.4.M_120208_5 Data/rna_bam/HG00106.4.M_120208_5_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 +HG00111.2.M_111215_4 Data/rna_bam/HG00111.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 +HG00116.2.M_120131_1 Data/rna_bam/HG00116.2.M_120131_1_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 +HG00126.1.M_111124_8 Data/rna_bam/HG00126.1.M_111124_8_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 +HG00132.2.M_111215_4 Data/rna_bam/HG00132.2.M_111215_4_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 +HG00149.1.M_111124_6 Data/rna_bam/HG00149.1.M_111124_6_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 +HG00150.4.M_120208_7 Data/rna_bam/HG00150.4.M_120208_7_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 fraser_external,outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 +HG00176.4.M_120208_2 Data/rna_bam/HG00176.4.M_120208_2_chr21.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 fraser_external,outrider,outrider_external TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 +HG00096.1.M_111124_6_trunc Data/rna_bam/HG00096.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00096 fraser,fraser_external TRUE IntersectionStrict TRUE no HP:0009802,HP:0010896 +HG00103.4.M_120208_3_trunc Data/rna_bam/HG00103.4.M_120208_3_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00103 fraser,fraser_external TRUE IntersectionStrict TRUE no HP:0004582,HP:0031959 +HG00106.4.M_120208_5_trunc Data/rna_bam/HG00106.4.M_120208_5_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00106 fraser,fraser_external TRUE IntersectionStrict TRUE no HP:0002895,HP:0006731 +HG00111.2.M_111215_4_trunc Data/rna_bam/HG00111.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00111 fraser,fraser_external TRUE IntersectionStrict TRUE no HP:0100491,HP:0100871 +HG00116.2.M_120131_1_trunc Data/rna_bam/HG00116.2.M_120131_1_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00116 fraser,fraser_external TRUE IntersectionStrict TRUE no HP:0030613,HP:0012767 HG00126.1.M_111124_8_trunc Data/rna_bam/HG00126.1.M_111124_8_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00126 fraser TRUE IntersectionStrict TRUE no HP:0000290,HP:0000293 HG00132.2.M_111215_4_trunc Data/rna_bam/HG00132.2.M_111215_4_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00132 fraser TRUE IntersectionStrict TRUE no HP:0006489,HP:0006490 HG00149.1.M_111124_6_trunc Data/rna_bam/HG00149.1.M_111124_6_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00149 fraser TRUE IntersectionStrict TRUE no HP:0000014,HP:0000020,HP:0032663 HG00150.4.M_120208_7_trunc Data/rna_bam/HG00150.4.M_120208_7_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00150 fraser TRUE IntersectionStrict TRUE no HP:0030809,HP:0006144 HG00176.4.M_120208_2_trunc Data/rna_bam/HG00176.4.M_120208_2_chr21.bam_trunc.bam Data/dna_vcf/demo_chr21.vcf.gz HG00176 fraser TRUE IntersectionStrict TRUE no HP:0005215,HP:0010234 -HG00178.4.M_120208_8 import_exp Data/external_geneCounts.tsv.gz v29 -HG00181.4.M_120208_4 import_exp Data/external_geneCounts.tsv.gz v29 +HG00178.4.M_120208_8 outrider_external Data/external_count_data/geneCounts.tsv.gz v29 +HG00181.4.M_120208_4 fraser_external,outrider_external Data/external_count_data/geneCounts.tsv.gz v29 Data/external_count_data +HG00191.3.M_120208_3 fraser_external Data/external_count_data +HG00201.1.M_120208_6 fraser_external Data/external_count_data diff --git a/drop/download_data.sh b/drop/download_data.sh index ba2ac064..3e9ff804 100644 --- a/drop/download_data.sh +++ b/drop/download_data.sh @@ -2,12 +2,12 @@ set -e # get data -resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz" +resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource_splice_merge.tar.gz" tmpdir="$(dirname "$(mktemp)")" -wget -nc -P $tmpdir $resource_url +wget -c -O $tmpdir/resource_exsplice.tar.gz $resource_url mkdir -p Data if [ -z "$(ls Data)" ]; then - tar -zxvf "$tmpdir/resource.tar.gz" -C . + tar -zxvf "$tmpdir/resource_exsplice.tar.gz" -C . rm -rf Data mv resource Data else diff --git a/drop/installRPackages.R b/drop/installRPackages.R index 33455546..cab1e63f 100644 --- a/drop/installRPackages.R +++ b/drop/installRPackages.R @@ -1,3 +1,5 @@ +options(timeout = 600) + options(repos=structure(c(CRAN="https://cloud.r-project.org")), warn = -1) if (!requireNamespace('BiocManager', quietly = TRUE)) { diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R index c7a8666c..240e1682 100644 --- a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R @@ -32,19 +32,27 @@ suppressPackageStartupMessages({ }) ods <- readRDS(snakemake@input$ods) + +has_external <- any(as.logical(colData(ods)$isExternal)) +cnts_mtx_local <- counts(ods, normalized = F)[,!as.logical(ods@colData$isExternal)] cnts_mtx <- counts(ods, normalized = F) -#' Number of samples: `r ncol(ods)` +#' ## Number of samples: +#' Local: `r sum(!as.logical(ods@colData$isExternal))` +#' External: `r sum(as.logical(ods@colData$isExternal))` #' #' # Count Quality Control #' -#' Compare number of records vs. read counts +#' Compare number of records vs. read counts +#' The `Obtained Read Count Ratio` plot does not include external counts +#' because there are no raw reads to be counted. #' bam_coverage <- fread(snakemake@input$bam_cov) bam_coverage[, sampleID := as.character(sampleID)] coverage_dt <- merge(bam_coverage, data.table(sampleID = colnames(ods), - read_count = colSums(cnts_mtx)), + read_count = colSums(cnts_mtx), + isExternal = ods@colData$isExternal), by = "sampleID", sort = FALSE) # read count setorder(coverage_dt, read_count) @@ -60,55 +68,81 @@ coverage_dt[, size_factors := sizeFactors(ods)] setorder(coverage_dt, size_factors) coverage_dt[, sf_rank := 1:.N] -p_depth <- ggplot(coverage_dt, aes(count_rank, read_count)) + - geom_point() + +p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count, col= isExternal )) + + geom_point(size = 3,show.legend = has_external) + theme_cowplot() + background_grid() + labs(title = "Obtained Read Counts", x="Sample Rank", y = "Reads Counted") + - ylim(c(0,NA)) + ylim(c(0,NA)) + + scale_color_brewer(palette="Dark2") -p_frac <- ggplot(coverage_dt, aes(frac_rank, counted_frac)) + - geom_point() + +p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac, col = isExternal)) + + geom_point(size = 3,show.legend = has_external) + theme_cowplot() + background_grid() + labs(title = "Obtained Read Count Ratio", x = "Sample Rank", y = "Percent Reads Counted") + - ylim(c(0,NA)) + ylim(c(0,NA)) + + scale_color_brewer(palette="Dark2") #+ QC, fig.height=6, fig.width=12 -plot_grid(p_depth, p_frac) +plot_grid(p_depth, p_frac) -p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors)) + - geom_point() + +p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors, col = isExternal)) + + geom_point(size = 3,show.legend = has_external) + ylim(c(0,NA)) + theme_cowplot() + background_grid() + - labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors') + labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors') + + scale_color_brewer(palette="Dark2") -p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors)) + - geom_point() + +p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors, col = isExternal)) + + geom_point(size = 3,show.legend = has_external) + ylim(c(0,NA)) + theme_cowplot() + background_grid() + labs(title = 'Size Factors vs. Read Counts', - x = 'Read Counts', y = 'Size Factors') + x = 'Read Counts', y = 'Size Factors') + + scale_color_brewer(palette="Dark2") #+ sizeFactors, fig.height=6, fig.width=12 plot_grid(p_sf, p_sf_cov) #' # Filtering +#' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts +#' **all**: A pre-filtered summary of counts using only the merged local (from BAM) and external counts +#' **passed FPKM**: Passes the user defined FPKM cutoff in at least 5% of genes +#' **min 1 read**: minimum of 1 read expressed in 5% of genes +#' **min 10 reads**: minimum of 10 reads expressed in 5% of genes + quant <- .95 -filter_mtx <- list( - all = cnts_mtx, - passed_FPKM = cnts_mtx[rowData(ods)$passedFilter,], - min_1 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ], - min_10 = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ] -) -filter_dt <- lapply(names(filter_mtx), function(filter_name) { - mtx <- filter_mtx[[filter_name]] - data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name) -}) %>% rbindlist -filter_dt[, filter := factor(filter, levels = c('all', 'passed_FPKM', 'min_1', 'min_10'))] + +if(has_external){ + filter_mtx <- list( + local = cnts_mtx_local, + all = cnts_mtx, + `passed FPKM` = cnts_mtx[rowData(ods)$passedFilter,], + `min 1 read` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ], + `min 10 reads` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ] + ) + filter_dt <- lapply(names(filter_mtx), function(filter_name) { + mtx <- filter_mtx[[filter_name]] + data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name) + }) %>% rbindlist + filter_dt[, filter := factor(filter, levels = c('local', 'all', 'passed FPKM', 'min 1 read', 'min 10 reads'))] +} else{ + filter_mtx <- list( + all = cnts_mtx, + `passed FPKM` = cnts_mtx[rowData(ods)$passedFilter,], + `min 1 read` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 1, ], + `min 10 reads` = cnts_mtx[rowQuantiles(cnts_mtx, probs = quant) > 10, ] + ) + filter_dt <- lapply(names(filter_mtx), function(filter_name) { + mtx <- filter_mtx[[filter_name]] + data.table(gene_ID = rownames(mtx), median_counts = rowMeans(mtx), filter = filter_name) + }) %>% rbindlist + filter_dt[, filter := factor(filter, levels = c('all', 'passed FPKM', 'min 1 read', 'min 10 reads'))] +} binwidth <- .2 p_hist <- ggplot(filter_dt, aes(x = median_counts, fill = filter)) + @@ -135,24 +169,24 @@ p_dens <- ggplot(filter_dt, aes(x = median_counts, col = filter)) + #+ meanCounts, fig.height=6, fig.width=12 plot_grid(p_hist, p_dens) +#' ### Expressed Genes +exp_genes_cols <- c(Rank = "expressedGenesRank",`Expressed\ngenes` = "expressedGenes", + `Union of\nexpressed genes` = "unionExpressedGenes", + `Intersection of\nexpressed genes` = "intersectionExpressedGenes", + `Genes passed\nfiltering` = "passedFilterGenes", `Is External` = "isExternal") + +expressed_genes <- as.data.table(colData(ods)[,exp_genes_cols]) +colnames(expressed_genes) <- names(exp_genes_cols) + #+ expressedGenes, fig.height=6, fig.width=8 -plotExpressedGenes(ods) + +plotExpressedGenes(ods) + theme_cowplot() + - background_grid(major = "y") - -expressed_genes <- as.data.table(colData(ods)) -expressed_genes <- expressed_genes[, .(expressedGenes, unionExpressedGenes, - intersectionExpressedGenes, passedFilterGenes, - expressedGenesRank)] - -#+echo=F -rank_1 <- expressed_genes[expressedGenesRank == 1] -#' **Rank 1:** -#' `r as.character(rank_1$expressedGenes)` expressed genes -#+echo=F -rank_n <- expressed_genes[expressedGenesRank == .N] -#' **Rank `r rank_n$expressedGenesRank`:** -#' `r as.character(rank_n$expressedGenes)` expressed genes -#' `r as.character(rank_n$unionExpressedGenes)` expressed genes (union) -#' `r as.character(rank_n$intersectionExpressedGenes)` expressed genes (intersection) -#' `r as.character(rank_n$passedFilterGenes)` genes passed the filter + background_grid(major = "y") + + geom_point(data =melt(expressed_genes,id.vars = c("Rank","Is External")), + aes(x = Rank, y = value, col = variable, shape = `Is External`),show.legend = has_external) + +if(has_external){ + DT::datatable(expressed_genes[order(Rank)],rownames = F) +} else{ + DT::datatable(expressed_genes[order(Rank),-"Is External"],rownames = F) +} diff --git a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R index c1a58e70..82405251 100644 --- a/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R +++ b/drop/modules/aberrant-expression-pipeline/Counting/filterCounts.R @@ -36,5 +36,13 @@ ods <- filterExpression(ods, gtfFile=txdb, filter=FALSE, # add column for genes with at least 1 gene rowData(ods)$counted1sample = rowSums(assay(ods)) > 0 +has_external <- !(all(ods@colData$GENE_COUNTS_FILE == "") || is.null(ods@colData$GENE_COUNTS_FILE)) +if(has_external){ + ods@colData$isExternal <- as.factor(ods@colData$GENE_COUNTS_FILE != "") +}else{ + ods@colData$isExternal <- as.factor(FALSE) +} + + # Save the ods before filtering to preserve the original number of genes saveRDS(ods, snakemake@output$ods) diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R index f18ad3dc..36fd13de 100644 --- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R @@ -52,7 +52,7 @@ plotEncDimSearch(ods) + labs(title = dataset_title) + theme_cowplot() + background_grid() + - scale_color_brewer(palette = "Set1") + scale_color_brewer(palette="Dark2") #' ### Aberrantly expressed genes per sample @@ -63,18 +63,18 @@ plotAberrantPerSample(ods, main = dataset_title, #' ### Batch correction #+ countCorHeatmap, fig.height=8, fig.width=8 -plotCountCorHeatmap(ods, normalized = FALSE, +plotCountCorHeatmap(ods, normalized = FALSE, colGroups = "isExternal", colColSet = "Dark2", main = paste0('Raw Counts (', dataset_title, ')')) -plotCountCorHeatmap(ods, normalized = TRUE, +plotCountCorHeatmap(ods, normalized = TRUE, ,colGroups = "isExternal", colColSet = "Dark2", main = paste0('Normalized Counts (', dataset_title, ')')) #' ### Expression by gene per sample -#+ geneSampleHeatmap, fig.height=12, fig.width=6 -plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50, +#+ geneSampleHeatmap, fig.height=12, fig.width=8 +plotCountGeneSampleHeatmap(ods, normalized = FALSE, nGenes = 50, colGroups = "isExternal", colColSet = "Dark2", main = paste0('Raw Counts (', dataset_title, ')'), bcvQuantile = .95, show_names = 'row') -plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50, +plotCountGeneSampleHeatmap(ods, normalized = TRUE, nGenes = 50, colGroups = "isExternal", colColSet = "Dark2", main = paste0('Normalized Counts (',dataset_title,')'), bcvQuantile = .95, show_names = 'row') @@ -110,13 +110,14 @@ ggplot(bcv_dt, aes(when, BCV)) + #' ## Results res <- fread(snakemake@input$results) -#' Samples with at least one outlier gene: `r res[, uniqueN(sampleID)]` -#' +#' Samples with at least one outlier gene: `r res[, uniqueN(sampleID)]` + #' ### Aberrant samples +#' An aberrant sample is one that has more than 0.1% of the total genes called as outliers. if (nrow(res) > 0) { - ab_table <- res[AberrantBySample > nrow(ods)/1000, .N, by = .(sampleID)] %>% unique + ab_table <- res[AberrantBySample > nrow(ods)/1000, .("Outlier genes" = .N), by = .(sampleID)] %>% unique if (nrow(ab_table) > 0) { - setorder(ab_table, N) + setorder(ab_table, "Outlier genes") DT::datatable(ab_table) } else { print("no aberrant samples") @@ -134,8 +135,8 @@ fwrite(res, file, sep = '\t', quote = F) #+ echo=FALSE, results='asis' cat(paste0("Download OUTRIDER results table")) -res[, pValue := format(pValue, scientific = T, digits = 2)] -res[, padjust := format(padjust, scientific = T, digits = 2)] +res[, pValue := format(pValue, scientific = T, digits = 3)] +res[, padjust := format(padjust, scientific = T, digits = 3)] DT::datatable( head(res, 1000), diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R index b0da16ff..c76b252a 100644 --- a/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R +++ b/drop/modules/aberrant-expression-pipeline/OUTRIDER/results.R @@ -31,7 +31,8 @@ suppressPackageStartupMessages({ }) ods <- readRDS(snakemake@input$ods) -res <- results(ods, all = TRUE) +res <- results(ods, padjCutoff = snakemake@params$padjCutoff, + zScoreCutoff = snakemake@params$zScoreCutoff, all = TRUE) # Add fold change res[, foldChange := round(2^l2fc, 2)] diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R index 72e17db9..499e6a9a 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/00_define_datasets_from_anno.R @@ -38,9 +38,10 @@ mapping <- fread(fileMapFile) subset_ids <- snakemake@params$ids annoSub <- anno[RNA_ID %in% subset_ids] -colData <- merge( - annoSub[,.(sampleID = RNA_ID, STRAND, PAIRED_END)], +setnames(annoSub, "RNA_ID", "sampleID") +colData <- merge(annoSub, mapping[FILE_TYPE == "RNA_BAM_FILE", .(sampleID=ID, bamFile=FILE_PATH)]) +setcolorder(colData, unique(c("sampleID", "STRAND", "PAIRED_END", "bamFile"), colnames(annoSub))) #' #' ## Dataset: `r name` diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R index c54948ee..41e2ae5f 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_0_countRNA_init.R @@ -6,15 +6,15 @@ #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "01_0_init.Rds")`' #' params: #' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: #' - colData: '`sm cfg.getProcessedDataDir() + #' "/aberrant_splicing/annotations/{dataset}.tsv"`' #' output: #' - fdsobj: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/fds-object.RDS"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/fds-object.RDS"`' #' - done_fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done" `' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/fds.done" `' #' type: script #'--- @@ -31,7 +31,7 @@ col_data <- fread(colDataFile) fds <- FraserDataSet(colData = col_data, workingDir = workingDir, - name = paste0("raw-", dataset)) + name = paste0("raw-local-", dataset)) # Add paired end and strand specificity to the fds pairedEnd(fds) <- colData(fds)$PAIRED_END diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R index 5da348af..36eeb798 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R @@ -6,13 +6,13 @@ #' - snakemake: '`sm str(tmp_dir / "AS" / "{dataset}" / "splitReads" / "{sample_id}.Rds")`' #' params: #' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' -#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' +#' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' #' input: #' - done_fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/fds.done"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/fds.done"`' #' output: #' - done_sample_splitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}" +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}" #' +"/sample_tmp/splitCounts/sample_{sample_id}.done"`' #' threads: 3 #' type: script @@ -28,7 +28,7 @@ params <- snakemake@config$aberrantSplicing genomeAssembly <- snakemake@config$genomeAssembly # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) # Get sample id from wildcard sample_id <- snakemake@wildcards[["sample_id"]] diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R index 1ea2a86e..4fb08ce7 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_2_countRNA_splitReads_merge.R @@ -12,13 +12,13 @@ #' - sample_counts: '`sm lambda w: cfg.AS.getSplitCountFiles(w.dataset)`' #' output: #' - countsJ: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/rawCountsJ.h5"`' #' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_splitCounts.rds"`' #' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_NonSplitCounts.rds"`' #' - spliceSites: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`' #' type: script #'--- @@ -36,11 +36,11 @@ register(MulticoreParam(snakemake@threads)) setAutoBPPARAM(MulticoreParam(snakemake@threads)) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) # If samples are recounted, remove the merged ones splitCountsDir <- file.path(workingDir, "savedObjects", - paste0("raw-", dataset), 'splitCounts') + paste0("raw-local-", dataset), 'splitCounts') if(params$recount == TRUE & dir.exists(splitCountsDir)){ unlink(splitCountsDir, recursive = TRUE) } diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R index c5d9e6a7..ce983be5 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_3_countRNA_nonSplitReads_samplewise.R @@ -9,10 +9,10 @@ #' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: #' - spliceSites: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`' #' output: #' - done_sample_nonSplitCounts : '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/sample_tmp/nonSplitCounts/sample_{sample_id}.done"`' #' threads: 3 #' type: script #'--- @@ -26,7 +26,7 @@ workingDir <- snakemake@params$workingDir params <- snakemake@config$aberrantSplicing # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) # Get sample id from wildcard sample_id <- snakemake@wildcards[["sample_id"]] diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R index 83c48939..85c330b6 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_4_countRNA_nonSplitReads_merge.R @@ -11,10 +11,12 @@ #' input: #' - sample_counts: '`sm lambda w: cfg.AS.getNonSplitCountFiles(w.dataset)`' #' - gRangesNonSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_NonSplitCounts.rds"`' +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_NonSplitCounts.rds"`' #' output: -#' - countsSS: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`' +### - countsSS: '`sm cfg.getProcessedDataDir() + +### "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/rawCountsSS.h5"`' +#' - done: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/merge_theta.done"`' #' type: script #'--- @@ -30,14 +32,14 @@ register(MulticoreParam(snakemake@threads)) setAutoBPPARAM(MulticoreParam(snakemake@threads)) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) # Read splice site coordinates from RDS splitCounts_gRanges <- readRDS(snakemake@input$gRangesNonSplitCounts) # If samples are recounted, remove the merged ones nonSplitCountsDir <- file.path(workingDir, "savedObjects", - paste0("raw-", dataset), 'nonSplitCounts') + paste0("raw-local-", dataset), 'nonSplitCounts') if(params$recount == TRUE & dir.exists(nonSplitCountsDir)){ unlink(nonSplitCountsDir, recursive = TRUE) } @@ -50,3 +52,5 @@ nonSplitCounts <- getNonSplitReadCountsForAllSamples(fds=fds, longRead=params$longRead) message(date(), ":", dataset, " nonSplit counts done") + +file.create(snakemake@output$done) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R index da3d38b6..7724b478 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/01_5_countRNA_collect.R @@ -8,17 +8,15 @@ #' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' #' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets"`' #' input: -#' - countsJ: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsJ.h5"`' -#' - countsSS: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/rawCountsSS.h5"`' -#' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/gRanges_splitCounts.rds"`' -#' - spliceSites: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/cache/raw-{dataset}/spliceSites_splitCounts.rds"`' +#' - countsSSdone: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/merge_theta.done"`' +#' - gRangesSplitCounts: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/gRanges_splitCounts.rds"`' +#' - spliceSites: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/cache/raw-local-{dataset}/spliceSites_splitCounts.rds"`' #' output: #' - counting_done: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/counting.done" `' #' type: script #'--- @@ -27,15 +25,15 @@ source(snakemake@params$setup, echo=FALSE) dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir - +saveDir <- dirname(snakemake@input$countsSSdone) # Read FRASER object -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) splitCounts_gRanges <- readRDS(snakemake@input$gRangesSplitCounts) spliceSiteCoords <- readRDS(snakemake@input$spliceSites) # Get splitReads and nonSplitRead counts in order to store them in FRASER object -splitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsJ, "rawCountsJ") +splitCounts_h5 <- HDF5Array::HDF5Array(file.path(saveDir, "rawCountsJ.h5"), "rawCountsJ") splitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = splitCounts_gRanges, @@ -43,7 +41,7 @@ splitCounts_se <- SummarizedExperiment( ) -nonSplitCounts_h5 <- HDF5Array::HDF5Array(snakemake@input$countsSS, "rawCountsSS") +nonSplitCounts_h5 <- HDF5Array::HDF5Array(file.path(saveDir, "rawCountsSS.h5"), "rawCountsSS") nonSplitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = spliceSiteCoords, diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R index 24e5cb9b..3f35f6aa 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/02_psi_value_calculation_FraseR.R @@ -10,10 +10,10 @@ #' threads: 30 #' input: #' - counting_done: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/counting.done" `' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/counting.done" `' #' output: #' - theta: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`' +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`' #' type: script #'--- @@ -28,7 +28,7 @@ register(MulticoreParam(snakemake@threads)) # Limit number of threads for DelayedArray operations setAutoBPPARAM(MulticoreParam(snakemake@threads)) -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) # Calculating PSI values fds <- calculatePSIValues(fds) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R index e42522fc..a9355b6b 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/03_filter_expression_FraseR.R @@ -7,14 +7,16 @@ #' params: #' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`' #' - workingDir: '`sm cfg.getProcessedDataDir() + "/aberrant_splicing/datasets/"`' +#' - exCountIDs: '`sm lambda w: sa.getIDsByGroup(w.dataset, assay="SPLICE_COUNT")`' #' input: -#' - theta: '`sm cfg.getProcessedDataDir()+ -#' "/aberrant_splicing/datasets/savedObjects/raw-{dataset}/theta.h5"`' +#' - theta: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/raw-local-{dataset}/theta.h5"`' +#' - exCounts: '`sm lambda w: cfg.AS.getExternalCounts(w.dataset, "k_j_counts")`' #' output: #' - fds: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`' -#' - done: '`sm cfg.getProcessedDataDir() + -#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' +#' "/aberrant_splicing/datasets/savedObjects/{dataset}/fds-object.RDS"`' +#' - done: '`sm cfg.getProcessedDataDir() + +#' "/aberrant_splicing/datasets/savedObjects/{dataset}/filter.done" `' #' threads: 3 #' type: script #'--- @@ -27,23 +29,56 @@ opts_chunk$set(fig.width=12, fig.height=8) # input dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir -params <- snakemake@config$aberrantSplicing +params <- snakemake@config$aberrantSplicing +exCountIDs <- snakemake@params$exCountIDs +exCountFiles <- snakemake@input$exCounts +sample_anno_file <- snakemake@config$sampleAnnotation +minExpressionInOneSample <- params$minExpressionInOneSample +minDeltaPsi <- params$minDeltaPsi -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) register(MulticoreParam(snakemake@threads)) # Limit number of threads for DelayedArray operations setAutoBPPARAM(MulticoreParam(snakemake@threads)) -# Apply filter -minExpressionInOneSample <- params$minExpressionInOneSample -minDeltaPsi <- params$minDeltaPsi +# Add external data if provided by dataset +if(length(exCountIDs) > 0){ + message("create new merged fraser object") + fds <- saveFraserDataSet(fds,dir = workingDir, name=paste0("raw-", dataset)) + for(resource in unique(exCountFiles)){ + exSampleIDs <- exCountIDs[exCountFiles == resource] + exAnno <- fread(sample_anno_file, key="RNA_ID")[J(exSampleIDs)] + setnames(exAnno, "RNA_ID", "sampleID") + + ctsNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta") + ctsFiles <- paste0(dirname(resource), "/", ctsNames, "_counts.tsv.gz") + + # Merging external counts restricts the junctions to those that + # are only present in both the counted (fromBam) junctions AND the + # junctions from the external counts. + fds <- mergeExternalData(fds=fds, countFiles=ctsFiles, + sampleIDs=exSampleIDs, annotation=exAnno) + fds@colData$isExternal <- as.factor(!is.na(fds@colData$SPLICE_COUNTS_DIR)) + } +} else { + message("symLink fraser dir") + file.symlink(paste0(workingDir, "savedObjects/","raw-local-", dataset), + paste0(workingDir, "savedObjects/","raw-", dataset)) + + fds@colData$isExternal <- as.factor(FALSE) + workingDir(fds) <- workingDir + name(fds) <- paste0("raw-", dataset) +} + +# filter for expression and write it out to disc. fds <- filterExpressionAndVariability(fds, - minExpressionInOneSample = minExpressionInOneSample, - minDeltaPsi = minDeltaPsi, - filter=FALSE) -devNull <- saveFraserDataSet(fds) + minExpressionInOneSample = minExpressionInOneSample, + minDeltaPsi = minDeltaPsi, + filter=FALSE) + +devNull <- saveFraserDataSet(fds,dir = workingDir) # Keep junctions that pass filter name(fds) <- dataset @@ -55,5 +90,5 @@ if (params$filter == TRUE) { seqlevels(fds) <- seqlevelsInUse(fds) colData(fds)$sampleID <- as.character(colData(fds)$sampleID) -fds <- saveFraserDataSet(fds) +fds <- saveFraserDataSet(fds,dir = workingDir) file.create(snakemake@output$done) diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R index e2c61d86..b5d19e34 100644 --- a/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/Counting/Summary.R @@ -28,22 +28,70 @@ suppressPackageStartupMessages({ dataset <- snakemake@wildcards$dataset workingDir <- snakemake@params$workingDir -fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) +fdsLocal <- loadFraserDataSet(dir=workingDir, name=paste0("raw-local-", dataset)) +fdsMerge <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset)) -#' Number of samples: `r nrow(colData(fds))` +has_external <- !(all(is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)) || is.null(fdsMerge@colData$SPLICE_COUNTS_DIR)) +if(has_external){ + fdsMerge@colData$isExternal <- as.factor(!is.na(fdsMerge@colData$SPLICE_COUNTS_DIR)) +}else{ + fdsMerge@colData$isExternal <- as.factor(FALSE) +} +devNull <- saveFraserDataSet(fdsMerge,dir=workingDir, name=paste0("raw-", dataset)) + + +#' ## Number of samples: +#' Local: `r sum(!as.logical(fdsMerge@colData$isExternal))` +#' External: `r sum(as.logical(fdsMerge@colData$isExternal))` +#' +#' **Using external counts** +#' External counts introduce some complexity into the problem of counting junctions +#' because it is unknown whether or not a junction is not counted (because there are no reads) +#' compared to filtered and not present due to legal/personal sharing reasons. As a result, +#' after merging the local (counted from BAM files) counts and the external counts, only the junctions that are +#' present in both remain. As a result it is likely that the number of junctions will decrease after merging. +#' #' -#' Number of introns (psi5 or psi3): `r length(rowRanges(fds, type = "psi5"))` +#' ### Number of introns (psi5 or psi3) before and after merging: +#' Local: `r length(rowRanges(fdsLocal, type = "psi5"))` +#' Merged: `r length(rowRanges(fdsMerge, type = "psi5"))` #' -#' Number of splice sites (theta): `r length(rowRanges(fds, type = "theta"))` +#' ### Number of splice sites (theta) before and after merging: +#' Local: `r length(rowRanges(fdsLocal, type = "theta"))` +#' Merged: `r length(rowRanges(fdsMerge, type = "theta"))` #' -#' Introns that passed filter -table(mcols(fds, type="j")[,"passed"]) + +#' ### Comparison of local and external counts +if(has_external){ + externalCountIDs <- colData(fdsMerge)[as.logical(colData(fdsMerge)[,"isExternal"]),"sampleID"] + localCountIDs <- colData(fdsMerge)[!as.logical(colData(fdsMerge)[,"isExternal"]),"sampleID"] + + cts <- K(fdsMerge,"psi5") + ctsLocal<- cts[,localCountIDs] + ctsExt<- cts[,externalCountIDs] + + rowMeanLocal <- rowMeans(ctsLocal) + rowMeanExt <- rowMeans(ctsExt) + + dt <- data.table("Mean counts of local samples" = rowMeanLocal, + "Mean counts of external samples" = rowMeanExt) + + ggplot(dt,aes(x = `Mean counts of local samples`, y= `Mean counts of external samples`)) + + geom_hex() + theme_cowplot(font_size = 16) + + theme_bw() + scale_x_log10() + scale_y_log10() + + geom_abline(slope = 1, intercept =0) + + scale_color_brewer(palette="Dark2") +}else{ + print("No external counts, comparison is ommitted") +} #' ## Expression filtering #' Min expression cutoff: `r snakemake@config$aberrantSplicing$minExpressionInOneSample` -plotFilterExpression(fds) + theme_cowplot(font_size = 16) +plotFilterExpression(fdsMerge) + theme_cowplot(font_size = 16) #' ## Variability filtering #' Variability cutoff: `r snakemake@config$aberrantSplicing$minDeltaPsi` -plotFilterVariability(fds) + theme_cowplot(font_size = 16) +plotFilterVariability(fdsMerge) + theme_cowplot(font_size = 16) +#' Introns that passed filter (after merging) +table(mcols(fdsMerge, type="j")[,"passed"]) diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R index a24168a5..91343626 100644 --- a/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R +++ b/drop/modules/aberrant-splicing-pipeline/FRASER/Summary.R @@ -24,13 +24,19 @@ source(snakemake@params$setup, echo=FALSE) suppressPackageStartupMessages({ library(cowplot) + library("RColorBrewer") }) #+ input dataset <- snakemake@wildcards$dataset annotation <- snakemake@wildcards$annotation +padj_cutoff <- snakemake@config$aberrantSplicing$padjCutoff +zScore_cutoff <- snakemake@config$aberrantSplicing$zScoreCutoff +deltaPsi_cutoff <- snakemake@config$aberrantSplicing$deltaPsiCutoff + fds <- loadFraserDataSet(file=snakemake@input$fdsin) +hasExternal <- length(levels(colData(fds)$isExternal) > 1) #' Number of samples: `r nrow(colData(fds))` #' @@ -53,44 +59,50 @@ for(type in psiTypes){ } #' ## Aberrantly spliced genes per sample -plotAberrantPerSample(fds, aggregate=TRUE, main=dataset_title) + +plotAberrantPerSample(fds, padjCutoff = padj_cutoff, zScoreCutoff = zScore_cutoff, deltaPsiCutoff = deltaPsi_cutoff, + aggregate=TRUE, main=dataset_title) + theme_cowplot(font_size = 16) + theme(legend.position = "top") -#' ## Batch Correlation: Samples x samples +#' ## Batch Correlation: samples x samples topN <- 30000 topJ <- 10000 +anno_color_scheme <- brewer.pal(n = 3, name = 'Dark2')[1:2] for(type in psiTypes){ before <- plotCountCorHeatmap( - fds, + object=fds, type = type, logit = TRUE, topN = topN, topJ = topJ, plotType = "sampleCorrelation", normalized = FALSE, - annotation_col = NA, + annotation_col = "isExternal", annotation_row = NA, sampleCluster = NA, + minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi, plotMeanPsi=FALSE, plotCov = FALSE, - annotation_legend = TRUE + annotation_legend = TRUE, + annotation_colors = list(isExternal = c("FALSE" = anno_color_scheme[1],"TRUE" = anno_color_scheme[2])) ) before after <- plotCountCorHeatmap( - fds, + object=fds, type = type, logit = TRUE, topN = topN, topJ = topJ, plotType = "sampleCorrelation", normalized = TRUE, - annotation_col = NA, + annotation_col = "isExternal", annotation_row = NA, sampleCluster = NA, + minDeltaPsi = snakemake@config$aberrantSplicing$minDeltaPsi, plotMeanPsi=FALSE, plotCov = FALSE, - annotation_legend = TRUE + annotation_legend = TRUE, + annotation_colors = list(isExternal = c("FALSE" = anno_color_scheme[1],"TRUE" = anno_color_scheme[2])) ) after } diff --git a/drop/modules/mae-pipeline/MAE/Results.R b/drop/modules/mae-pipeline/MAE/Results.R index d120388d..d11aa255 100644 --- a/drop/modules/mae-pipeline/MAE/Results.R +++ b/drop/modules/mae-pipeline/MAE/Results.R @@ -36,6 +36,7 @@ suppressPackageStartupMessages({ library(GenomicRanges) library(SummarizedExperiment) library(R.utils) + library(dplyr) }) # Read all MAE results files @@ -119,14 +120,16 @@ fwrite(res[MAE_ALT == TRUE & rare == TRUE], snakemake@output$res_signif_rare, # Add columns for plot res[, N := .N, by = ID] -res[MAE == TRUE, N_MAE := .N, by = ID] -res[MAE == TRUE & MAE_ALT == FALSE, N_MAE_REF := .N, by = ID] -res[MAE_ALT == TRUE, N_MAE_ALT := .N, by = ID] -res[MAE == TRUE & MAE_ALT == FALSE & rare == TRUE, N_MAE_REF_RARE := .N, by = ID] -res[MAE_ALT == TRUE & rare == TRUE, N_MAE_ALT_RARE := .N, by = ID] - -rd <- unique(res[,.(ID, N, N_MAE, N_MAE_REF, N_MAE_ALT, N_MAE_REF_RARE, N_MAE_ALT_RARE)]) -melt_dt <- melt(rd, id.vars = 'ID') +plot_res <- res[,.(N = .N, + N_MAE = sum(MAE==T), + N_MAE_REF=sum(MAE==T & MAE_ALT == F), + N_MAE_ALT=sum(MAE_ALT == T), + N_MAE_REF_RARE = sum(MAE ==T & MAE_ALT==F & rare == T), + N_MAE_ALT_RARE = sum(MAE_ALT ==T & rare ==T) + ),by = ID] + + +melt_dt <- melt(plot_res, id.vars = 'ID') melt_dt[variable == 'N', variable := '>10 counts'] melt_dt[variable == 'N_MAE', variable := '+MAE'] melt_dt[variable == 'N_MAE_REF', variable := '+MAE for\nREF'] @@ -136,20 +139,36 @@ melt_dt[variable == 'N_MAE_ALT_RARE', variable := '+MAE for ALT\n& rare'] #' #' ## Cascade plot +#' a cascade plot that shows a progression of added filters +#' - >10 counts: only variants supported by more than 10 counts +#' - +MAE: and shows mono allelic expression +#' - +MAE for REF : the monoallelic expression favors the reference allele +#' - +MAE for ALT : the monoallelic expression favors the alternative allele +#' - rare: +#' - if `add_AF` is set to true in config file must meet minimum AF set by the config value `max_AF` +#' - must meet the inner-cohort frequency `maxVarFreqCohort` cutoff + ggplot(melt_dt, aes(variable, value)) + geom_boxplot() + - scale_y_log10() + theme_bw(base_size = 14) + - labs(y = 'Heterozygous SNVs per patient', x = '') + + scale_y_log10(limits = c(1,NA)) + theme_bw(base_size = 14) + + labs(y = 'Heterozygous SNVs per patient', x = '') + annotation_logticks(sides = "l") #' -#' ## Variant Frequency within Cohort Histogram +#' ## Variant Frequency within Cohort ggplot(unique(res[,cohort_freq,by =.(gene_name, contig, position)]),aes(x = cohort_freq)) + geom_histogram( binwidth = 0.02) + - geom_vline(xintercept = maxCohortFreq, col = "red") + - xlab("Variant frequency in cohort") + ylab("Count") + geom_vline(xintercept = maxCohortFreq, col = "red",linetype="dashed") + theme_bw(base_size = 14) + + xlim(0,NA) + xlab("Variant frequency in cohort") + ylab("Variants") #' Median of each category DT::datatable(melt_dt[, .(median = median(value, na.rm = T)), by = variable]) + +# round numbers +if(nrow(res) > 0){ + res[, pvalue := signif(pvalue, 3)] + res[, padj := signif(padj, 3)] + res[, log2FC := signif(log2FC, 3)] +} #' #' ## MAE Results table DT::datatable( diff --git a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R index 25543292..eba0bd9b 100644 --- a/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R +++ b/drop/modules/mae-pipeline/QC/DNA_RNA_matrix_plot.R @@ -32,8 +32,6 @@ identityCutoff <- .85 ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0.05, center = .025) + theme_bw(base_size = 14) + - labs(x = 'Proportion of matching DNA-RNA variants', y = 'DNA-RNA combinations') + - scale_y_log10() + annotation_logticks(sides = "l") + expand_limits(x=c(0,1)) + geom_vline(xintercept=identityCutoff, linetype='dashed', color = 'firebrick') @@ -42,7 +40,7 @@ ggplot(melt_mat, aes(value)) + geom_histogram(fill = 'cadetblue4', binwidth = 0. #' Number of samples: `r nrow(qc_mat)` #' -#' Number of samples that match with another: `r length(qc_mat[qc_mat > identityCutoff])` +#' Number of samples that match RNA and DNA: `r length(qc_mat[qc_mat > identityCutoff])` #' #' Median of proportion of matching variants in matching samples: `r round(median(qc_mat[qc_mat > identityCutoff]), 2)` #' diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt index 3a896286..65e4152f 100644 --- a/drop/requirementsR.txt +++ b/drop/requirementsR.txt @@ -1,7 +1,7 @@ package version devtools gagneurlab/OUTRIDER 1.6.1 -c-mertes/FRASER 1.2.1 +c-mertes/FRASER 1.6.1 gagneurlab/tMAE 1.0.4 VariantAnnotation rmarkdown diff --git a/drop/template/Scripts/AberrantExpression/Overview.R b/drop/template/Scripts/AberrantExpression/Overview.R index a3c6f167..2b25b548 100644 --- a/drop/template/Scripts/AberrantExpression/Overview.R +++ b/drop/template/Scripts/AberrantExpression/Overview.R @@ -90,10 +90,14 @@ sample <- res[1, sampleID] #' ### Volcano plot #' setting basePlot = FALSE creates an interactive plot #' that allows finding the gene(s) of interest -OUTRIDER::plotVolcano(ods, sample, basePlot = TRUE) +OUTRIDER::plotVolcano(ods, sample, basePlot = TRUE, + zScoreCutoff = snakemake@config$aberrantExpression$zScoreCutoff, + padjCutoff = snakemake@config$aberrantExpression$padjCutoff) #' ### Gene expression plot (normalized counts) -OUTRIDER::plotExpressionRank(ods, gene, basePlot = TRUE) +OUTRIDER::plotExpressionRank(ods, gene, basePlot = TRUE, + zScoreCutoff = snakemake@config$aberrantExpression$zScoreCutoff, + padjCutoff = snakemake@config$aberrantExpression$padjCutoff) #' ### Expected vs observed counts OUTRIDER::plotExpectedVsObservedCounts(ods, gene, basePlot = TRUE) diff --git a/drop/template/Scripts/AberrantSplicing/Overview.R b/drop/template/Scripts/AberrantSplicing/Overview.R index 6bcff37b..f00a7913 100644 --- a/drop/template/Scripts/AberrantSplicing/Overview.R +++ b/drop/template/Scripts/AberrantSplicing/Overview.R @@ -87,7 +87,9 @@ siteIndex <- 4 #' ### Volcano plot # set basePlot to FALSE to create an interactive plot -FRASER::plotVolcano(fds, sample, type = 'psi3', basePlot = TRUE) +FRASER::plotVolcano(fds, sample, type = 'psi3', basePlot = TRUE, + deltaPsiCutoff = snakemake@config$aberrantSplicing$deltaPsiCutoff, + padjCutoff = snakemake@config$aberrantSplicing$padjCutoff) #' ### Expression plot FRASER::plotExpression(fds, type = 'psi3', site = siteIndex, basePlot = TRUE) diff --git a/drop/template/Scripts/MonoallelicExpression/Overview.R b/drop/template/Scripts/MonoallelicExpression/Overview.R index 5720c6a8..29ee9212 100644 --- a/drop/template/Scripts/MonoallelicExpression/Overview.R +++ b/drop/template/Scripts/MonoallelicExpression/Overview.R @@ -9,6 +9,7 @@ #' - datasets: '`sm cfg.MAE.groups`' #' - qc_groups: '`sm cfg.MAE.qcGroups`' #' - htmlDir: '`sm config["htmlOutputPath"] + "/MonoallelicExpression"`' +#' - resultsDir: '`sm cfg.getProcessedResultsDir() + "/mae"`' #' input: #' - functions: '`sm cfg.workDir / "Scripts/html_functions.R"`' #' - allelic_counts: '`sm expand(cfg.getProcessedDataDir() + @@ -24,7 +25,7 @@ #' "dna_rna_qc_matrix.Rds", qc_group=cfg.MAE.qcGroups)`' #' output: #' html_document: -#' code_folding: hide +#' code_folding: show #' code_download: TRUE #'--- @@ -37,6 +38,7 @@ source(snakemake@input$functions) datasets <- sort(snakemake@params$datasets) annotations <- snakemake@params$annotations htmlDir <- snakemake@params$htmlDir +resultsDir <- snakemake@params$resultsDir results_links <- sapply( annotations, function(v) build_link_list( @@ -45,6 +47,13 @@ results_links <- sapply( ) ) +table_links <- sapply( + annotations, function(v) build_link_list( + file_paths = file.path(resultsDir, paste0(datasets, '/MAE_results_', v, '.tsv')), + captions = paste0(datasets) + ) +) + #' #' **Datasets:** `r paste(datasets, collapse = ', ')` #' @@ -55,35 +64,17 @@ results_links <- sapply( #' #' ## Files #' * [Allelic counts](`r file.path(snakemake@config$root, 'processed_data/mae/allelic_counts/')`) -#' * [Results tables of each sample](`r file.path(snakemake@config$root, 'processed_results/mae/samples/')`) -#' * [Aggregated results tables of each group](`r paste('* ', snakemake@input$results_tables, collapse = '\n')`) -#' -#' ## Analyze Individual Results -# Read the first results table -res_sample <- readRDS(snakemake@input$results_obj[[1]]) +#' * [Results data tables of each sample (.Rds)](`r file.path(snakemake@config$root, 'processed_results/mae/samples/')`) -#+echo=F -library(tMAE) - -if(is.null(res_sample$rare)){ - g1 <- plotMA4MAE(res_sample) - g2 <- plotAllelicCounts(res_sample) -} else { - g1 <- plotMA4MAE(res_sample, rare_column = 'rare') - g2 <- plotAllelicCounts(res_sample, rare_column = 'rare') -} - -#' ### MA plot: fold change vs RNA coverage -g1 +#' +#' `r display_text(caption = 'Significant MAE results tables ', links = table_links)` -#' ### Alternative vs Reference plot -g2 #' ## Quality Control: VCF-BAM Matching #+ eval=TRUE, echo=FALSE qc_groups <- sort(snakemake@params$qc_groups) qc_links <- build_link_list( - file_paths = file.path(htmlDir, paste0('QC', qc_groups, '.html')), + file_paths = file.path(htmlDir, paste0('QC/', qc_groups, '.html')), captions = qc_groups ) @@ -95,3 +86,36 @@ qc_matrix_links <- build_link_list( #' `r display_text(caption = 'QC Overview ', links = qc_links)` #' `r display_text(caption = 'DNA-RNA matrix ', links = qc_matrix_links)` #' + +#+ eval=TRUE, echo=TRUE +#' ## Analyze Individual Results +# Read the first results table +res_sample <- readRDS(snakemake@input$results_obj[[1]]) +print(unique(res_sample$ID)) + +#+echo=F +library(tMAE) + +if(is.na(res_sample$rare)){ + g1 <- plotMA4MAE(res_sample, + padjCutoff = snakemake@config$mae$padjCutoff, + allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff ) + g2 <- plotAllelicCounts(res_sample, + padjCutoff = snakemake@config$mae$padjCutoff, + allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff ) +} else { + g1 <- plotMA4MAE(res_sample, rare_column = 'rare', + padjCutoff = snakemake@config$mae$padjCutoff, + allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff ) + g2 <- plotAllelicCounts(res_sample, rare_column = 'rare', + padjCutoff = snakemake@config$mae$padjCutoff, + allelicRatioCutoff = snakemake@config$mae$allelicRatioCutoff ) +} + +#' ### MA plot: fold change vs RNA coverage +#+echo=F +g1 + +#' ### Alternative vs Reference plot +#+echo=F +g2 diff --git a/drop/utils.py b/drop/utils.py index b325f38f..fae88237 100644 --- a/drop/utils.py +++ b/drop/utils.py @@ -1,6 +1,7 @@ from pathlib import Path from snakemake.logging import logger import wbuild +import copy def returnPath(path, str_=True): @@ -65,22 +66,45 @@ def getWBuildSnakefile(str_=True): return returnPath(wb_path / "wBuild.snakefile", str_=str_) -def subsetBy(df, column, values, exact_match=True): +def subsetBy(df, column, values): """ Subset by one or more values of different columns from data frame :param df: data frame :param column: column to subset by :param values: values to subset by - :param exact_match: default True. when False match substrings. Important for subsetting drop groups :return: df subset by values and column """ if values is None: return df - elif isinstance(values, str) and exact_match : - return df[df[column] == values] - elif not isinstance(values,str) and exact_match: - return df[df[column].isin(values)] - elif isinstance(values,str) and not exact_match: - return df[df[column].str.contains(values)] - else: - return df[df[column].str.contains("|".join(values))] + + inner_regex = values + if not isinstance(values, str) : + inner_regex = "(" + "|".join(values) + ")" + + return df[df[column].str.contains("(?:^|,)" + inner_regex + "(?:,|$)", na = False)] + +def deep_merge_dict(dict1: dict, dict2: dict, inplace: bool = False): + """ + Merges two dictionaries and all is children recursively + + :param dict1: dictionary to be merged into + :param dict2: dictionary to be merged + :param inplace: if False, default, a new dictionary will be returned als in-place merging is performed. + """ + if not inplace: + dict1 = copy.deepcopy(dict1) + dict2 = copy.deepcopy(dict2) + + for k, v in dict2.items(): + if isinstance(dict1.get(k), dict) and isinstance(v, dict): + dict1[k] = deep_merge_dict(dict1[k], v, inplace=inplace) + elif k not in dict1: + dict1[k] = v + elif isinstance(dict1.get(k), list) and isinstance(v, list): + dict1[k] = list(dict.fromkeys(dict1[k] + v)) + elif isinstance(dict1.get(k), str) and isinstance(v, str): + dict1[k] = [dict1.get(k), v] + else: + raise TypeError(f"{k} has different types that can not be merged.") + + return dict1 diff --git a/environment.yml b/environment.yml index ec905be2..53be3e1c 100644 --- a/environment.yml +++ b/environment.yml @@ -8,3 +8,6 @@ dependencies: - drop - flake8 - bioconductor-bsgenome.hsapiens.ucsc.hg19 + + # required for downloading R packages through github + - unzip diff --git a/tests/config/test_AE.py b/tests/config/test_AE.py index fa1e1023..ed211c24 100644 --- a/tests/config/test_AE.py +++ b/tests/config/test_AE.py @@ -3,7 +3,7 @@ class Test_AE_Config: def test_config(self, dropConfig,demo_dir): assert dropConfig.AE.getWorkdir() == f"{demo_dir}/Scripts/AberrantExpression/pipeline" dict_ = { - 'groups': ['outrider', 'import_exp'], + 'groups': ['outrider', 'outrider_external'], 'fpkmCutoff': 1, 'implementation': 'autoencoder', 'padjCutoff': 1, @@ -35,9 +35,9 @@ def test_getCountsFiles(self, demo_dir, dropConfig): # import count counts_files_true = counts_files_true[2:] - counts_files_true.append(f"{demo_dir}/Data/external_geneCounts.tsv.gz") + counts_files_true.append(f"{demo_dir}/Data/external_count_data/geneCounts.tsv.gz") counts_files_true.sort() - counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="import_exp") + counts_files_test = dropConfig.AE.getCountFiles(annotation="v29", group="outrider_external") counts_files_test.sort() assert counts_files_true == counts_files_test diff --git a/tests/config/test_AS.py b/tests/config/test_AS.py index df6b527d..5ab6d1e2 100644 --- a/tests/config/test_AS.py +++ b/tests/config/test_AS.py @@ -3,7 +3,7 @@ class Test_AS_Config: def test_config(self, dropConfig,demo_dir): assert dropConfig.AS.getWorkdir() == f"{demo_dir}/Scripts/AberrantSplicing/pipeline" dict_ = { - 'groups': ['fraser'], + 'groups': ['fraser', 'fraser_external'], 'recount': True, 'longRead': False, 'keepNonStandardChrs': True, @@ -19,7 +19,7 @@ def test_config(self, dropConfig,demo_dir): assert dict_.items() <= dropConfig.AS.dict_.items() def test_getSplitCountFiles(self, demo_dir, dropConfig): - counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-fraser/sample_tmp/" \ + counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-local-fraser/sample_tmp/" \ "splitCounts" ids = [ 'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc', @@ -35,7 +35,7 @@ def test_getSplitCountFiles(self, demo_dir, dropConfig): assert counts_files_true == counts_files_test def test_getNonSplitCountFiles(self, demo_dir, dropConfig): - counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-fraser/sample_tmp/" \ + counts_dir = f"{demo_dir}/Output/processed_data/aberrant_splicing/datasets/cache/raw-local-fraser/sample_tmp/" \ "nonSplitCounts" ids = [ 'HG00096.1.M_111124_6_trunc', 'HG00103.4.M_120208_3_trunc', 'HG00111.2.M_111215_4_trunc', diff --git a/tests/config/test_MAE.py b/tests/config/test_MAE.py index 54a2d8e6..c1f480e9 100644 --- a/tests/config/test_MAE.py +++ b/tests/config/test_MAE.py @@ -7,8 +7,8 @@ def test_config(self,dropConfig,demo_dir): 'groups': ['mae'], 'qcGroups': ['mae'], 'gatkIgnoreHeaderCheck': True, - 'padjCutoff': 0.05, - 'allelicRatioCutoff': 0.8, + 'padjCutoff': 0.5, + 'allelicRatioCutoff': 0.7, 'maxAF': 0.001, 'addAF': False, 'maxVarFreqCohort': 1, diff --git a/tests/config/test_SampleAnnotation.py b/tests/config/test_SampleAnnotation.py index ba93966f..311789f3 100644 --- a/tests/config/test_SampleAnnotation.py +++ b/tests/config/test_SampleAnnotation.py @@ -13,9 +13,9 @@ def test_columns(self, sampleAnnotation): def test_mapping(self, sampleAnnotation): # ID mappings/groups - assert sampleAnnotation.idMapping.shape == (22, 2) - assert sampleAnnotation.sampleFileMapping.shape == (32, 4) - true_mapping = {'mae': 2, 'import_exp': 8, 'outrider': 10, 'fraser': 10} + assert sampleAnnotation.idMapping.shape == (24, 2) + assert sampleAnnotation.sampleFileMapping.shape == (35, 4) + true_mapping = {'mae': 2, 'outrider_external': 8, 'outrider': 10, 'fraser': 10, 'fraser_external': 7} assert true_mapping == {k: len(v) for k, v in sampleAnnotation.rnaIDs.items()} assert true_mapping == {k: len(v) for k, v in sampleAnnotation.dnaIDs.items()} @@ -23,8 +23,9 @@ def test_mapping(self, sampleAnnotation): "sample_id,file_type,file_name", [ ("HG00096.1.M_111124_6", "RNA_BAM_FILE", "Data/rna_bam/HG00096.1.M_111124_6_chr21.bam"), - ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_geneCounts.tsv.gz"), - ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz") + ("HG00178.4.M_120208_8", "GENE_COUNTS_FILE", "Data/external_count_data/geneCounts.tsv.gz"), + ("HG00096", "DNA_VCF_FILE", "Data/dna_vcf/demo_chr21.vcf.gz"), + ("HG00201.1.M_120208_6", "SPLICE_COUNTS_DIR", "Data/external_count_data") ] ) def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_name): @@ -35,7 +36,7 @@ def test_filePaths(self, demo_dir, sampleAnnotation, sample_id, file_type, file_ @pytest.mark.parametrize( "annotation,group,files", [ - ("v29", "import_exp", {'Data/external_geneCounts.tsv.gz'}) + ("v29", "outrider_external", {'Data/external_count_data/geneCounts.tsv.gz'}) ] ) def test_import(self, demo_dir, sampleAnnotation, annotation, group, files): diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py index 1251c5ec..2befe1e2 100644 --- a/tests/pipeline/test_AE.py +++ b/tests/pipeline/test_AE.py @@ -50,7 +50,7 @@ def test_results(self, demo_dir): assert "res: 4310 15" in r.stdout def test_import_results(self, demo_dir): - output_dir = "Output/processed_results/aberrant_expression/v29/outrider/import_exp" + output_dir = "Output/processed_results/aberrant_expression/v29/outrider/outrider_external" r_cmd = """ # ods object ods <- readRDS(file.path("{}", "ods.Rds")) @@ -69,17 +69,12 @@ def test_import_results(self, demo_dir): def no_import(self, demo_dir): LOGGER.info("dryrun without import counts...") - # remove last 2 lines of sample annotation - run("head -n -2 Data/sample_annotation.tsv > Data/sample_annotation_noimp.tsv", demo_dir) - # adapt config - run("sed 's/sample_annotation.tsv/sample_annotation_noimp.tsv/' config.yaml | " - "sed '/import_exp/d' > config_noimp.yaml", demo_dir) + run("sed '/outrider_external/d' config.yaml > config_noimp.yaml", demo_dir) yield demo_dir # reset changed files back to original - run("rm Data/sample_annotation_noimp.tsv", demo_dir) run("rm config_noimp.yaml", demo_dir) def test_no_import(self, no_import): diff --git a/tests/pipeline/test_MAE.py b/tests/pipeline/test_MAE.py index d014790f..f5ea3e76 100644 --- a/tests/pipeline/test_MAE.py +++ b/tests/pipeline/test_MAE.py @@ -32,7 +32,7 @@ def test_counts(self, demo_dir): assert "[1] 235" in r.stdout @pytest.mark.usefixtures("pipeline_run") - def test_results(self, demo_dir): + def test_all_results(self, demo_dir): results_file = "Output/processed_results/mae/mae/MAE_results_all_v29.tsv.gz" r_cmd = """ library(data.table) @@ -41,3 +41,14 @@ def test_results(self, demo_dir): """.format(results_file) r = runR(r_cmd, demo_dir) assert "[1] 253" in r.stdout + + @pytest.mark.usefixtures("pipeline_run") + def test_sig_results(self, demo_dir): + results_file = "Output/processed_results/mae/mae/MAE_results_v29.tsv" + r_cmd = """ + library(data.table) + res <- fread("{}") + print(nrow(res)) + """.format(results_file) + r = runR(r_cmd, demo_dir) + assert "[1] 3" in r.stdout