From b3a6ea0b1e77f621d51bb4b44ece63484e806c03 Mon Sep 17 00:00:00 2001 From: js2264 Date: Thu, 19 Oct 2023 11:45:22 +0200 Subject: [PATCH] doc: remove useless callout boxes --- _common.R | 2 +- data-representation.qmd | 43 +++++++----------------------- interactions-centric.qmd | 26 +++--------------- matrix-centric.qmd | 24 ++++------------- parsing.qmd | 13 +++------ principles.qmd | 20 +++++--------- topological-features.qmd | 57 ++++++++++++++-------------------------- 7 files changed, 48 insertions(+), 137 deletions(-) diff --git a/_common.R b/_common.R index e23ef44..789e562 100644 --- a/_common.R +++ b/_common.R @@ -17,7 +17,7 @@ options( cli.num_colors = 0, cli.hyperlink = FALSE, pillar.bold = TRUE, - width = 77 # 80 - 3 for #> comment + width = 9999 # 80 - 3 for #> comment ) ggplot2::theme_set(ggplot2::theme_gray(12)) diff --git a/data-representation.qmd b/data-representation.qmd index 557c847..825c081 100644 --- a/data-representation.qmd +++ b/data-representation.qmd @@ -238,7 +238,7 @@ involve comparisons *between* ranges **in a single** GRanges object. ::: {.callout-note} ## Note Compared to previous section, the result of each function described below -depends on the entire set of ranges in the input `GRanges` object. +depends on the **entire set of ranges** in the input `GRanges` object. ::: - Computing the "inverse" genomic ranges, i.e. ranges in-between the input ranges: @@ -345,18 +345,13 @@ none of the remaining ranges from `query` overlap with ranges from `subject`. To directly **subset** ranges from `query` overlapping with ranges from a `subject` (e.g. to only keep *peaks* overlapping a *TSS*), we can use the -`subsetByOverlaps` function. +`subsetByOverlaps` function. The output of `subsetByOverlaps` is a subset of the original `GRanges` object +provided as a `query`, with retained ranges being unmodified. ```{r} subsetByOverlaps(peaks, TSSs) ``` -::: {.callout-note} -## Note -The output of `subsetByOverlaps` is a subset of the original `GRanges` object -provided as a `query`, with retained ranges being unmodified. -::: - - Counting overlaps between a query and a subject Finally, the `countOverlaps` is used to count, for each range in a `query`, how @@ -506,8 +501,6 @@ The way `GInteractions` objects are printed in an R console mimics that of `GRanges`, but pairs two "ends" (a.k.a. *anchors*) of an interaction together, each end being represented as a separate `GRanges` range. -::: {.callout-note} -## Notes - Note that it is possible to have interactions joining two identical anchors. ```{r} @@ -525,7 +518,6 @@ gi[4] ```{r} gi[5] ``` -::: ### `GInteractions` specific slots @@ -650,9 +642,7 @@ Note that for "trans" inter-chromosomal interactions, i.e. interactions with anc different chromosomes, the notion of genomic distance is meaningless and for this reason, `pairdist` returns a `NA` value. -::: {.callout-tip} -## Advanced `pairdist` arguments -The `type` argument can be tweaked to specify which type of "distance" should +The `type` argument of the `pairdist()` function can be tweaked to specify which type of "distance" should be computed: - `mid`: The distance between the midpoints of the two regions @@ -665,7 +655,6 @@ be computed: - `diag`: The difference between the anchor indices is returned. This corresponds to a diagonal on the interaction space when bins are used in the 'regions' slot of 'x'. -::: #### `GInteractions` overlap methods @@ -929,8 +918,6 @@ to describe important topological features. the contact matrix has been created. This is often useful to estimate some Hi-C metrics. - `metadata` is a `list` to further describe the experiment. -::: {.callout-tip} -## `HiCExperiment` slots These pieces of information are called `slots`. They can be directly accessed using `getter` functions, bearing the same name than the slot. @@ -953,10 +940,6 @@ pairsFile(hic) metadata(hic) ``` -::: - -::: {.callout-note} -## Notes `import` also works for other types of `ContactFile` (`HicFile`, `HicproFile`, `PairsFile`), e.g. @@ -977,17 +960,19 @@ pf <- PairsFile(pairsf) pairs <- import(pf) pairs ``` -::: #### Customizing the `import` To reduce the `import` to only parse the data that is relevant to the study, two arguments can be passed to `import`, along with a `ContactFile`. -::: {.callout-tip} -## Key `import` arguments: +::: {.callout-warning icon='true'} + +##### Key `import` arguments: + - `focus`: This can be used to **only parse data for a specific genomic location**. - `resolution`: This can be used to choose which resolution to parse the contact matrix at (this is ignored if the `ContactFile` is not multi-resolution, e.g. `.cool` or HiC-Pro generated matrices) + ::: - Import interactions within a single chromosome: @@ -1059,12 +1044,9 @@ The imported genomic interactions can be directly **exposed** using the interactions(yeast_hic) ``` -::: {.callout-note} -## Note Because genomic interactions are actually stored as `GInteractions`, `regions` and `anchors` work on `HiCExperiment` objects just as they work with `GInteractions`! -::: ```{r} regions(yeast_hic) @@ -1119,8 +1101,6 @@ head(scores(yeast_hic, "count")) head(scores(yeast_hic, "balanced")) ``` -::: {.callout-tip} -## Tip Calling `interactions(hic)` returns a `GInteractions` with `scores` already stored in extra columns. This short-hand allows one to dynamically check `scores` directly from the `interactions` output. @@ -1130,7 +1110,6 @@ interactions(yeast_hic) head(interactions(yeast_hic)$count) ``` -::: #### topologicalFeatures @@ -1173,8 +1152,7 @@ pairsFile(yeast_hic) readLines(pairsFile(yeast_hic), 25) ``` -::: {.callout-important} -## Importing a `PairsFile` +#### Importing a `PairsFile` The `.pairs` file linked to a `HiCExperiment` object can itself be imported in a `GInteractions` object: @@ -1185,7 +1163,6 @@ import(pairsFile(yeast_hic), format = 'pairs') Note that these `GInteractions` are **not** binned, contrary to `interactions` extracted from a `HiCExperiment`. Anchors of the interactions listed in the `GInteractions` imported from a disk-stored `.pairs` file are all of width `1`. -::: ## Visual summary of the `HiCExperiment` data structure diff --git a/interactions-centric.qmd b/interactions-centric.qmd index b0065f2..03ea607 100644 --- a/interactions-centric.qmd +++ b/interactions-centric.qmd @@ -115,15 +115,12 @@ pf <- PairsFile(pairsf) pf ``` -::: {.callout-note} -## Reminder! -`PairsFile` connections can be imported directly into a `GInteractions` object -with `import()`: +If needed, `PairsFile` connections can be imported directly into a `GInteractions` object +with `import()`. ```{r} import(pf) ``` -::: We can compute a P(s) per chromosome from this `.pairs` file using the `distanceLaw` function. @@ -133,15 +130,6 @@ ps <- distanceLaw(pf, by_chr = TRUE) ps ``` -::: {.callout-note} -## Note -Because this is a toy dataset, contacts are only provided for the chromosome `II`. - -```{r} -table(ps$chr) -``` -::: - The `plotPs()` and `plotPsSlope()` functions are convenient `ggplot2`-based functions with pre-configured settings optimized for P(s) visualization. @@ -198,13 +186,8 @@ plotPsSlope(ps_from_hic, aes(x = binned_distance, y = slope)) The ratio between cis interactions and trans interactions is often used to assess the overall quality of a Hi-C dataset. It can be computed *per chromosome* -using the `cisTransRatio()` function. - -::: {.callout-tip} -## Tip! -You will need to provide a genome-wide `HiCExperiment` to estimate +using the `cisTransRatio()` function. You will need to provide a **genome-wide** `HiCExperiment` to estimate cis/trans ratios! -::: ```{r} full_hic <- import(cf, resolution = 2000) @@ -223,13 +206,10 @@ ggplot(ct, aes(x = chr, y = cis_pct)) + labs(x = 'Chromosomes', y = '% of cis contacts') ``` -::: {.callout-important} -## Watch out Cis/trans contact ratios will greatly vary **depending on the cell cycle phase the sample is in!** For instance, chromosomes during the mitosis phase of the cell cycle have very little trans contacts, due to their structural organization and individualization. -::: ## Virtual 4C profiles diff --git a/matrix-centric.qmd b/matrix-centric.qmd index f75628a..7cd89cd 100644 --- a/matrix-centric.qmd +++ b/matrix-centric.qmd @@ -114,20 +114,15 @@ data can be normalized using matrix balancing approaches disk-stored matrices using out-of-memory strategies (e.g. with `cooler balance <.cool>`). However, if contact matrix files are imported into a `HiCExperiment` object but no `balanced` scores are available, -in-memory balancing can be performed using the `normalize` function. +in-memory balancing can be performed using the `normalize` function. This +adds an extra `ICE` element in `scores` list (while the `interactions` +themselves are unmodified). ```{r} normalized_hic <- normalize(hic) normalized_hic ``` -::: {.callout-note} -## Note -The only change done to the `HiCExperiment` object by the `normalize` function is -the addition of a single extra `ICE` in `scores` list. The `interactions` -themselves are unmodified. -::: - It is possible to plot the different `scores` of the resulting object to visualize the newly computed `scores`. In this example, `ICE` scores should be nearly identical to `balanced` scores, which were originally imported @@ -161,23 +156,14 @@ This is sometimes called "detrending", as it effectively removes the average polymer behavior from the balanced matrix. The `detrend` function performs this operation on a given `HiCExperiment` object. +It adds two extra elements in `scores` list: `expected` and `detrended` metrics +(while the `interactions` themselves are unmodified). ```{r} detrended_hic <- detrend(hic) detrended_hic ``` -::: {.callout-note} -## Note -The only change done to the `HiCExperiment` object by the `detrend` function is -the addition of two extra `scores`: - -1. `expected` -2. `detrended` - -The `interactions` themselves are unmodified. -::: - Topological features will be visually more prominent in the O/E `detrended` Hi-C map. :::{.column-page-right} diff --git a/parsing.qmd b/parsing.qmd index 1881b98..a8c7687 100644 --- a/parsing.qmd +++ b/parsing.qmd @@ -212,9 +212,6 @@ telomere <- GRanges("II:700001-813184") subsetByOverlaps(hic, telomere) |> interactions() ``` -::: {.callout-important icon='true'} -## `type` argument - By default, `subsetByOverlaps(hic, telomere)` will only recover interactions **constrained** within `telomere`, i.e. interactions for which both ends are in `telomere`. @@ -225,7 +222,6 @@ at least one of their anchors within `telomere`. ```{r} subsetByOverlaps(hic, telomere, type = "any") |> interactions() ``` -::: #### `["..."]` @@ -287,10 +283,9 @@ hic["II:300001-320000|IV:1-100000"] ```{r} hic[c('II', 'III', 'IV')] ``` -::: -::: {.callout-note} -## Note +Some notes: + - This last example (subsetting for a vector of several chromosomes) is the only scenario for which `[`-based in-memory subsetting of pre-imported data is the only way to go, as such subsetting is not possible with `focus` @@ -301,6 +296,7 @@ efficiently using the `focus` argument when `import`ing data into a - However, keep in mind that subsetting preserves extra data, e.g. added `scores`, `topologicalFeatures`, `metadata` or `pairsFile`, whereas this information is lost using `focus` with `import`. + ::: ### Zooming on a `HiCExperiment` @@ -431,8 +427,6 @@ topologicalFeatures(hic, 'loops') hic ``` -::: {.callout-note} -## Note All these objects can be used in `*Overlap` methods, as they all extend the `GRanges` class of objects. @@ -444,7 +438,6 @@ countOverlaps( subject = topologicalFeatures(hic, 'loops') ) ``` -::: #### `pairsFile` diff --git a/principles.qmd b/principles.qmd index 70f0735..0b281ba 100644 --- a/principles.qmd +++ b/principles.qmd @@ -50,11 +50,9 @@ clusters of 3D contacts. ### Sequencing -Hi-C libraries are traditionally sequenced with short-read technology, and are by essence paired-end libraries. For this reason, the end result of the experimental side of the Hi-C consists of two fastq files, each one containing sequences for one extremity of the DNA fragments purified during Hi-C. These are the two files we need +Hi-C libraries are traditionally sequenced with short-read technology, and are by essence paired-end libraries. For this reason, the end result of the experimental side of the Hi-C consists of **two fastq files**, each one containing sequences for one extremity of the DNA fragments purified during Hi-C. These are the two files we need to move on to the computational side of Hi-C. -::: {.callout-important} -### What is a fastq file? Fastq files are plain text files (usually compressed, with the `.gz` extension). They are generated by the sequencing machine during a sequencing run, and for Hi-C, necessarily come in pairs, generally called `*_R1.fq.gz` and `*_R2.fq.gz`. @@ -78,16 +76,15 @@ GCTGTTGTTGTTGTTGTATTTGCA ``` These two reads are the first listed in their respective file. -Notice how they bear the same name (first line): they form a pair. The second +Notice how they bear the same name (first line): **they form a pair**. The second line corresponds to the sequence read by the sequencer, the third line is a single `+` separator, and the last line indicates the per-base sequencing quality following a nebulous cypher. -::: ## Hi-C file formats Two important output files are typically generated during Hi-C data pre-processing: -- A "pairs" file -- A binned "contact matrix" file +- A **"pairs" file**; +- A **binned "contact matrix"** file We will now describe the structure of these different types of files. Directly jump to the [next chapter](data-representation.qmd) if you want to know @@ -100,8 +97,6 @@ output of processing Hi-C fastq files. It stores information about putative proximity contacts identified by digestion/religation, in the lossless, human-readable, indexable format: the `.pairs` format. -::: {.callout-important} -### What is a .pairs file? A `.pairs` file is organized in a `header` followed by a `body`: - `header`: starts with `#` @@ -145,7 +140,6 @@ EAS139:136:FC706VJ:2:8762:23765:128766 chr1 50000 chr1 70000 + + EAS139:136:FC706VJ:2:2342:15343:9863 chr1 60000 chr2 10000 + + EAS139:136:FC706VJ:2:1286:25:275154 chr1 30000 chr3 40000 + - ``` -::: [More information](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md) about the conventions related to this text file are provided by the [4DN consortium](https://www.4dnucleome.org/), @@ -388,16 +382,16 @@ HiCool( ) ``` -::: {.callout-tip} -##### HiCool arguments +#### HiCool arguments + Several arguments can be passed to `HiCool` and some are worth mentioning them: + - `restriction`: (default: `"DpnII,HinfI"`) - `resolutions`: (default: `NULL`, automatically inferring resolutions based on genome size) - `iterative`: (default: `TRUE`) - `filter`: (default: `TRUE`) - `balancing_args`: (default: `" --cis-only --min-nnz 3 --mad-max 7 "`) - `threads`: (default: `1L`) -::: Other `HiCool` arguments can be listed by checking `HiCool` documentation in R: `?HiCool::HiCool`. diff --git a/topological-features.qmd b/topological-features.qmd index 4b02c42..ec6a003 100644 --- a/topological-features.qmd +++ b/topological-features.qmd @@ -78,9 +78,6 @@ microC_compts <- getCompartments(microC, genome = phasing_track) microC_compts ``` -::: {.callout-note} -## Note - `getCompartments()` is an endomorphism: it returns the original object, enriched with two new pieces of information: @@ -95,7 +92,6 @@ topologicalFeatures(microC_compts, "compartments") ```{r} metadata(microC_compts)$eigens ``` -::: ### Exporting compartment tracks @@ -157,13 +153,8 @@ plotSaddle(microC_compts, nbins = 25, BPPARAM = SerialParam(progressbar = FALSE) Here, the top-left small corner represents average O/E scores between strong B compartments and the bottom-right larger corner represents average O/E scores -between strong A compartments. - -::: {.callout-note} -## Note -Only `chr17` interactions are contained in this dataset, explaining the grainy +between strong A compartments. Note that only `chr17` interactions are contained in this dataset, explaining the grainy aspect of the saddle plot. -::: ## Topological domains @@ -197,16 +188,6 @@ hic <- zoom(microC, 5000) |> hic ``` -::: {.callout-note} -## Note - -The `getDiamondInsulation` function can be parallelized over multiple -threads by specifying the Bioconductor generic `BPPARAM` argument. -::: - -::: {.callout-note} -## Note - `getDiamondInsulation()` is an endomorphism: it returns the original object, enriched with two new pieces of information: @@ -221,6 +202,12 @@ topologicalFeatures(hic, "borders") ```{r} metadata(hic)$insulation ``` + +::: {.callout-note} +## Note + +The `getDiamondInsulation` function can be parallelized over multiple +threads by specifying the Bioconductor generic `BPPARAM` argument. ::: ### Exporting insulation scores tracks @@ -303,9 +290,6 @@ hic ## metadata(1): chromosight_args ``` -::: {.callout-note} -## Note - `getLoops()` is an endomorphism: it returns the original object, enriched with two new pieces of information: @@ -402,26 +386,14 @@ metadata(hic)$chromosight_args ## $`--threads` ## [1] 1 ``` -::: - -#### Exporting chromatin loops - -```{r eval = FALSE} -loops <- topologicalFeatures(hic, "loops") -loops <- loops[loops$score >= 0.4 & loops$qvalue <= 1e-6] -GenomicInteractions::export.bedpe(loops, 'loops.bedpe') -``` - -#### Visualizing chromatin loops -::: {.callout-tip} -## Chromosight users +#### Importing loops from files If you are using `chromosight` directly from the terminal (i.e. outside `R`), you can import the annotated loops in `R` as follows: ```{r eval = FALSE} -df <- readr::read_tsv("...") +df <- readr::read_tsv("...") ## Here put your loops file loops <- InteractionSet::GInteractions( anchor1 = GenomicRanges::GRanges( df$chrom1, IRanges::IRanges(df$start1+1, df$end1) @@ -436,7 +408,16 @@ loops <- InteractionSet::GInteractions( qvalue = df$qvalue ) ``` -::: + +#### Exporting chromatin loops + +```{r eval = FALSE} +loops <- topologicalFeatures(hic, "loops") +loops <- loops[loops$score >= 0.4 & loops$qvalue <= 1e-6] +GenomicInteractions::export.bedpe(loops, 'loops.bedpe') +``` + +#### Visualizing chromatin loops ```{r eval = FALSE} plotMatrix(