From fb7b93a55c87d94feb29eca999160fd974eccd2a Mon Sep 17 00:00:00 2001 From: "Documenter.jl" Date: Tue, 30 Jul 2024 12:45:42 +0000 Subject: [PATCH] build based on 5304b9e --- dev/.documenter-siteinfo.json | 2 +- dev/01_Change_B_factors/index.html | 2 +- .../index.html | 2 +- dev/03_RMSF/{3c638126.svg => 1cf94683.svg} | 56 +- dev/03_RMSF/{0421e1b1.svg => 609273e0.svg} | 656 +++++++++--------- dev/03_RMSF/index.html | 4 +- dev/Example/index.html | 2 +- dev/Information/index.html | 12 +- dev/Information_API/index.html | 37 +- dev/Installation/index.html | 2 +- dev/MSA/index.html | 22 +- dev/MSA_API/index.html | 32 +- dev/PDB/index.html | 2 +- dev/PDB_API/index.html | 16 +- dev/Pfam/index.html | 2 +- dev/Pfam_API/index.html | 4 +- dev/References/index.html | 2 +- dev/SIFTS/index.html | 2 +- dev/SIFTS_API/index.html | 4 +- dev/Scripts/index.html | 2 +- dev/Utils_API/index.html | 8 +- dev/index.html | 2 +- dev/inf_entropy.png | Bin 16075 -> 17625 bytes dev/search_index.js | 2 +- 24 files changed, 444 insertions(+), 431 deletions(-) rename dev/03_RMSF/{3c638126.svg => 1cf94683.svg} (84%) rename dev/03_RMSF/{0421e1b1.svg => 609273e0.svg} (68%) diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index fe4dec9a..e4235ee8 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-07-30T12:18:27","documenter_version":"1.5.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-07-30T12:45:35","documenter_version":"1.5.0"}} \ No newline at end of file diff --git a/dev/01_Change_B_factors/index.html b/dev/01_Change_B_factors/index.html index 2b34828f..c89e7c4a 100644 --- a/dev/01_Change_B_factors/index.html +++ b/dev/01_Change_B_factors/index.html @@ -48,4 +48,4 @@ res.atoms[i] = change_b_factor(atom, hydrophobicity[res.id.name]) end end -end

Finally, we can save the changed residues in a new PDB file.

write_file("4zj9_hydrophobicity.pdb", pdb_residues, PDBFile)

Discussion

While we have focused on changing the B-factor field of a PDBAtom, you can use the same approach to change other fields. However, if you want to change atom coordinates, it is better to use the change_coordinates function from the PDB module of MIToS.

MIToS atoms and residues generally stores the string present in the input file without surrounding spaces. You can use the Format module to create these strings and strip to get rid of the spaces. You can see the PDB format description to know what is the format of the expected string or see the MIToS PDB print_file source code to get a quick idea.


This page was generated using Literate.jl.

+end

Finally, we can save the changed residues in a new PDB file.

write_file("4zj9_hydrophobicity.pdb", pdb_residues, PDBFile)

Discussion

While we have focused on changing the B-factor field of a PDBAtom, you can use the same approach to change other fields. However, if you want to change atom coordinates, it is better to use the change_coordinates function from the PDB module of MIToS.

MIToS atoms and residues generally store the strings present in the input file without surrounding spaces. You can use the Format module to create these strings and strip to get rid of the spaces. You can see the PDB format description to know what is the format of the expected string or see the MIToS PDB print_file source code to get a quick idea.


This page was generated using Literate.jl.

diff --git a/dev/02_Linking_structural_and_evolutionary_information/index.html b/dev/02_Linking_structural_and_evolutionary_information/index.html index 41fb36fa..95c0af4d 100644 --- a/dev/02_Linking_structural_and_evolutionary_information/index.html +++ b/dev/02_Linking_structural_and_evolutionary_information/index.html @@ -257,4 +257,4 @@ 39 => "32" 51 => "44" 61 => "54" - ⋮ => ⋮

This page was generated using Literate.jl.

+ ⋮ => ⋮

This page was generated using Literate.jl.

diff --git a/dev/03_RMSF/3c638126.svg b/dev/03_RMSF/1cf94683.svg similarity index 84% rename from dev/03_RMSF/3c638126.svg rename to dev/03_RMSF/1cf94683.svg index 7d288ae4..0aa5684f 100644 --- a/dev/03_RMSF/3c638126.svg +++ b/dev/03_RMSF/1cf94683.svg @@ -1,43 +1,43 @@ - + - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dev/03_RMSF/0421e1b1.svg b/dev/03_RMSF/609273e0.svg similarity index 68% rename from dev/03_RMSF/0421e1b1.svg rename to dev/03_RMSF/609273e0.svg index 6f0c5024..dbaebf5a 100644 --- a/dev/03_RMSF/0421e1b1.svg +++ b/dev/03_RMSF/609273e0.svg @@ -1,343 +1,343 @@ - + - + - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/dev/03_RMSF/index.html b/dev/03_RMSF/index.html index affe5414..81ec948a 100644 --- a/dev/03_RMSF/index.html +++ b/dev/03_RMSF/index.html @@ -2,7 +2,7 @@ Root Mean Squared Fluctuation (RMSF) · MIToS

Root Mean Squared Fluctuation (RMSF)

md # md #

Problem description

The Root Mean Squared Fluctuation (RMSF) is a common way to measure residue flexibility in a structural ensemble. It is a measure of how far the residue moves from its average position in the group of structures. Usually, we represent a residue position with the spatial coordinates of its alpha carbon.

The protein structures should be previously superimposed to calculate the RMSF, for example, by using the superimpose function of the PDB module of MIToS. In this example, we are going to measure the RMSF of each residue from an NMR ensemble using the rmsf function.

The structure superimposition could be the most complicated step of the process, depending on the input data. In particular, if the structures come from different PDB entries or homologous proteins, it can require the use of external programs, such as MAMMOTH-mult or MUSTANG among others, tailored for this task.

In this case, we are going to use an NMR ensemble. Therefore, we are not going to need to superimpose the structures as NMR models have the same protein sequence and are, usually, well-aligned.

MIToS solution

import MIToS
 using MIToS.PDB
 using Plots

Let's read the NMR ensemble:

pdb_file = abspath(pathof(MIToS), "..", "..", "test", "data", "1AS5.pdb")
-pdb_res = read_file(pdb_file, PDBFile, occupancyfilter = true)

We set occupancyfilter to true to ensure that we have one single set of coordinates for each atom. That filter isn't essential for NMR structures, but It can avoid multiple alpha carbons in crystallographic structures with disordered atoms. We can get an idea of the alpha carbon positions by plotting these residues:

scatter(pdb_res, legend = false)
Example block output

As we saw in the previous plot, the structure doesn't need to be superimposed. Now, we are going to separate each model into different vectors, storing each vector into a Dict:

models = Dict{String,Vector{PDBResidue}}()
+pdb_res = read_file(pdb_file, PDBFile, occupancyfilter = true)

We set occupancyfilter to true to ensure that we have one single set of coordinates for each atom. That filter isn't essential for NMR structures, but it can avoid multiple alpha carbons in crystallographic structures with disordered atoms. We can get an idea of the alpha carbon positions by plotting these residues:

scatter(pdb_res, legend = false)
Example block output

As we saw in the previous plot, the structure doesn't need to be superimposed. Now, we are going to separate each model into different vectors, storing each vector into a Dict:

models = Dict{String,Vector{PDBResidue}}()
 for res in pdb_res
     push!(get!(models, res.id.model, []), res)
 end

Then, we simply need to collect all the PDB models in the values of the Dict, to get the vector of PDBResidues vectors required to calculate the RMSF.

pdb_models = collect(values(models))

And, finally, call the rmsf function on the list of structures. It is important that all the vectors have the same number of PDBResidues. This function assumes that the nth element of each vector corresponds to the same residue:

RMSF = rmsf(pdb_models)
21-element Vector{Float64}:
@@ -25,4 +25,4 @@
  0.8242922195831439
  1.004681790419235
  1.4029641961626411
- 3.0733292325145656

This return the vector of RMSF values for each residue, calculated using the coordinates of the alpha carbons. You can plot this vector to get an idea of the which are the most flexible position in your structure:

plot(RMSF, legend = false, xlab = "Residue", ylab = "RMSF [Å]")
Example block output

This page was generated using Literate.jl.

+ 3.0733292325145656

This returns the vector of RMSF values for each residue, calculated using the coordinates of the alpha carbons. You can plot this vector to get an idea of which are the most flexible positions in your structure:

plot(RMSF, legend = false, xlab = "Residue", ylab = "RMSF [Å]")
Example block output

This page was generated using Literate.jl.

diff --git a/dev/Example/index.html b/dev/Example/index.html index e0e74313..2d9507d1 100644 --- a/dev/Example/index.html +++ b/dev/Example/index.html @@ -42,4 +42,4 @@ 131 │ 0.0891384 0.0324601 0.126517 -0.0500363 132 │ 0.0731305 -0.104509 -0.0734673 -0.107737 133 │ 0.312175 -0.00813555 NaN -0.0685581 -134 │ -0.0747719 -0.188341 … -0.0685581 NaN

Once the Plots package is installed and loaded, you can use its capabilities to visualize this results:

heatmap(ZBLMIp, yflip = true, c = :grays)

MIToS in system command line

Calculate ZBLMIp on the system shell is easy using the script called BLMI.jl in the MIToS_Scripts.jl package. This script reads a MSA file, and writes a file with the same base name of the input but with the .BLMI.csv extension.

julia BLMI.jl PF14972.stockholm.gz
+134 │ -0.0747719 -0.188341 … -0.0685581 NaN

Once the Plots package is installed and loaded, you can use its capabilities to visualize these results:

heatmap(ZBLMIp, yflip = true, c = :grays)

MIToS in system command line

Calculating ZBLMIp on the system shell is easy using the script called BLMI.jl in the MIToS_Scripts.jl package. This script reads an MSA file and writes a file with the same base name as the input but with the .BLMI.csv extension.

julia BLMI.jl PF14972.stockholm.gz
diff --git a/dev/Information/index.html b/dev/Information/index.html index b5005ac3..44fdea08 100644 --- a/dev/Information/index.html +++ b/dev/Information/index.html @@ -1,5 +1,5 @@ -Information · MIToS

Information

The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from these counts. Also, includes methods for applying corrections to those tables, e.g. pseudocounts and pseudo frequencies. Finally, Information allows to use these probabilities and counts to estimate information measures and other frequency based values.

using MIToS.Information # to load the Information module

Features

  • Estimate multi dimensional frequencies and probability tables from sequences, MSAs, etc...
  • Correction for small number of observations
  • Correction for data redundancy on a MSA
  • Estimate information measures
  • Calculate corrected mutual information between residues

Contents

Counting residues

MIToS Information module defines a multidimensional ContingencyTable type and two types wrapping it, Frequencies and Probabilities, to store occurrences or probabilities. The ContingencyTable type stores the contingency matrix, its marginal values and total. These types are parametric, taking three ordered parameters:

  • T : The type used for storing the counts or probabilities, e.g. Float64. It's possible to use BigFloat if more precision it's needed.
  • N : It's the dimension of the table and should be an Int.
  • A : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.
Note

ContingencyTable can be used for storing probabilities or counts. The wrapper types Probabilities and Frequencies are mainly intended to dispatch in methods that need to know if the matrix has probabilities or counts, e.g. shannon_entropy. In general, the use of ContingencyTable is recommended over the use of Probabilities and Frequencies.

In this way, a matrix for storing pairwise probabilities of residues (without gaps) can be initialized using:

using MIToS.Information
+Information · MIToS

Information

Extracting evolutionary signals, such as conservation and coevolution, from Multiple Sequence Alignments (MSAs) is a common task in bioinformatics. There are several methods to estimate these signals, including information measures like Shannon Entropy—to assess the conservation of a position—and Mutual Information—to assess the coevolution between two positions. The Information module of MIToS defines types and functions useful for calculating those information measures over an MSA. This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from these counts. It also includes methods for applying corrections to those tables, e.g., pseudo counts and pseudo frequencies. Finally, Information allows using probabilities and counts to estimate information measures and other frequency-based values.

using MIToS.Information # to load the Information module

Features

  • Estimate multi-dimensional frequencies (counts) and probability tables from sequences, MSA columns, etc...
  • Corrections for a small number of observations
  • Corrections for data redundancy on an MSA
  • Estimate information measures such as Shannon entropy, mutual information, etc...
  • Calculate corrected mutual information between residues

Contents

Counting residues

MIToS Information module defines a multidimensional ContingencyTable type and two types wrapping it, Frequencies and Probabilities, to store occurrences or probabilities. The ContingencyTable type stores the contingency matrix, its marginal values and total. These types are parametric, taking three ordered parameters:

  • T : The type used for storing the counts or probabilities, e.g. Float64. It's possible to use BigFloat if more precision is needed.
  • N : It's the dimension of the table and should be an Int.
  • A : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.
Note

ContingencyTable can be used for storing probabilities or counts. The wrapper types Probabilities and Frequencies are mainly intended to dispatch in methods that need to know if the matrix has probabilities or counts, e.g. shannon_entropy. In general, the use of ContingencyTable is recommended over the use of Probabilities and Frequencies.

In this way, a matrix for storing pairwise probabilities of residues (without gaps) can be initialized using:

using MIToS.Information
 
 Pij = ContingencyTable(Float64, Val{2}, UngappedAlphabet())
MIToS.Information.ContingencyTable{Float64, 2, MIToS.MSA.UngappedAlphabet} : 
 
@@ -437,7 +437,7 @@
 msa = read_file(
     "https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz",
     Stockholm,
-)
AnnotatedMultipleSequenceAlignment with 835 annotations : 811×113 Named Matrix{MIToS.MSA.Residue}
+)
AnnotatedMultipleSequenceAlignment with 836 annotations : 811×113 Named Matrix{MIToS.MSA.Residue}
                  Seq ╲ Col │  53   54   55   56   57  …  428  429  430  431  432
 ───────────────────────────┼────────────────────────────────────────────────────
 A0A370X5B3_9GAMM/1736-1853 │   -    -    -    -    -  …    Y    A    Y    R    L
@@ -519,14 +519,14 @@
   - samples     Int       100     Number of samples for Z-score
   - fixedgaps   Bool      true    Fix gaps positions for the random samples
   - alphabet    ResidueAlphabet UngappedAlphabet()  Residue alphabet to be used

This function returns:

  - Z score
-  - MI or MIp
source

The second, implemented in the BLMI function, has the same corrections that the above algorithm, but use BLOSUM62 pseudo frequencies. This function is slower than buslje09 (at the same number of samples), but gives better performance (for structural contact prediction) when the MSA has less than 400 clusters after a Hobohm I at 62% identity.

MIToS.Information.BLMIFunction

BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described on Busjle et al. 2009 but using using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.

Keyword argument, type, default value and descriptions:

  - beta        Float64   8.512   β for BLOSUM62 pseudo frequencies
+  - MI or MIp
source

The second, implemented in the BLMI function, has the same corrections as the above algorithm, but uses BLOSUM62 pseudo frequencies. This function is slower than buslje09 (at the same number of samples), but gives better performance (for structural contact prediction) when the MSA has fewer than 400 clusters after a Hobohm I clustering at 62% identity.

MIToS.Information.BLMIFunction

BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described in Buslje et al. 2009 but using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.

Keyword argument, type, default value and descriptions:

  - beta        Float64   8.512   β for BLOSUM62 pseudo frequencies
   - lambda      Float64   0.0     Low count value
   - threshold             62      Percent identity threshold for sequence clustering (Hobohm I)
   - maxgap      Float64   0.5     Maximum fraction of gaps in positions included in calculation
   - apc         Bool      true    Use APC correction (MIp)
   - samples     Int       50      Number of samples for Z-score
   - fixedgaps   Bool      true    Fix gaps positions for the random samples

This function returns:

  - Z score (ZBLMI)
-  - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)

References

source

Example: Estimating corrected MI from an MSA

using MIToS.MSA
+  - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)

References

source

Example: Estimating corrected MI from an MSA

using MIToS.MSA
 using MIToS.Information
 
 msa = read_file(
@@ -576,11 +576,11 @@
 432         │  -0.00724753     0.0093646  …   -0.00883419           NaN

Visualize Mutual Information

You can use the function of the Plots package to visualize the Mutual Information (MI) network between residues. As an example, we are going to visualize the MI between residues of the Pfam domain PF18883. The heatmap is the simplest way to visualize the values of the Mutual Information matrix.

using Plots
 gr()
 
-heatmap(ZMIp, yflip = true)

ZMIp is a Z score of the corrected MIp against its distribution on a random MSA (shuffling the residues in each sequence), so pairs with highest values are more likely to co-evolve. Here, we are going to use the top 1% pairs of MSA columns.

using PairwiseListMatrices # to use getlist
+heatmap(ZMIp, yflip = true)

ZMIp is a Z score of the corrected MIp against its distribution on a random MSA (shuffling the residues in each sequence), so pairs with highest values are more likely to coevolve. Here, we are going to use the top 1% pairs of MSA columns.

using PairwiseListMatrices # to use getlist
 using Statistics # to use quantile
 
 threshold = quantile(getlist(ZMIp), 0.99)
6.967400960553533
ZMIp[ZMIp.<threshold] .= NaN
 heatmap(ZMIp, yflip = true)

We are going to calculate the cMI (cumulative mutual information) value of each node. Where cMI is a mutual information score per position that characterizes the extent of mutual information "interactions" in its neighbourhood. This score is calculated as the sum of MI values above a certain threshold for every amino acid pair where the particular residue appears. This value defines to what degree a given amino acid takes part in a mutual information network and we are going to indicate it using the node color. To calculate cMI we are going to use the cumulative function:

cMI = cumulative(ZMIp, threshold)
1×107 Named Matrix{Float64}
 Function ╲ Col2 │      57       58       59  …      430      431      432
 ────────────────┼────────────────────────────────────────────────────────
-cumulative      │     0.0      0.0      0.0  …      0.0      0.0      0.0
+cumulative │ 0.0 0.0 0.0 … 0.0 0.0 0.0 diff --git a/dev/Information_API/index.html b/dev/Information_API/index.html index b180fb41..82912397 100644 --- a/dev/Information_API/index.html +++ b/dev/Information_API/index.html @@ -1,14 +1,14 @@ -Information · MIToS

Information

MIToS.InformationModule

The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from this counts. Also, includes methods for applying corrections to that tables, e.g. pseudocounts and pseudo frequencies. Finally, Information allows to use this probabilities and counts to estimate information measures and other frequency based values.

Features

  • Estimate multi dimensional frequencies and probabilities tables from sequences, MSAs, etc...
  • Correction for small number of observations
  • Correction for data redundancy on a MSA
  • Estimate information measures
  • Calculate corrected mutual information between residues
using MIToS.Information
source

Contents

Types

MIToS.Information.AdditiveSmoothingType

Additive Smoothing or fixed pseudocount λ for ResidueCount (in order to estimate probabilities when the number of samples is low).

Common values of λ are:

  • 0 : No cell frequency prior, gives you the maximum likelihood estimator.
  • 0.05 is the optimum value for λ found in Buslje et al. 2009, similar results was obtained for λ in the range [0.025, 0.075].
  • 1 / p : Perks prior (Perks, 1947) where p the number of parameters (i.e. residues, pairs of residues) to estimate. If p is the number of residues (20 without counting gaps), this gives you 0.05.
  • sqrt(n) / p : Minimax prior (Trybula, 1958) where n is the number of samples and p the number of parameters to estimate. If the number of samples n is 400 (minimum number of sequence clusters for achieve good performance in Buslje et al. 2009) for estimating 400 parameters (pairs of residues without counting gaps) this gives you 0.05.
  • 0.5 : Jeffreys prior (Jeffreys, 1946).
  • 1 : Bayes-Laplace uniform prior, aka. Laplace smoothing.

References

source
MIToS.Information.BLOSUM_PseudofrequenciesType

BLOSUM_Pseudofrequencies type. It takes to arguments/fields:

  • α : Usually the number of sequences or sequence clusters in the MSA.
  • β : The weight of the pseudofrequencies, a value close to 8.512 when α is the number of sequence clusters.
source
MIToS.Information.ContingencyTableType

A ContingencyTable is a multidimensional array. It stores the contingency matrix, its marginal values and total. The type also has an internal and private temporal array and an alphabet object. It's a parametric type, taking three ordered parameters:

  • T : The element type of the multidimensional array.
  • N : It's the dimension of the array and should be an Int.
  • A : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.

A ContingencyTable can be created from an alphabet if all the parameters are given. Otherwise, you need to give a type, a number (Val) and an alphabet. You can also create a ContingencyTable using a matrix and a alphabet. For example:

ContingencyTable{Float64,2,UngappedAlphabet}(UngappedAlphabet())
+Information · MIToS

Information

MIToS.InformationModule

The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from these counts. It also includes methods for applying corrections to those tables, e.g. pseudocounts and pseudo frequencies. Finally, Information allows using these probabilities and counts to estimate information measures and other frequency-based values.

Features

  • Estimate multi dimensional frequencies and probabilities tables from sequences, MSAs, etc...
  • Correction for small number of observations
  • Correction for data redundancy on a MSA
  • Estimate information measures
  • Calculate corrected mutual information between residues
using MIToS.Information
source

Contents

Types

MIToS.Information.AdditiveSmoothingType

Additive Smoothing or fixed pseudocount λ for ResidueCount (in order to estimate probabilities when the number of samples is low).

Common values of λ are:

  • 0 : No cell frequency prior, gives you the maximum likelihood estimator.
  • 0.05 is the optimum value for λ found in Buslje et al. 2009; similar results were obtained for λ in the range [0.025, 0.075].
  • 1 / p : Perks prior (Perks, 1947) where p is the number of parameters (i.e. residues, pairs of residues) to estimate. If p is the number of residues (20 without counting gaps), this gives you 0.05.
  • sqrt(n) / p : Minimax prior (Trybula, 1958) where n is the number of samples and p the number of parameters to estimate. If the number of samples n is 400 (minimum number of sequence clusters to achieve good performance in Buslje et al. 2009) for estimating 400 parameters (pairs of residues without counting gaps) this gives you 0.05.
  • 0.5 : Jeffreys prior (Jeffreys, 1946).
  • 1 : Bayes-Laplace uniform prior, aka. Laplace smoothing.

References

source
MIToS.Information.BLOSUM_PseudofrequenciesType

BLOSUM_Pseudofrequencies type. It takes two arguments/fields:

  • α : Usually the number of sequences or sequence clusters in the MSA.
  • β : The weight of the pseudofrequencies, a value close to 8.512 when α is the number of sequence clusters.
source
MIToS.Information.ContingencyTableType

A ContingencyTable is a multidimensional array. It stores the contingency matrix, its marginal values and total. The type also has an internal and private temporal array and an alphabet object. It's a parametric type, taking three ordered parameters:

  • T : The element type of the multidimensional array.
  • N : It's the dimension of the array and should be an Int.
  • A : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.

A ContingencyTable can be created from an alphabet if all the parameters are given. Otherwise, you need to give a type, a number (Val) and an alphabet. You can also create a ContingencyTable using a matrix and an alphabet. For example:

ContingencyTable{Float64,2,UngappedAlphabet}(UngappedAlphabet())
 ContingencyTable(Float64, Val{2}, UngappedAlphabet())
-ContingencyTable(zeros(Float64, 20, 20), UngappedAlphabet())
source
MIToS.Information.ProbabilitiesType

A Probabilities object wraps a ContingencyTable storing probabilities. It doesn't perform any check. If the total isn't one, you must use normalize or normalize!on the ContingencyTable before wrapping it to make the sum of the probabilities equal to one.

source

Constants

MIToS.Information.BLOSUM62_PijConstant

Table with conditional probabilities of residues based on BLOSUM62. The normalization is done row based. The firts row contains the P(aa|A) and so one.

source

Macros

Methods and functions

Base.count!Method

It populates a ContingencyTable (first argument) using the frequencies in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as second and third positional arguments respectively. You can use NoPseudofrequencies() and NoClustering() to avoid the use of sequence weighting and pseudocounts, respectively.

DEPRECATED: Use frequencies! instead. Note that frequencies! defines the weigths and pseudocounts using keyword arguments instead of positional arguments.

source
Base.countMethod

It returns a ContingencyTable wrapped in a Frequencies type with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()) and the pseudocounts (default to NoPseudocount()) to be used during the estimation of the frequencies.

DEPRECATED: Use frequencies instead. Note that frequencies defines the alphabet, weigths and pseudocounts using keyword arguments instead of positional arguments.

source
MIToS.Information.BLMIMethod

BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described on Busjle et al. 2009 but using using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.

Keyword argument, type, default value and descriptions:

  - beta        Float64   8.512   β for BLOSUM62 pseudo frequencies
+ContingencyTable(zeros(Float64, 20, 20), UngappedAlphabet())
source
MIToS.Information.ProbabilitiesType

A Probabilities object wraps a ContingencyTable storing probabilities. It doesn't perform any check. If the total isn't one, you must use normalize or normalize! on the ContingencyTable before wrapping it to make the sum of the probabilities equal to one.

source

Constants

MIToS.Information.BLOSUM62_PijConstant

Table with conditional probabilities of residues based on BLOSUM62. The normalization is done row based. The first row contains the P(aa|A) and so on.

source

Macros

Methods and functions

Base.count!Method

It populates a ContingencyTable (first argument) using the frequencies in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as second and third positional arguments respectively. You can use NoPseudofrequencies() and NoClustering() to avoid the use of sequence weighting and pseudocounts, respectively.

DEPRECATED: Use frequencies! instead. Note that frequencies! defines the weights and pseudocounts using keyword arguments instead of positional arguments.

source
Base.countMethod

It returns a ContingencyTable wrapped in a Frequencies type with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()) and the pseudocounts (default to NoPseudocount()) to be used during the estimation of the frequencies.

DEPRECATED: Use frequencies instead. Note that frequencies defines the alphabet, weights and pseudocounts using keyword arguments instead of positional arguments.

source
MIToS.Information.BLMIMethod

BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described in Buslje et al. 2009 but using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.

Keyword argument, type, default value and descriptions:

  - beta        Float64   8.512   β for BLOSUM62 pseudo frequencies
   - lambda      Float64   0.0     Low count value
   - threshold             62      Percent identity threshold for sequence clustering (Hobohm I)
   - maxgap      Float64   0.5     Maximum fraction of gaps in positions included in calculation
   - apc         Bool      true    Use APC correction (MIp)
   - samples     Int       50      Number of samples for Z-score
   - fixedgaps   Bool      true    Fix gaps positions for the random samples

This function returns:

  - Z score (ZBLMI)
-  - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)

References

source
MIToS.Information.apply_pseudofrequencies!Method

apply_pseudofrequencies!{T}(Pab::ContingencyTable{T,2,UngappedAlphabet}, pseudofrequencies::BLOSUM_Pseudofrequencies)

When a BLOSUM_Pseudofrequencies(α,β) is used, this function applies pseudofrequencies Gab over Pab, as a weighted mean of both. It uses the conditional probability matrix BLOSUM62_Pij and the real frequencies/probabilities Pab to estimate the pseudofrequencies Gab. α is the weight of the real frequencies Pab and β the weight of the pseudofrequencies.

$G_{ab} = \sum_{cd} P_{cd} \cdot BLOSUM62(a|c) \cdot BLOSUM62(b|d)$, and then $P_{ab} = (\alpha \cdot P_{ab} + \beta \cdot G_{ab}) / (\alpha + \beta)$

source
MIToS.Information.buslje09Method

buslje09 takes an MSA and calculates a Z score and a corrected MI/MIp as described in Buslje et al. 2009.

keyword argument, type, default value and descriptions:

  - lambda      Float64   0.05    Low count value
+  - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)

References

source
MIToS.Information.apply_pseudofrequencies!Method

apply_pseudofrequencies!{T}(Pab::ContingencyTable{T,2,UngappedAlphabet}, pseudofrequencies::BLOSUM_Pseudofrequencies)

When a BLOSUM_Pseudofrequencies(α,β) is used, this function applies pseudofrequencies Gab over Pab, as a weighted mean of both. It uses the conditional probability matrix BLOSUM62_Pij and the real frequencies/probabilities Pab to estimate the pseudofrequencies Gab. α is the weight of the real frequencies Pab and β the weight of the pseudofrequencies.

$G_{ab} = \sum_{cd} P_{cd} \cdot BLOSUM62(a|c) \cdot BLOSUM62(b|d)$, and then $P_{ab} = (\alpha \cdot P_{ab} + \beta \cdot G_{ab}) / (\alpha + \beta)$

source
MIToS.Information.buslje09Method

buslje09 takes an MSA and calculates a Z score and a corrected MI/MIp as described in Buslje et al. 2009.

keyword argument, type, default value and descriptions:

  - lambda      Float64   0.05    Low count value
   - clustering  Bool      true    Sequence clustering (Hobohm I)
   - threshold             62      Percent identity threshold for clustering
   - maxgap      Float64   0.5     Maximum fraction of gaps in positions included in calculation
@@ -16,11 +16,11 @@
   - samples     Int       100     Number of samples for Z-score
   - fixedgaps   Bool      true    Fix gaps positions for the random samples
   - alphabet    ResidueAlphabet UngappedAlphabet()  Residue alphabet to be used

This function returns:

  - Z score
-  - MI or MIp
source
MIToS.Information.cumulativeMethod

cumulative allows to calculate cumulative scores (i.e. cMI) as defined in Marino Buslje et al. 2010:

"We calculated a cumulative mutual information score (cMI) for each residue as the sum of MI values above a certain threshold for every amino acid pair where the particular residue appears. This value defines to what degree a given amino acid takes part in a mutual information network."

References

source
MIToS.Information.delete_dimensions!Method

delete_dimensions!(out::ContingencyTable, in::ContingencyTable, dimensions::Int...)

This function fills a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.

source
MIToS.Information.delete_dimensionsMethod

delete_dimensions(in::ContingencyTable, dimensions::Int...)

This function creates a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.

source
MIToS.Information.frequencies!Method
frequencies!(table, seqs...; weights::WeightTypes, pseudocounts::Pseudocount)

It populates a ContingencyTable or Frequencies table (first argument) using the frequencies in the given sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as keyword arguments. Those arguments default to NoClustering() and NoPseudocount() respectively, to avoid the use of sequence weighting and pseudocounts.

source
MIToS.Information.frequenciesMethod
frequencies(seqs...; alphabet=UngappedAlphabet(), weights=NoClustering(), pseudocounts=NoPseudocount())

This function returns a Frequencies object wrapping a ContingencyTable with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table, a clustering result and the pseudocounts to be used during the estimation of the frequencies.

source
MIToS.Information.cumulativeMethod

cumulative allows to calculate cumulative scores (i.e. cMI) as defined in Marino Buslje et al. 2010:

"We calculated a cumulative mutual information score (cMI) for each residue as the sum of MI values above a certain threshold for every amino acid pair where the particular residue appears. This value defines to what degree a given amino acid takes part in a mutual information network."

References

source
MIToS.Information.delete_dimensions!Method

delete_dimensions!(out::ContingencyTable, in::ContingencyTable, dimensions::Int...)

This function fills a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.

source
MIToS.Information.delete_dimensionsMethod

delete_dimensions(in::ContingencyTable, dimensions::Int...)

This function creates a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.

source
MIToS.Information.frequencies!Method
frequencies!(table, seqs...; weights::WeightTypes, pseudocounts::Pseudocount)

It populates a ContingencyTable or Frequencies table (first argument) using the frequencies in the given sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as keyword arguments. Those arguments default to NoClustering() and NoPseudocount() respectively, to avoid the use of sequence weighting and pseudocounts.

source
MIToS.Information.frequenciesMethod
frequencies(seqs...; alphabet=UngappedAlphabet(), weights=NoClustering(), pseudocounts=NoPseudocount())

This function returns a Frequencies object wrapping a ContingencyTable with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table, a clustering result and the pseudocounts to be used during the estimation of the frequencies.

source
MIToS.Information.gaussdcaMethod

Wrapper function to GaussDCA.gDCA. You need to install GaussDCA:

using Pkg
 
-Pkg.add(PackageSpec(url = "https://github.com/carlobaldassi/GaussDCA.jl", rev = "master"))

Look into GaussDCA.jl README for further information. If you use this wrapper, please cite the GaussDCA publication and the package's doi.

It's possible to indicate the path to the julia binary where GaussDCA is installed. However, it's recommended to use the same version where MIToS is installed. That is because this function uses serialize/deserialize to transfer data between the processes.

GaussDCA Publication: Baldassi, Carlo, Marco Zamparo, Christoph Feinauer, Andrea Procaccini, Riccardo Zecchina, Martin Weigt, and Andrea Pagnani. "Fast and accurate multivariate Gaussian modeling of protein families: predicting residue contacts and protein-interaction partners." PloS one 9, no. 3 (2014): e92721.

source
MIToS.Information.kullback_leiblerMethod
kullback_leibler(msa::AbstractArray{Residue}; background::Union{Array{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, base::Number=ℯ, kargs...)

It calculates the Kullback-Leibler (KL) divergence from a multiple sequence alignment (MSA). You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

The other keyword arguments are passed to the mapfreq function.

source
MIToS.Information.kullback_leiblerMethod
kullback_leibler(probabilities::Probabilities{T,N,A}, background::Union{
+Pkg.add(PackageSpec(url = "https://github.com/carlobaldassi/GaussDCA.jl", rev = "master"))

Look into GaussDCA.jl README for further information. If you use this wrapper, please cite the GaussDCA publication and the package's doi.

It's possible to indicate the path to the julia binary where GaussDCA is installed. However, it's recommended to use the same version where MIToS is installed. That is because this function uses serialize/deserialize to transfer data between the processes.

GaussDCA Publication: Baldassi, Carlo, Marco Zamparo, Christoph Feinauer, Andrea Procaccini, Riccardo Zecchina, Martin Weigt, and Andrea Pagnani. "Fast and accurate multivariate Gaussian modeling of protein families: predicting residue contacts and protein-interaction partners." PloS one 9, no. 3 (2014): e92721.

source
MIToS.Information.kullback_leiblerMethod
kullback_leibler(msa::AbstractArray{Residue}; background::Union{Array{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, base::Number=ℯ, kargs...)

It calculates the Kullback-Leibler (KL) divergence from a multiple sequence alignment (MSA). You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

The other keyword arguments are passed to the mapfreq function.

source
MIToS.Information.kullback_leiblerMethod
kullback_leibler(probabilities::Probabilities{T,N,A}, background::Union{
     AbstractArray{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, 
-    base::Number=ℯ)

It calculates the Kullback-Leibler (KL) divergence from a table of Probabilities. You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
MIToS.Information.mapcolfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each column from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.
  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.
  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.
source
MIToS.Information.mapcolpairfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of columns from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.

  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.

  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.

  • usediagonal (default: true): If true, the function will be also applied to the diagonal elements.

  • diagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.

source
MIToS.Information.mapfreqMethod
mapfreq(f, msa; rank = 1, dims = 2, alphabet = UngappedAlphabet(), 
+    base::Number=ℯ)

It calculates the Kullback-Leibler (KL) divergence from a table of Probabilities. You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
MIToS.Information.mapcolfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each column from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.
  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.
  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.
source
MIToS.Information.mapcolpairfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of columns from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.

  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.

  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.

  • usediagonal (default: true): If true, the function will be also applied to the diagonal elements.

  • diagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.

source
MIToS.Information.mapfreqMethod
mapfreq(f, msa; rank = 1, dims = 2, alphabet = UngappedAlphabet(), 
     weights = NoClustering(), pseudocounts = NoPseudocount(), 
     pseudofrequencies = NoPseudofrequencies(), probabilities = true, 
     usediagonal = false, diagonalvalue = NaN, kargs...)

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (depending on the probabilities keyword argument) calculated on sequences (dims = 1) or columns (dims = 2, the default) of an msa (second argument). If rank = 1, the default, the function is applied to each sequence or column. If rank = 2, the function is applied to each pair of sequences or columns. In that case, we can set the usediagonal keyword argument to true to apply the function to pairs of the same sequence or column. The diagonalvalue keyword argument is used to set the value of the diagonal elements if usediagonal is false. By default, the function is not applied to the diagonal elements (i.e. usediagonal = false) and the diagonalvalue is set to NaN. The alphabet keyword argument can be used to set the alphabet used to construct the contingency table. The function also accepts the following keyword arguments:

  • weights (default: NoClustering()): Weights to be used for table counting.
  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.
  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.

Note that the pseudofrequencies argument is only valid if probabilities = true. All the other keyword arguments are passed to the function f.

julia> using Random, MIToS.MSA, MIToS.Information
@@ -50,8 +50,8 @@
 1              │ 1.0
 2              │ 1.0
 3              │ 1.0
-
source
MIToS.Information.mapseqfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each sequence from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.
  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.
  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.
source
MIToS.Information.mapseqpairfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of sequences from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.

  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.

  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.

  • usediagonal (default: true): If true, the function will be also applied to the diagonal elements.

  • diagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.

source
MIToS.Information.marginal_entropyMethod
marginal_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; margin::Int=1, 
-    base::Number=ℯ)

It calculates marginal entropy (H) from a table of Frequencies or Probabilities. It takes two keyword arguments: margin and base. The first one is used to indicate the margin used to calculate the entropy, e.g. it estimates the entropy H(X) if margin is 1, H(Y) for 2, etc. The default value of margin is 1. The second keyword argument is used to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
MIToS.Information.mutual_informationMethod
mutual_information(msa::AbstractArray{Residue}; base::Number=ℯ, kargs...)

It calculates Mutual Information (MI) from a multiple sequence alignment (MSA). The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. The minimum value for rank is 2 (the default value). By default, it uses counts/frequencies to calculate the MI, as it's faster. You can use the keyword argument probabilities = true to calculate the MI from probabilities.

julia> using Random, MIToS.MSA, MIToS.Information
+
source
MIToS.Information.mapseqfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each sequence from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.
  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.
  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.
source
MIToS.Information.mapseqpairfreq!Method

It efficiently maps a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of sequences from the msa (second argument).

  • weights (default: NoClustering()): Weights to be used for table counting.

  • pseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.

  • pseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.

  • usediagonal (default: true): If true, the function will be also applied to the diagonal elements.

  • diagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.

source
MIToS.Information.marginal_entropyMethod
marginal_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; margin::Int=1, 
+    base::Number=ℯ)

It calculates marginal entropy (H) from a table of Frequencies or Probabilities. It takes two keyword arguments: margin and base. The first one is used to indicate the margin used to calculate the entropy, e.g. it estimates the entropy H(X) if margin is 1, H(Y) for 2, etc. The default value of margin is 1. The second keyword argument is used to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
MIToS.Information.mutual_informationMethod
mutual_information(msa::AbstractArray{Residue}; base::Number=ℯ, kargs...)

It calculates Mutual Information (MI) from a multiple sequence alignment (MSA). The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. The minimum value for rank is 2 (the default value). By default, it uses counts/frequencies to calculate the MI, as it's faster. You can use the keyword argument probabilities = true to calculate the MI from probabilities.

julia> using Random, MIToS.MSA, MIToS.Information
 
 julia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)
 3×4 Matrix{Residue}:
@@ -63,7 +63,7 @@
 
 julia> mi[1, 2]
 1.0986122886681098
-
source
MIToS.Information.mutual_informationMethod
mutual_information(table::Union{Frequencies{T,2,A},Probabilities{T,2,A}}; base::Number=ℯ)

It calculates Mutual Information (MI) from a table of Frequencies or Probabilities. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. Note that calculating MI from Frequencies is faster than from Probabilities.

source
MIToS.Information.mutual_informationMethod
mutual_information(table::Union{Frequencies{T,3,A},Probabilities{T,3,A}}; base::Number=ℯ)

It calculates Mutual Information (MI) from a table of Frequencies or Probabilities with three dimensions. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

julia> using Random, MIToS.MSA, MIToS.Information
+
source
MIToS.Information.mutual_informationMethod
mutual_information(table::Union{Frequencies{T,2,A},Probabilities{T,2,A}}; base::Number=ℯ)

It calculates Mutual Information (MI) from a table of Frequencies or Probabilities. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. Note that calculating MI from Frequencies is faster than from Probabilities.

source
MIToS.Information.mutual_informationMethod
mutual_information(table::Union{Frequencies{T,3,A},Probabilities{T,3,A}}; base::Number=ℯ)

It calculates Mutual Information (MI) from a table of Frequencies or Probabilities with three dimensions. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

julia> using Random, MIToS.MSA, MIToS.Information
 
 julia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)
 3×4 Matrix{Residue}:
@@ -75,7 +75,20 @@
 
 julia> mutual_information(Nxyz)
 1.0986122886681093
-
source
MIToS.Information.normalized_mutual_informationMethod
normalized_mutual_information(msa::AbstractArray{Residue}; kargs...)

This function calculates the Normalized Mutual Information (nMI) from a multiple sequence alignment using the mapfreq function—all the keyword arguments are passed to mapfreq. The mutual information score is normalized by the joint entropy of the two variables: $nMI(X, Y) = MI(X, Y) / H(X, Y)$ By default, it uses counts/frequencies to estimate the nMI, as it's faster than using probabilities.

source
MIToS.Information.normalized_mutual_informationMethod

It calculates a Normalized Mutual Information (nMI) from a table of Frequencies or Probabilities. The mutual information score is normalized by the joint entropy of the two variables: $nMI(X, Y) = MI(X, Y) / H(X, Y)$

source
MIToS.Information.pairwisegapfractionMethod

It takes a MSA or a file and a FileFormat as first arguments. It calculates the percentage of gaps on columns pairs (union and intersection) using sequence clustering (Hobohm I).

Argument, type, default value and descriptions:

    - clustering  Bool      true    Sequence clustering (Hobohm I)
+
source
MIToS.Information.normalized_mutual_informationMethod
normalized_mutual_information(msa::AbstractArray{Residue}; kargs...)

This function calculates the Normalized Mutual Information (nMI) from a multiple sequence alignment using the mapfreq function—all the keyword arguments are passed to mapfreq. The mutual information score is normalized by the joint entropy of the two variables: $nMI(X, Y) = MI(X, Y) / H(X, Y)$ By default, it uses counts/frequencies to estimate the nMI, as it's faster than using probabilities.

source
MIToS.Information.normalized_mutual_informationMethod

It calculates a Normalized Mutual Information (nMI) from a table of Frequencies or Probabilities. The mutual information score is normalized by the joint entropy of the two variables: $nMI(X, Y) = MI(X, Y) / H(X, Y)$

source
MIToS.Information.pairwisegapfractionMethod

It takes a MSA or a file and a FileFormat as first arguments. It calculates the percentage of gaps on columns pairs (union and intersection) using sequence clustering (Hobohm I).

Argument, type, default value and descriptions:

    - clustering  Bool      true    Sequence clustering (Hobohm I)
     - threshold             62      Percent identity threshold for sequence clustering (Hobohm I)

This function returns:

    - pairwise gap union as percentage
-    - pairwise gap intersection as percentage
source
MIToS.Information.probabilities!Method

It populates a ContingencyTable (first argument) using the probabilities in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights, pseudocounts and pseudofrequencies as second, third and fourth positional arguments respectively. You can use NoClustering(), NoPseudocount() and NoPseudofrequencies() to avoid the use of sequence weighting, pseudocounts and pseudofrequencies, respectively.

source
MIToS.Information.probabilitiesMethod

It returns a ContingencyTable wrapped in a Probabilities type with the probabilities of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights, pseudocounts and pseudofrequencies to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()), the pseudocounts (default to NoPseudocount()) and the pseudofrequencies (default to NoPseudofrequencies()) to be used during the estimation of the probabilities.

source
MIToS.Information.shannon_entropyMethod
shannon_entropy(msa::AbstractArray{Residue}; base::Number=ℯ, 
-    probabilities::Bool=false, usediagonal::Bool=true, kargs...)

It calculates the Shannon entropy (H) on an MSA. You can use the keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. It uses mapfreq under the hood, so it takes the same keyword arguments. By default, it measures the entropy of each column in the MSA. You can use dims = 1 to measure the entropy of each sequence. You can also set rank = 2 to measure the joint entropy of each pair of sequences or columns. This function sets by default the probabilities keyword argument to false because it's faster to calculate the entropy from counts/frequencies. It also sets usediagonal = true to also calculate the entropy of the individual variables (sequences or columns).

```jldoctest julia> using MIToS.MSA, MIToS.Information

julia> msa = Residue['C' 'G'; 'C' 'L'; 'C' 'I'] 3×2 Matrix{Residue}: C G C L C I

julia> shannonentropy(msa) 1×2 Named Matrix{Float64} Function ╲ Col │ 1 2 ────────────────┼───────────────── shannonentropy │ 0.0 1.09861

source
MIToS.Information.shannon_entropyMethod
shannon_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; base::Number=ℯ)

It calculates the Shannon entropy (H) from a table of Frequencies or Probabilities. Use the optional keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
+ - pairwise gap intersection as percentage
source
MIToS.Information.probabilities!Method

It populates a ContingencyTable (first argument) using the probabilities in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights, pseudocounts and pseudofrequencies as second, third and fourth positional arguments respectively. You can use NoClustering(), NoPseudocount() and NoPseudofrequencies() to avoid the use of sequence weighting, pseudocounts and pseudofrequencies, respectively.

source
MIToS.Information.probabilitiesMethod

It returns a ContingencyTable wrapped in a Probabilities type with the probabilities of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights, pseudocounts and pseudofrequencies to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()), the pseudocounts (default to NoPseudocount()) and the pseudofrequencies (default to NoPseudofrequencies()) to be used during the estimation of the probabilities.

source
MIToS.Information.shannon_entropyMethod
shannon_entropy(msa::AbstractArray{Residue}; base::Number=ℯ, 
+    probabilities::Bool=false, usediagonal::Bool=true, kargs...)

It calculates the Shannon entropy (H) on an MSA. You can use the keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. It uses mapfreq under the hood, so it takes the same keyword arguments. By default, it measures the entropy of each column in the MSA. You can use dims = 1 to measure the entropy of each sequence. You can also set rank = 2 to measure the joint entropy of each pair of sequences or columns. This function sets by default the probabilities keyword argument to false because it's faster to calculate the entropy from counts/frequencies. It also sets usediagonal = true to also calculate the entropy of the individual variables (sequences or columns).

julia> using MIToS.MSA, MIToS.Information
+
+julia> msa = Residue['C' 'G'; 'C' 'L'; 'C' 'I']
+3×2 Matrix{Residue}:
+ C  G
+ C  L
+ C  I
+
+julia> shannon_entropy(msa)
+1×2 Named Matrix{Float64}
+ Function ╲ Col │       1        2
+────────────────┼─────────────────
+shannon_entropy │     0.0  1.09861
+
source
MIToS.Information.shannon_entropyMethod
shannon_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; base::Number=ℯ)

It calculates the Shannon entropy (H) from a table of Frequencies or Probabilities. Use the optional keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.

source
diff --git a/dev/Installation/index.html b/dev/Installation/index.html index 85704c7c..909f82ca 100644 --- a/dev/Installation/index.html +++ b/dev/Installation/index.html @@ -5,4 +5,4 @@ Pkg.add("Plots")

Once it is installed, you need to load Plots in order to use the plot function. There is more information about it in the Plots documentation.

using Plots

To generate graph (network), arc and chord (circo) plots, you also need to install and load GraphRecipes.

using Pkg
 Pkg.add("GraphRecipes")
 
-using GraphRecipes

You can look for examples in the GraphRecipes documentation.

+using GraphRecipes

You can look for examples in the GraphRecipes documentation.

diff --git a/dev/MSA/index.html b/dev/MSA/index.html index cc6535b8..4f54265b 100644 --- a/dev/MSA/index.html +++ b/dev/MSA/index.html @@ -4,7 +4,7 @@ read_file( "https://raw.githubusercontent.com/diegozea/MIToS.jl/master/test/data/PF09645_full.stockholm", Stockholm, -)
AnnotatedMultipleSequenceAlignment with 10 annotations : 4×110 Named Matrix{MIToS.MSA.Residue}
+)
AnnotatedMultipleSequenceAlignment with 9 annotations : 4×110 Named Matrix{MIToS.MSA.Residue}
          Seq ╲ Col │   6    7    8    9   10  …  111  112  113  114  115
 ───────────────────┼────────────────────────────────────────────────────
 C3N734_SULIY/1-95  │   -    -    -    N    S  …    -    -    -    -    -
@@ -24,7 +24,7 @@
 )
 
 printmodifications(msa)
-------------------
-2024-07-30T12:15:48.848
+2024-07-30T12:43:11.783
 
 deletefullgaps!  :  Deletes 10 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
 filtercolumns! : 10 columns have been deleted.

Writing MSA files

Julia REPL shows MSAs as Matrices. If you want to print them in another format, you should use the print_file function with an MSA object as first argument and the FileFormat FASTA, Stockholm, PIR or Raw as second argument.

using MIToS.MSA
@@ -71,8 +71,8 @@
 #=GF CC	not present in all SecA2-SecY2 systems. This family of Asp4 is
 #=GF CC	found in Firmicutes [1].
 #=GF SQ	6
-#=GF MIToS_2024-07-30T12:15:49.267	deletefullgaps!  :  Deletes 5 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
-#=GF MIToS_2024-07-30T12:15:49.267	filtercolumns! : 5 columns have been deleted.
+#=GF MIToS_2024-07-30T12:43:12.186	deletefullgaps!  :  Deletes 5 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
+#=GF MIToS_2024-07-30T12:43:12.186	filtercolumns! : 5 columns have been deleted.
 #=GS A3CM62_STRSV/3-57	AC	A3CM62.1
 #=GS A0A139NMD7_9STRE/5-59	AC	A0A139NMD7.1
 #=GS J0UVX5_STREE/3-41	AC	J0UVX5.1
@@ -144,13 +144,13 @@
     Stockholm,
     generatemapping = true,
     useidcoordinates = true,
-)
AnnotatedMultipleSequenceAlignment with 5 annotations : 1×7 Named Matrix{MIToS.MSA.Residue}
+)
AnnotatedMultipleSequenceAlignment with 4 annotations : 1×7 Named Matrix{MIToS.MSA.Residue}
       Seq ╲ Col │ 12  13  14  15  16  17  18
 ────────────────┼───────────────────────────
 PROT_SPECI/3-15 │  A   L   I   G   N   E   D

MIToS also keeps the column number of the input MSA and its total number of columns. All this data is stored in the MSA annotations using the SeqMap, ColMap and NCol feature names.

annotations(msa)
#=GF NCol	18
 #=GF ColMap	12,13,14,15,16,17,18
-#=GF MIToS_2024-07-30T12:15:52.871	deletefullgaps!  :  Deletes 11 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
-#=GF MIToS_2024-07-30T12:15:52.872	filtercolumns! : 11 columns have been deleted.
+#=GF MIToS_2024-07-30T12:43:15.494	deletefullgaps!  :  Deletes 11 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
+#=GF MIToS_2024-07-30T12:43:15.494	filtercolumns! : 11 columns have been deleted.
 #=GS PROT_SPECI/3-15	SeqMap	9,10,11,12,13,14,15
 

To have an easy access to mapping data, MIToS provides the getsequencemapping and getcolumnmapping functions.

getsequencemapping(msa, "PROT_SPECI/3-15")
7-element Vector{Int64}:
   9
@@ -273,8 +273,8 @@
 ───────────────────┼────────────────────────────────────────────────────
 H2C869_9CREN/7-104 │   -    -    L    N    D  …    -    -    -    -    -
annotations(secondsequence)
#=GF NCol	120
 #=GF ColMap	6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115
-#=GF MIToS_2024-07-30T12:15:53.093	deletefullgaps!  :  Deletes 10 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
-#=GF MIToS_2024-07-30T12:15:53.093	filtercolumns! : 10 columns have been deleted.
+#=GF MIToS_2024-07-30T12:43:15.704	deletefullgaps!  :  Deletes 10 columns full of gaps (inserts generate full gap columns on MIToS because lowercase and dots are not allowed)
+#=GF MIToS_2024-07-30T12:43:15.704	filtercolumns! : 10 columns have been deleted.
 #=GS H2C869_9CREN/7-104	AC	H2C869.1
 #=GS H2C869_9CREN/7-104	SeqMap	,,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,,,,,,,,,,,,,,,,
 #=GC seq_cons			...NshphAclhaKILppKtElolEDIlAQFEISsosAYsI.+sL+hICEpH.-ECpsppKsRKTlhh.hKpEphppptpEp..ppItKIhsAp................
@@ -431,7 +431,7 @@
  0
  1
  0
- 0
filtersequences!(msa, cluster_references)
AnnotatedMultipleSequenceAlignment with 489 annotations : 465×113 Named Matrix{MIToS.MSA.Residue}
+ 0
filtersequences!(msa, cluster_references)
AnnotatedMultipleSequenceAlignment with 488 annotations : 465×113 Named Matrix{MIToS.MSA.Residue}
                  Seq ╲ Col │  53   54   55   56   57  …  428  429  430  431  432
 ───────────────────────────┼────────────────────────────────────────────────────
 A0A370X5B3_9GAMM/1736-1853 │   -    -    -    -    -  …    Y    A    Y    R    L
@@ -533,4 +533,4 @@
 aa_HUMAN_&_AA_HUMAN │   A    R    D    N    A
 bb_MOUSE_&_BB_MOUSE │   G    K    E    E    G
 cc_YEAST            │   G    R    D    -    -
-CC_SHEEP            │   -    -    -    E    A

As we can see, the join_msas function has matched the sequences on both MSAs based on the specified pairing—in this example, we create a dictionary to pair the sequences from the same species. The join_msas function has two important keyword arguments: kind and axis. By default, the function performs an outer join (kind = :outer) and matches the sequences (axis = 1). You can change these arguments to perform other kinds of joins or to match the columns. Since we performed an outer join, the resulting MSA contains all sequences from both input MSAs, and join_msas has added gaps where the sequences do not match.

+CC_SHEEP │ - - - E A

As we can see, the join_msas function has matched the sequences on both MSAs based on the specified pairing—in this example, we create a dictionary to pair the sequences from the same species. The join_msas function has two important keyword arguments: kind and axis. By default, the function performs an outer join (kind = :outer) and matches the sequences (axis = 1). You can change these arguments to perform other kinds of joins or to match the columns. Since we performed an outer join, the resulting MSA contains all sequences from both input MSAs, and join_msas has added gaps where the sequences do not match.

diff --git a/dev/MSA_API/index.html b/dev/MSA_API/index.html index 1f2ec50a..063f9095 100644 --- a/dev/MSA_API/index.html +++ b/dev/MSA_API/index.html @@ -1,14 +1,14 @@ -MSA · MIToS

MSA

MIToS.MSAModule

The MSA module of MIToS has utilities for working with Multiple Sequence Alignments of protein Sequences (MSA).

Features

  • Read and write MSAs in Stockholm, FASTA, A3M, PIR, or Raw format
  • Handle MSA annotations
  • Edit the MSA, e.g. delete columns or sequences, change sequence order, shuffling...
  • Keep track of positions and annotations after modifications on the MSA
  • Describe a MSA, e.g. mean percent identity, sequence coverage, gap percentage...
using MIToS.MSA
source

Contents

Types

MIToS.MSA.AbstractAlignedObjectType

MIToS MSA and aligned sequences (aligned objects) are subtypes of AbstractMatrix{Residue}, because MSAs and sequences are stored as Matrix of Residues.

source
MIToS.MSA.AlignedSequenceType

An AlignedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence. The NamedArray stores the sequence name and original column numbers as Strings.

source
MIToS.MSA.AnnotatedMultipleSequenceAlignmentType

This type represents an MSA, similar to MultipleSequenceAlignment, but it also stores Annotations. These annotations are used to store residue coordinates (i.e. mapping to UniProt residue numbers).

source
MIToS.MSA.AnnotatedSequenceType

An AnnotatedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence and its Annotations. The NamedArray stores the sequence name and original position numbers as Strings.

source
MIToS.MSA.AnnotationsType

The Annotations type is basically a container for Dicts with the annotations of a multiple sequence alignment. Annotations was designed for storage of annotations of the Stockholm format.

MIToS also uses MSA annotations to keep track of:

  • Modifications of the MSA (MIToS_...) as deletion of sequences or columns.
  • Positions numbers in the original MSA file (column mapping: ColMap)
  • Position of the residues in the sequence (sequence mapping: SeqMap)
source
MIToS.MSA.ClustersType

Data structure to represent sequence clusters. The sequence data itself is not included.

source
MIToS.MSA.GappedAlphabetType

This type defines the usual alphabet of the 20 natural residues and a gap character.

julia> using MIToS.MSA
+MSA · MIToS

MSA

MIToS.MSAModule

The MSA module of MIToS has utilities for working with Multiple Sequence Alignments of protein Sequences (MSA).

Features

  • Read and write MSAs in Stockholm, FASTA, A3M, PIR, or Raw format
  • Handle MSA annotations
  • Edit the MSA, e.g. delete columns or sequences, change sequence order, shuffling...
  • Keep track of positions and annotations after modifications on the MSA
  • Describe a MSA, e.g. mean percent identity, sequence coverage, gap percentage...
using MIToS.MSA
source

Contents

Types

MIToS.MSA.AbstractAlignedObjectType

MIToS MSA and aligned sequences (aligned objects) are subtypes of AbstractMatrix{Residue}, because MSAs and sequences are stored as Matrix of Residues.

source
MIToS.MSA.AlignedSequenceType

An AlignedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence. The NamedArray stores the sequence name and original column numbers as Strings.

source
MIToS.MSA.AnnotatedMultipleSequenceAlignmentType

This type represents an MSA, similar to MultipleSequenceAlignment, but it also stores Annotations. These annotations are used to store residue coordinates (i.e. mapping to UniProt residue numbers).

source
MIToS.MSA.AnnotatedSequenceType

An AnnotatedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence and its Annotations. The NamedArray stores the sequence name and original position numbers as Strings.

source
MIToS.MSA.AnnotationsType

The Annotations type is basically a container for Dicts with the annotations of a multiple sequence alignment. Annotations was designed for storage of annotations of the Stockholm format.

MIToS also uses MSA annotations to keep track of:

  • Modifications of the MSA (MIToS_...) as deletion of sequences or columns.
  • Positions numbers in the original MSA file (column mapping: ColMap)
  • Position of the residues in the sequence (sequence mapping: SeqMap)
source
MIToS.MSA.ClustersType

Data structure to represent sequence clusters. The sequence data itself is not included.

source
MIToS.MSA.GappedAlphabetType

This type defines the usual alphabet of the 20 natural residues and a gap character.

julia> using MIToS.MSA
 
 julia> GappedAlphabet()
-GappedAlphabet of length 21. Residues : res"ARNDCQEGHILKMFPSTWYV-"
source
MIToS.MSA.MultipleSequenceAlignmentType

This MSA type includes a NamedArray wrapping a Matrix of Residues. The use of NamedArray allows storing sequence names and original column numbers as Strings, and fast indexing using them.

source
MIToS.MSA.ReducedAlphabetType

ReducedAlphabet allows the construction of reduced residue alphabets, where residues inside parentheses belong to the same group.

julia> using MIToS.MSA
+GappedAlphabet of length 21. Residues : res"ARNDCQEGHILKMFPSTWYV-"
source
MIToS.MSA.MultipleSequenceAlignmentType

This MSA type includes a NamedArray wrapping a Matrix of Residues. The use of NamedArray allows storing sequence names and original column numbers as Strings, and fast indexing using them.

source
MIToS.MSA.ReducedAlphabetType

ReducedAlphabet allows the construction of reduced residue alphabets, where residues inside parentheses belong to the same group.

julia> using MIToS.MSA
 
 julia> ab = ReducedAlphabet("(AILMV)(RHK)(NQST)(DE)(FWY)CGP")
 ReducedAlphabet of length 8 : "(AILMV)(RHK)(NQST)(DE)(FWY)CGP"
 
 julia> ab[Residue('K')]
-2
source
MIToS.MSA.ResidueType

Most of the MIToS design is created around the Residue bitstype. It has representations for the 20 natural amino acids, a value representing insertions and deletions (GAP, '-') and one representing unknown, ambiguous and non standard residues (XAA, 'X'). Each Residue is encoded as an integer number, with the same bit representation and size as an Int. This allows fast indexing operations on probability or frequency matrices.

Residue creation and conversion

Creation and conversion of Residues should be treated carefully. Residue is encoded as a 32- or 64-bit type similar to Int, to get fast indexing using Int(x::Residue). Int simply calls reinterpret without checking if the residue is valid. Valid residues have integer values in the closed interval [1,22]. convert from Int and Char always returns valid residues, however it's possible to find invalid residues (they are shown using the character '�') after the creation of uninitialized Residue arrays (i.e. using Array). You can use zeros, ones or rand to get initialized Residue arrays with valid residues. Conversions to and from Chars change the bit representation and allow the use of the usual character representation of residues and amino acids. These conversions are used in IO operations and always return valid residues. In conversions from Char, lowercase letters, '*', '-' and '.' are translated to GAP, letters representing the 20 natural amino acids (ARNDCQEGHILKMFPSTWYV) are translated to their corresponding Residue and any other character is translated to XAA. Since lowercase letters and dots are translated to gaps, Pfam MSA insert columns are converted to columns full of gaps.

julia> using MIToS.MSA
+2
source
MIToS.MSA.ResidueType

Most of the MIToS design is created around the Residue bitstype. It has representations for the 20 natural amino acids, a value representing insertions and deletions (GAP, '-') and one representing unknown, ambiguous and non standard residues (XAA, 'X'). Each Residue is encoded as an integer number, with the same bit representation and size as an Int. This allows fast indexing operations on probability or frequency matrices.

Residue creation and conversion

Creation and conversion of Residues should be treated carefully. Residue is encoded as a 32- or 64-bit type similar to Int, to get fast indexing using Int(x::Residue). Int simply calls reinterpret without checking if the residue is valid. Valid residues have integer values in the closed interval [1,22]. convert from Int and Char always returns valid residues, however it's possible to find invalid residues (they are shown using the character '�') after the creation of uninitialized Residue arrays (i.e. using Array). You can use zeros, ones or rand to get initialized Residue arrays with valid residues. Conversions to and from Chars change the bit representation and allow the use of the usual character representation of residues and amino acids. These conversions are used in IO operations and always return valid residues. In conversions from Char, lowercase letters, '*', '-' and '.' are translated to GAP, letters representing the 20 natural amino acids (ARNDCQEGHILKMFPSTWYV) are translated to their corresponding Residue and any other character is translated to XAA. Since lowercase letters and dots are translated to gaps, Pfam MSA insert columns are converted to columns full of gaps.

julia> using MIToS.MSA
 
 julia> alanine = Residue('A')
 A
@@ -41,10 +41,10 @@
 V 20
 - 21
 X 22
-
source
MIToS.MSA.UngappedAlphabetType

This type defines the usual alphabet of the 20 natural residues, without the gap character.

julia> using MIToS.MSA
 
 julia> UngappedAlphabet()
-UngappedAlphabet of length 20. Residues : res"ARNDCQEGHILKMFPSTWYV"
source

Constants

MIToS.MSA.GAPConstant

GAP is the Residue representation on MIToS for gaps ('-', insertions and deletions). Lowercase residue characters, dots and '*' are encoded as GAP in conversion from Strings and Chars. This Residue constant is encoded as Residue(21).

source
MIToS.MSA.WeightTypesType

The WeightTypes type is the same as Union{Weights,NoClustering,Clusters}. This type is used to represent weights. Most of the functions taking the weights keyword argument in the Information module accept instances of WeightTypes.

source
MIToS.MSA.XAAConstant

XAA is the Residue representation for unknown, ambiguous and non standard residues. This Residue constant is encoded as Residue(22).

source

Macros

MIToS.MSA.@res_strMacro

The MIToS macro @res_str takes a string and returns a Vector of Residues (sequence).

julia> using MIToS.MSA
+UngappedAlphabet of length 20. Residues : res"ARNDCQEGHILKMFPSTWYV"
source

Constants

MIToS.MSA.GAPConstant

GAP is the Residue representation on MIToS for gaps ('-', insertions and deletions). Lowercase residue characters, dots and '*' are encoded as GAP in conversion from Strings and Chars. This Residue constant is encoded as Residue(21).

source
MIToS.MSA.WeightTypesType

The WeightTypes type is the same as Union{Weights,NoClustering,Clusters}. This type is used to represent weights. Most of the functions taking the weights keyword argument in the Information module accept instances of WeightTypes.

source
MIToS.MSA.XAAConstant

XAA is the Residue representation for unknown, ambiguous and non standard residues. This Residue constant is encoded as Residue(22).

source

Macros

MIToS.MSA.@res_strMacro

The MIToS macro @res_str takes a string and returns a Vector of Residues (sequence).

julia> using MIToS.MSA
 
 julia> res"MIToS"
 5-element Vector{Residue}:
@@ -52,7 +52,7 @@
  I
  T
  -
- S
source

Methods and functions

Base.isvalidMethod

isvalid(res::Residue)

It returns true if the encoded integer is in the closed interval [1,22].

source
Base.namesMethod

It returns the name of each group. The name is a string with the one letter code of each residue that belongs to the group.

julia> using MIToS.MSA
+ S
source

Methods and functions

Base.isvalidMethod

isvalid(res::Residue)

It returns true if the encoded integer is in the closed interval [1,22].

source
Base.namesMethod

It returns the name of each group. The name is a string with the one letter code of each residue that belongs to the group.

julia> using MIToS.MSA
 
 julia> ab = ReducedAlphabet("(AILMV)(RHK)(NQST)(DE)(FWY)CGP")
 ReducedAlphabet of length 8 : "(AILMV)(RHK)(NQST)(DE)(FWY)CGP"
@@ -66,7 +66,7 @@
  "FWY"
  "C"
  "G"
- "P"
source
Base.randMethod

It chooses from the 20 natural residues (it doesn't generate gaps).

julia> using MIToS.MSA
+ "P"
source
Base.randMethod

It chooses from the 20 natural residues (it doesn't generate gaps).

julia> using MIToS.MSA
 
 julia> using Random
 
@@ -80,7 +80,7 @@
  E  D  D  A
  F  S  K  K
  M  S  I  M
- Y  F  E  D
source
MIToS.MSA.adjustreferenceFunction

Creates a new matrix of residues. This function deletes positions/columns of the MSA with gaps in the reference (first) sequence.

source
MIToS.MSA.annotate_modification!Method

Annotates in the file annotations the modifications made by MIToS on the MSA. It always returns true, so it can be used in a boolean context.

source
MIToS.MSA.annotationsMethod

The annotations function returns the Annotations of an annotated MSA or aligned sequence. If the object is not annotated, it returns an empty Annotations object.

source
MIToS.MSA.column_indexMethod
column_index(msa, col_name)

Return the index (integer position) of the column with name col_name in the MSA msa. A KeyError is thrown if the column name does not exist. If col_name is an integer, the same integer is returned without checking if it is a valid index.

source
MIToS.MSA.columnname_iteratorMethod

columnname_iterator(msa)

It returns an iterator that returns the column names of the msa. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.

source
MIToS.MSA.columnnamesMethod

columnnames(msa)

It returns a Vector{String} with the column names. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.

source
MIToS.MSA.columnpairsmatrixMethod

Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column names if they are available, otherwise it uses the actual column numbers. You can use the positional arguments to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).

source
MIToS.MSA.filtercolumns!Function

filtercolumns!(msa, mask[, annotate::Bool=true])

It allows filtering MSA or aligned sequence columns/positions using an AbstractVector{Bool} mask. Annotations are updated if annotate is true (default).

source
MIToS.MSA.filtercolumns!Method

filtercolumns!(data::Annotations, mask)

It is useful for deleting column annotations (creating a subset in place).

source
MIToS.MSA.filtersequences!Function

filtersequences!(msa, mask[, annotate::Bool=true])

It allows filtering msa sequences using an AbstractVector{Bool} mask (it removes sequences with false values). AnnotatedMultipleSequenceAlignment annotations are updated if annotate is true (default).

source
MIToS.MSA.filtersequences!Method

filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})

It is useful for deleting sequence annotations. ids should be a list of the sequence names and mask should be a logical vector.

source
MIToS.MSA.gapfractionMethod

It calculates the fraction of gaps on the Array (alignment, sequence, column, etc.). This function can take an extra dimension argument for calculation of the gap fraction over the given dimension.

source
MIToS.MSA.gapstrip!Function

This function deletes/filters sequences and columns/positions on the MSA in the following order:

  1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence.
  2. Removes all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps).
  3. Removes all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps).
source
MIToS.MSA.gapstripMethod

Creates a new matrix of Residues (MSA) with deleted sequences and columns/positions. The MSA is edited in the following way:

  1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence
  2. Removes all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps)
  3. Removes all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps)
source
MIToS.MSA.getcolumnmappingMethod

It returns a Vector{Int} with the original column number of each column on the current MSA. The mapping is annotated in the ColMap file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of a NamedArray or MultipleSequenceAlignment.

NOTE: When the MSA results from vertically concatenating MSAs using vcat, the column map annotations from the constituent MSAs (such as 1_ColMap, 2_ColMap, etc.) are not returned. Instead, the column numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.

source
MIToS.MSA.gethcatmappingMethod

It returns a vector of numbers from 1 to N for each column that indicates the source MSA. The mapping is annotated in the "HCat" file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.

NOTE: When the MSA results from vertically concatenating MSAs using vcat, the "HCat" annotations from the constituent MSAs are renamed as "1_HCat", "2_HCat", etc. In that case, the MSA numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.

source
MIToS.MSA.getnamedictMethod

It takes a ResidueAlphabet and returns a dictionary from group name to group position.

julia> using MIToS.MSA
+ Y  F  E  D
source
MIToS.MSA.adjustreferenceFunction

Creates a new matrix of residues. This function deletes positions/columns of the MSA with gaps in the reference (first) sequence.

source
MIToS.MSA.annotate_modification!Method

Annotates in the file annotations the modifications made by MIToS on the MSA. It always returns true, so it can be used in a boolean context.

source
MIToS.MSA.annotationsMethod

The annotations function returns the Annotations of an annotated MSA or aligned sequence. If the object is not annotated, it returns an empty Annotations object.

source
MIToS.MSA.column_indexMethod
column_index(msa, col_name)

Return the index (integer position) of the column with name col_name in the MSA msa. A KeyError is thrown if the column name does not exist. If col_name is an integer, the same integer is returned without checking if it is a valid index.

source
MIToS.MSA.columnname_iteratorMethod

columnname_iterator(msa)

It returns an iterator that returns the column names of the msa. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.

source
MIToS.MSA.columnnamesMethod

columnnames(msa)

It returns a Vector{String} with the column names. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.

source
MIToS.MSA.columnpairsmatrixMethod

Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column names if they are available, otherwise it uses the actual column numbers. You can use the positional arguments to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).

source
MIToS.MSA.filtercolumns!Function

filtercolumns!(msa, mask[, annotate::Bool=true])

It allows filtering MSA or aligned sequence columns/positions using an AbstractVector{Bool} mask. Annotations are updated if annotate is true (default).

source
MIToS.MSA.filtercolumns!Method

filtercolumns!(data::Annotations, mask)

It is useful for deleting column annotations (creating a subset in place).

source
MIToS.MSA.filtersequences!Function

filtersequences!(msa, mask[, annotate::Bool=true])

It allows filtering msa sequences using an AbstractVector{Bool} mask (it removes sequences with false values). AnnotatedMultipleSequenceAlignment annotations are updated if annotate is true (default).

source
MIToS.MSA.filtersequences!Method

filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})

It is useful for deleting sequence annotations. ids should be a list of the sequence names and mask should be a logical vector.

source
MIToS.MSA.gapfractionMethod

It calculates the fraction of gaps on the Array (alignment, sequence, column, etc.). This function can take an extra dimension argument for calculation of the gap fraction over the given dimension.

source
MIToS.MSA.gapstrip!Function

This function deletes/filters sequences and columns/positions on the MSA in the following order:

  1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence.
  2. Removes all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps).
  3. Removes all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps).
source
MIToS.MSA.gapstripMethod

Creates a new matrix of Residues (MSA) with deleted sequences and columns/positions. The MSA is edited in the following way:

  1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence
  2. Removes all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps)
  3. Removes all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps)
source
MIToS.MSA.getcolumnmappingMethod

It returns a Vector{Int} with the original column number of each column on the current MSA. The mapping is annotated in the ColMap file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of a NamedArray or MultipleSequenceAlignment.

NOTE: When the MSA results from vertically concatenating MSAs using vcat, the column map annotations from the constituent MSAs (such as 1_ColMap, 2_ColMap, etc.) are not returned. Instead, the column numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.

source
MIToS.MSA.gethcatmappingMethod

It returns a vector of numbers from 1 to N for each column that indicates the source MSA. The mapping is annotated in the "HCat" file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.

NOTE: When the MSA results from vertically concatenating MSAs using vcat, the "HCat" annotations from the constituent MSAs are renamed as "1_HCat", "2_HCat", etc. In that case, the MSA numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.

source
MIToS.MSA.getnamedictMethod

It takes a ResidueAlphabet and returns a dictionary from group name to group position.

julia> using MIToS.MSA
 
 julia> ab = ReducedAlphabet("(AILMV)(RHK)(NQST)(DE)(FWY)CGP")
 ReducedAlphabet of length 8 : "(AILMV)(RHK)(NQST)(DE)(FWY)CGP"
@@ -94,7 +94,7 @@
   "FWY"   => 5
   "C"     => 6
   "G"     => 7
-  "P"     => 8
source
MIToS.MSA.getresiduesMethod

getresidues allows you to access the residues stored inside an MSA or aligned sequence as a Matrix{Residue} without annotations nor column/row names.

source
MIToS.MSA.getsequenceFunction

getsequence takes an MSA and a sequence number or identifier and returns an aligned sequence object. If the MSA is an AnnotatedMultipleSequenceAlignment, it returns an AnnotatedAlignedSequence with the sequence annotations. From a MultipleSequenceAlignment, It returns an AlignedSequence object. If an Annotations object and a sequence identifier are used, this function returns the annotations related to the sequence.

source
MIToS.MSA.getsequencemappingMethod

It returns the sequence coordinates as a Vector{Int} for an MSA sequence. That vector has one element for each MSA column. If the number is 0 in the mapping, there is a gap in that column for that sequence.

source
MIToS.MSA.getweightMethod

getweight(c[, i::Int])

This function returns the weight of the sequence number i. getweight should be defined for any type used for frequencies!/frequencies in order to use its weights. If i isn't used, this function returns a vector with the weight of each sequence.

source
MIToS.MSA.getresiduesMethod

getresidues allows you to access the residues stored inside an MSA or aligned sequence as a Matrix{Residue} without annotations nor column/row names.

source
MIToS.MSA.getsequenceFunction

getsequence takes an MSA and a sequence number or identifier and returns an aligned sequence object. If the MSA is an AnnotatedMultipleSequenceAlignment, it returns an AnnotatedAlignedSequence with the sequence annotations. From a MultipleSequenceAlignment, It returns an AlignedSequence object. If an Annotations object and a sequence identifier are used, this function returns the annotations related to the sequence.

source
MIToS.MSA.getsequencemappingMethod

It returns the sequence coordinates as a Vector{Int} for an MSA sequence. That vector has one element for each MSA column. If the number is 0 in the mapping, there is a gap in that column for that sequence.

source
MIToS.MSA.getweightMethod

getweight(c[, i::Int])

This function returns the weight of the sequence number i. getweight should be defined for any type used for frequencies!/frequencies in order to use its weights. If i isn't used, this function returns a vector with the weight of each sequence.

source
MIToS.MSA.join_msasMethod
join_msas(msa_a::AnnotatedMultipleSequenceAlignment, 
     msa_b::AnnotatedMultipleSequenceAlignment, 
     pairing; 
     kind::Symbol=:outer, 
@@ -105,14 +105,14 @@
     positions_a, 
     positions_b; 
     kind::Symbol=:outer, 
-    axis::Int=1)::AnnotatedMultipleSequenceAlignment

Join two Multiple Sequence Alignments (MSAs), msa_a and msa_b, based on specified matching positions or names. The function supports two formats: one takes a pairing argument as a list of correspondences, and the other takes positions_a and positions_b as separate lists indicating matching positions or names in each MSA. This function allows for various types of join operations (:inner, :outer, :left, :right) and can merge MSAs by sequences (axis 1) or by columns (axis 2).

Parameters:

  • msa_a::AnnotatedMultipleSequenceAlignment: The first MSA.
  • msa_b::AnnotatedMultipleSequenceAlignment: The second MSA.
  • pairing: An iterable where each element is a pair of sequence or column positions (Ints) or names (Strings) to match between msa_a and msa_b. For example, it can be a list of two-element tuples or pairs, or an OrderedDict.
  • positions_a, positions_b: Separate lists of positions or names in msa_a and msa_b, respectively.
  • kind::Symbol: Type of join operation. Default is :outer.
  • axis::Int: The axis along which to join (1 to match sequences, 2 to match columns).

Returns:

  • AnnotatedMultipleSequenceAlignment: A new MSA resulting from the join operation.

Behavior and Sequence Ordering:

The order of sequences or columns in the resulting MSA depends on the kind of join operation and the order of elements in the pairing or positions_a and positions_b lists.

  • For :inner joins, the function returns an MSA containing only those sequences/columns that are paired in both msa_a and msa_b. The order of elements in the output MSA follows the order in the pairing or position lists.
  • For :outer joins, the output MSA includes all sequences/columns from both msa_a and msa_b. Unpaired sequences/columns are filled with gaps as needed. The sequences/columns from msa_a are placed first. If the pairing or position lists are sorted, the output MSA columns and sequences will keep the same order as in the inputs. That's nice for situations such as profile alignments where the order of columns is important. If the pairing or position lists are not sorted, then the order of sequences/columns in the output MSA is not guaranteed to be the same as in the inputs. In particular, the matched sequences or columns will be placed first, followed by the unmatched ones.
  • For :left joins, all sequences/columns from msa_a are included in the output MSA keeping the same order as in msa_a. Sequences/columns from msa_b are added where matches are found, with gaps filling the unmatched positions.
  • For :right joins, the output MSA behaves like :left joins but with roles of msa_a and msa_b reversed.

Warning: When using Dict for pairing, the order of elements might not be preserved as expected. Dict in Julia does not maintain the order of its elements, which might lead to unpredictable order of sequences/columns in the output MSA. To preserve order, it is recommended to use an OrderedDict or a list of Pairs objects.

source
MIToS.MSA.meanpercentidentityFunction

Returns the mean of the percent identity between the sequences of a MSA. If the MSA has 300 sequences or less, the mean is exact. If the MSA has more sequences and the exact keyword is false (default), 44850 random pairs of sequences are used for the estimation. The number of samples can be changed using the second argument. Use exact=true to perform all the pairwise comparisons (the calculation could be slow).

source
MIToS.MSA.namedmatrixMethod

The namedmatrix function returns the NamedResidueMatrix{Array{Residue,2}} stored in an MSA or aligned sequence.

source
MIToS.MSA.ncolumnsMethod

ncolumns(ann::Annotations) returns the number of columns/residues with annotations. This function returns -1 if there are no annotations per column/residue.

source
MIToS.MSA.percentidentityMethod

percentidentity(seq1, seq2, threshold)

Computes quickly if two aligned sequences have an identity value greater than a given threshold value. Returns a boolean value. Positions with gaps in both sequences don't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.

source
MIToS.MSA.percentidentityMethod

percentidentity(seq1, seq2)

Calculates the fraction of identities between two aligned sequences. The identity value is calculated as the number of identical characters in the i-th position of both sequences divided by the length of both sequences. Positions with gaps in both sequences don't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.

source
MIToS.MSA.percentidentityMethod

percentidentity(msa[, out::Type=Float64])

Calculates the identity between all the sequences on a MSA. You can indicate the output element type with the last optional parameter (Float64 by default). For a MSA with a lot of sequences, you can use Float32 or Float16 in order to avoid the OutOfMemoryError().

source
MIToS.MSA.percentsimilarityFunction

Calculates the similarity percent between two aligned sequences. The 100% is the length of the aligned sequences minus the number of columns with gaps in both sequences and the number of columns with at least one residue outside the alphabet. So, columns with residues outside the alphabet (other than the specially treated GAP) aren't counted to the protein length. Two residues are considered similar if they belong to the same group in a ReducedAlphabet. The alphabet (third positional argument) by default is:

ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")

The first group is composed of the non polar residues (AILMV), the second group is composed of polar residues, the third group are positive residues, the fourth group are negative residues, the fifth group is composed by the aromatic residues (FWY). C, G and P are considered unique residues.

Other residue groups/alphabets:

SMS (Sequence Manipulation Suite) Ident and Sim (Stothard Paul. 2000):

ReducedAlphabet("(GAVLI)(FYW)(ST)(KRH)(DENQ)P(CM)")

Stothard P (2000) The Sequence Manipulation Suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28:1102-1104.

Bio3D 2.2 seqidentity (Grant, Barry J., et al. 2006):

ReducedAlphabet("(GA)(MVLI)(FYW)(ST)(KRH)(DE)(NQ)PC")

References

source
MIToS.MSA.percentsimilarityMethod

Calculates the similarity percent between all the sequences on a MSA. You can indicate the output element type with the out keyword argument (Float64 by default). For an MSA with a lot of sequences, you can use out=Float32 or out=Float16 in order to avoid the OutOfMemoryError().

source
MIToS.MSA.rename_sequences!Method
rename_sequences!(msa, newnames::Vector{T}) where {T<:AbstractString}
+    axis::Int=1)::AnnotatedMultipleSequenceAlignment

Join two Multiple Sequence Alignments (MSAs), msa_a and msa_b, based on specified matching positions or names. The function supports two formats: one takes a pairing argument as a list of correspondences, and the other takes positions_a and positions_b as separate lists indicating matching positions or names in each MSA. This function allows for various types of join operations (:inner, :outer, :left, :right) and can merge MSAs by sequences (axis 1) or by columns (axis 2).

Parameters:

  • msa_a::AnnotatedMultipleSequenceAlignment: The first MSA.
  • msa_b::AnnotatedMultipleSequenceAlignment: The second MSA.
  • pairing: An iterable where each element is a pair of sequence or column positions (Ints) or names (Strings) to match between msa_a and msa_b. For example, it can be a list of two-element tuples or pairs, or an OrderedDict.
  • positions_a, positions_b: Separate lists of positions or names in msa_a and msa_b, respectively.
  • kind::Symbol: Type of join operation. Default is :outer.
  • axis::Int: The axis along which to join (1 to match sequences, 2 to match columns).

Returns:

  • AnnotatedMultipleSequenceAlignment: A new MSA resulting from the join operation.

Behavior and Sequence Ordering:

The order of sequences or columns in the resulting MSA depends on the kind of join operation and the order of elements in the pairing or positions_a and positions_b lists.

  • For :inner joins, the function returns an MSA containing only those sequences/columns that are paired in both msa_a and msa_b. The order of elements in the output MSA follows the order in the pairing or position lists.
  • For :outer joins, the output MSA includes all sequences/columns from both msa_a and msa_b. Unpaired sequences/columns are filled with gaps as needed. The sequences/columns from msa_a are placed first. If the pairing or position lists are sorted, the output MSA columns and sequences will keep the same order as in the inputs. That's nice for situations such as profile alignments where the order of columns is important. If the pairing or position lists are not sorted, then the order of sequences/columns in the output MSA is not guaranteed to be the same as in the inputs. In particular, the matched sequences or columns will be placed first, followed by the unmatched ones.
  • For :left joins, all sequences/columns from msa_a are included in the output MSA keeping the same order as in msa_a. Sequences/columns from msa_b are added where matches are found, with gaps filling the unmatched positions.
  • For :right joins, the output MSA behaves like :left joins but with roles of msa_a and msa_b reversed.

Warning: When using Dict for pairing, the order of elements might not be preserved as expected. Dict in Julia does not maintain the order of its elements, which might lead to unpredictable order of sequences/columns in the output MSA. To preserve order, it is recommended to use an OrderedDict or a list of Pairs objects.

source
MIToS.MSA.meanpercentidentityFunction

Returns the mean of the percent identity between the sequences of a MSA. If the MSA has 300 sequences or less, the mean is exact. If the MSA has more sequences and the exact keyword is false (default), 44850 random pairs of sequences are used for the estimation. The number of samples can be changed using the second argument. Use exact=true to perform all the pairwise comparisons (the calculation could be slow).

source
MIToS.MSA.namedmatrixMethod

The namedmatrix function returns the NamedResidueMatrix{Array{Residue,2}} stored in an MSA or aligned sequence.

source
MIToS.MSA.ncolumnsMethod

ncolumns(ann::Annotations) returns the number of columns/residues with annotations. This function returns -1 if there are no annotations per column/residue.

source
MIToS.MSA.percentidentityMethod

percentidentity(seq1, seq2, threshold)

Computes quickly if two aligned sequences have an identity value greater than a given threshold value. Returns a boolean value. Positions with gaps in both sequences don't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.

source
MIToS.MSA.percentidentityMethod

percentidentity(seq1, seq2)

Calculates the fraction of identities between two aligned sequences. The identity value is calculated as the number of identical characters in the i-th position of both sequences divided by the length of both sequences. Positions with gaps in both sequences don't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.

source
MIToS.MSA.percentidentityMethod

percentidentity(msa[, out::Type=Float64])

Calculates the identity between all the sequences on a MSA. You can indicate the output element type with the last optional parameter (Float64 by default). For a MSA with a lot of sequences, you can use Float32 or Float16 in order to avoid the OutOfMemoryError().

source
MIToS.MSA.percentsimilarityFunction

Calculates the similarity percent between two aligned sequences. The 100% is the length of the aligned sequences minus the number of columns with gaps in both sequences and the number of columns with at least one residue outside the alphabet. So, columns with residues outside the alphabet (other than the specially treated GAP) aren't counted to the protein length. Two residues are considered similar if they belong to the same group in a ReducedAlphabet. The alphabet (third positional argument) by default is:

ReducedAlphabet("(AILMV)(NQST)(RHK)(DE)(FWY)CGP")

The first group is composed of the non polar residues (AILMV), the second group is composed of polar residues, the third group are positive residues, the fourth group are negative residues, the fifth group is composed by the aromatic residues (FWY). C, G and P are considered unique residues.

Other residue groups/alphabets:

SMS (Sequence Manipulation Suite) Ident and Sim (Stothard Paul. 2000):

ReducedAlphabet("(GAVLI)(FYW)(ST)(KRH)(DENQ)P(CM)")

Stothard P (2000) The Sequence Manipulation Suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28:1102-1104.

Bio3D 2.2 seqidentity (Grant, Barry J., et al. 2006):

ReducedAlphabet("(GA)(MVLI)(FYW)(ST)(KRH)(DE)(NQ)PC")

References

source
MIToS.MSA.percentsimilarityMethod

Calculates the similarity percent between all the sequences on a MSA. You can indicate the output element type with the out keyword argument (Float64 by default). For an MSA with a lot of sequences, you can use out=Float32 or out=Float16 in order to avoid the OutOfMemoryError().

source
MIToS.MSA.rename_sequences!Method
rename_sequences!(msa, newnames::Vector{T}) where {T<:AbstractString}
 rename_sequences!(msa, old2new::AbstractDict)
-rename_sequences!(msa, old2new::Pair...)

Rename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function modifies the msa in place and returns it.

source
MIToS.MSA.rename_sequencesMethod
rename_sequences(msa, newnames::Vector{T}) where {T<:AbstractString}
+rename_sequences!(msa, old2new::Pair...)

Rename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function modifies the msa in place and returns it.

source
MIToS.MSA.rename_sequencesMethod
rename_sequences(msa, newnames::Vector{T}) where {T<:AbstractString}
 rename_sequences(msa, old2new::AbstractDict)
-rename_sequences(msa, old2new::Pair...)

Rename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function returns a new MSA with the sequences renamed without modifying the original MSA.

source
MIToS.MSA.residue2threeMethod

This function returns the three letter name of the Residue.

julia> using MIToS.MSA
+rename_sequences(msa, old2new::Pair...)

Rename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function returns a new MSA with the sequences renamed without modifying the original MSA.

source
MIToS.MSA.residue2threeMethod

This function returns the three letter name of the Residue.

julia> using MIToS.MSA
 
 julia> residue2three(Residue('G'))
-"GLY"
source
MIToS.MSA.residuefractionMethod

It calculates the fraction of residues (no gaps) on the Array (alignment, sequence, column, etc.). This function can take an extra dimension argument for calculation of the residue fraction over the given dimension

source
MIToS.MSA.sequence_idMethod
sequence_id(seq::Union{AbstractSequence,AbstractAlignedSequence})

It returns the sequence identifier of a sequence object.

source
MIToS.MSA.sequence_indexMethod
sequence_index(msa, seq_name)

Return the index (integer position) of the sequence with name seq_name in the MSA msa. A KeyError is thrown if the sequence name does not exist. If seq_name is an integer, the same integer is returned without checking if it is a valid index.

source
MIToS.MSA.sequencepairsmatrixMethod

Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column mapping (column number in the input MSA file) if it’s available, otherwise it uses the actual column numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).

source
MIToS.MSA.setannotresidue!Function

setannotresidue!(ann, seqname, feature, annotation)

It stores per residue annotation (1 char per residue) for (seqname, feature)

source
MIToS.MSA.setreference!Function

It puts the sequence i (name or position) as reference (first sequence) of the MSA. This function swaps the sequences 1 and i.

source
MIToS.MSA.shuffle_msa!Method
shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)

In-place version of shuffle_msa. It randomly permutes residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows shuffling only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain in their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default).

source
MIToS.MSA.shuffle_msaMethod
shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)

It randomly permutes residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows shuffling only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain in their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default). To shuffle in-place, see shuffle_msa!.

julia> using MIToS.MSA
+"GLY"
source
MIToS.MSA.residuefractionMethod

It calculates the fraction of residues (no gaps) on the Array (alignment, sequence, column, etc.). This function can take an extra dimension argument for calculation of the residue fraction over the given dimension

source
MIToS.MSA.sequence_idMethod
sequence_id(seq::Union{AbstractSequence,AbstractAlignedSequence})

It returns the sequence identifier of a sequence object.

source
MIToS.MSA.sequence_indexMethod
sequence_index(msa, seq_name)

Return the index (integer position) of the sequence with name seq_name in the MSA msa. A KeyError is thrown if the sequence name does not exist. If seq_name is an integer, the same integer is returned without checking if it is a valid index.

source
MIToS.MSA.sequencepairsmatrixMethod

Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column mapping (column number in the input MSA file) if it’s available, otherwise it uses the actual column numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).

source
MIToS.MSA.setannotresidue!Function

setannotresidue!(ann, seqname, feature, annotation)

It stores per residue annotation (1 char per residue) for (seqname, feature)

source
MIToS.MSA.setreference!Function

It puts the sequence i (name or position) as reference (first sequence) of the MSA. This function swaps the sequences 1 and i.

source
MIToS.MSA.shuffle_msa!Method
shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)

In-place version of shuffle_msa. It randomly permutes residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows shuffling only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain in their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default).

source
MIToS.MSA.shuffle_msaMethod
shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)

It randomly permutes residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows shuffling only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain in their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default). To shuffle in-place, see shuffle_msa!.

julia> using MIToS.MSA
 
 julia> using Random
 
@@ -137,9 +137,9 @@
  G  D  R
  R  -  D
  E  K  -
-
source
MIToS.MSA.stringsequenceMethod
stringsequence(seq)
 stringsequence(msa, i::Int)
-stringsequence(msa, id::String)

It returns the selected sequence as a String.

source
MIToS.MSA.swapsequences!Method

It swaps the sequences on the positions i and j of an MSA. Also it's possible to swap sequences using their sequence names/identifiers when the MSA object has names.

source
MIToS.MSA.three2residueMethod

It takes a three letter residue name and returns the corresponding Residue. If the name isn't in the MIToS dictionary, a XAA is returned.

julia> using MIToS.MSA
+stringsequence(msa, id::String)

It returns the selected sequence as a String.

source
MIToS.MSA.swapsequences!Method

It swaps the sequences on the positions i and j of an MSA. Also it's possible to swap sequences using their sequence names/identifiers when the MSA object has names.

source
MIToS.MSA.three2residueMethod

It takes a three letter residue name and returns the corresponding Residue. If the name isn't in the MIToS dictionary, a XAA is returned.

julia> using MIToS.MSA
 
 julia> three2residue("ALA")
-A
source
MIToS.Utils.parse_fileFunction

parse_file(io, format[, output; generatemapping, useidcoordinates, deletefullgaps])

The keyword argument generatemapping (false by default) indicates if the mapping of the sequences ("SeqMap") and columns ("ColMap") and the number of columns in the original MSA ("NCol") should be generated and saved in the annotations. If useidcoordinates is true (default: false) the sequence IDs of the form "ID/start-end" are parsed and used for determining the start and end positions when the mappings are generated. deletefullgaps (true by default) indicates if columns 100% gaps (generally inserts from a HMM) must be removed from the MSA.

source
Random.shuffleFunction

It's like shuffle but in-place. When a Matrix{Residue} or a AbstractAlignedObject (sequence or MSA) is used, you can indicate if the gaps should remain their positions using the last boolean argument.

DEPRECATED: This method is deprecated. Use shuffle_msa instead.

source
Random.shuffle!Function

It's like Random.shuffle. When a Matrix{Residue} is used, you can indicate if the gaps should remain their positions using the last boolean argument. The previous argument should be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling residues in a column.

DEPRECATED: This method is deprecated. Use shuffle_msa! instead.

source
+A
source
MIToS.Utils.parse_fileFunction

parse_file(io, format[, output; generatemapping, useidcoordinates, deletefullgaps])

The keyword argument generatemapping (false by default) indicates if the mapping of the sequences ("SeqMap") and columns ("ColMap") and the number of columns in the original MSA ("NCol") should be generated and saved in the annotations. If useidcoordinates is true (default: false) the sequence IDs of the form "ID/start-end" are parsed and used for determining the start and end positions when the mappings are generated. deletefullgaps (true by default) indicates if columns 100% gaps (generally inserts from a HMM) must be removed from the MSA.

source
Random.shuffleFunction

It's like shuffle but in-place. When a Matrix{Residue} or a AbstractAlignedObject (sequence or MSA) is used, you can indicate if the gaps should remain their positions using the last boolean argument.

DEPRECATED: This method is deprecated. Use shuffle_msa instead.

source
Random.shuffle!Function

It's like Random.shuffle. When a Matrix{Residue} is used, you can indicate if the gaps should remain their positions using the last boolean argument. The previous argument should be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling residues in a column.

DEPRECATED: This method is deprecated. Use shuffle_msa! instead.

source
diff --git a/dev/PDB/index.html b/dev/PDB/index.html index 8aa47ab8..7a596df3 100644 --- a/dev/PDB/index.html +++ b/dev/PDB/index.html @@ -189,4 +189,4 @@ scatter3d!(chain_C, label = "C", alpha = 0.5)

superimposed_A, superimposed_C, RMSD = superimpose(chain_A, chain_C)
 
 RMSD
0.23000472638958339
scatter3d(superimposed_A, label = "A", alpha = 0.5)
-scatter3d!(superimposed_C, label = "C", alpha = 0.5)

+scatter3d!(superimposed_C, label = "C", alpha = 0.5)

diff --git a/dev/PDB_API/index.html b/dev/PDB_API/index.html index 371c7c20..8ad48e12 100644 --- a/dev/PDB_API/index.html +++ b/dev/PDB_API/index.html @@ -1,25 +1,25 @@ -PDB · MIToS

PDB

MIToS.PDBModule

The module PDB defines types and methods to work with protein structures inside Julia. It is useful to link structural and sequential information, and needed to measure the predictive performance at protein contact prediction of mutual information scores.

Features

  • Read and parse PDB and PDBML files
  • Calculate distance and contacts between atoms or residues
  • Determine interaction between residues
using MIToS.PDB
source

Contents

Types

MIToS.PDB.PDBAtomType

A PDBAtom object contains the information from a PDB atom, without information of the residue. It has the following fields that you can access at any moment for query purposes:

- `coordinates` : x,y,z coordinates, e.g. `Coordinates(109.641,73.162,42.7)`.
+PDB · MIToS

PDB

MIToS.PDBModule

The module PDB defines types and methods to work with protein structures inside Julia. It is useful to link structural and sequential information, and needed to measure the predictive performance at protein contact prediction of mutual information scores.

Features

  • Read and parse PDB and PDBML files
  • Calculate distance and contacts between atoms or residues
  • Determine interaction between residues
using MIToS.PDB
source

Contents

Types

MIToS.PDB.PDBAtomType

A PDBAtom object contains the information from a PDB atom, without information of the residue. It has the following fields that you can access at any moment for query purposes:

- `coordinates` : x,y,z coordinates, e.g. `Coordinates(109.641,73.162,42.7)`.
 - `atom` : Atom name, e.g. `"CA"`.
 - `element` : Element type of the atom, e.g. `"C"`.
 - `occupancy` : A float number with the occupancy, e.g. `1.0`.
 - `B` : B factor as a string, e.g. `"23.60"`.
 - `alt_id` : Alternative location ID, e.g. `"A"`.
-- `charge` : Charge of the atom, e.g. `"0"`.
source
MIToS.PDB.PDBFileType

PDBFile <: FileFormat

Protein Data Bank (PDB) format. It provides a standard representation for macromolecular structure data derived from X-ray diffraction and NMR studies.

source
MIToS.PDB.PDBMLType

PDBML <: FileFormat

Protein Data Bank Markup Language (PDBML), a representation of PDB data in XML format.

source
MIToS.PDB.PDBResidueType

A PDBResidue object contains all the information about a PDB residue. It has the following fields that you can access at any moment for query purposes:

- `id` : A `PDBResidueIdentifier` object.
-- `atoms` : A vector of `PDBAtom`s.
source
MIToS.PDB.PDBResidueIdentifierType

A PDBResidueIdentifier object contains the information needed to identify PDB residues. It has the following fields that you can access at any moment for query purposes:

- `PDBe_number` : It's only used when a PDBML is read (PDBe number as a string).
+- `charge` : Charge of the atom, e.g. `"0"`.
source
MIToS.PDB.PDBFileType

PDBFile <: FileFormat

Protein Data Bank (PDB) format. It provides a standard representation for macromolecular structure data derived from X-ray diffraction and NMR studies.

source
MIToS.PDB.PDBMLType

PDBML <: FileFormat

Protein Data Bank Markup Language (PDBML), a representation of PDB data in XML format.

source
MIToS.PDB.PDBResidueType

A PDBResidue object contains all the information about a PDB residue. It has the following fields that you can access at any moment for query purposes:

- `id` : A `PDBResidueIdentifier` object.
+- `atoms` : A vector of `PDBAtom`s.
source
MIToS.PDB.PDBResidueIdentifierType

A PDBResidueIdentifier object contains the information needed to identify PDB residues. It has the following fields that you can access at any moment for query purposes:

- `PDBe_number` : It's only used when a PDBML is read (PDBe number as a string).
 - `number` : PDB residue number, it includes insertion codes, e.g. `"34A"`.
 - `name` : Three letter residue name in PDB, e.g. `"LYS"`.
 - `group` : It can be `"ATOM"` or `"HETATM"`.
 - `model` : The model number as a string, e.g. `"1"`.
-- `chain` : The chain as a string, e.g. `"A"`.
source

Constants

MIToS.PDB.covalentradiusConstant

Covalent radius in Å of each element from the Additional file 1 of PICCOLO (Bickerton et al.). Hydrogen was updated using the value on Table 2 from (Cordero et al.).

References

- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. "Comprehensive, 
+- `chain` : The chain as a string, e.g. `"A"`.
source

Constants

MIToS.PDB.covalentradiusConstant

Covalent radius in Å of each element from the Additional file 1 of PICCOLO (Bickerton et al.). Hydrogen was updated using the value on Table 2 from (Cordero et al.).

References

- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. "Comprehensive, 
   atomic-level characterization of structurally characterized protein-protein 
   interactions: the PICCOLO database." BMC bioinformatics 
   12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)
 - [Cordero, Beatriz, et al. "Covalent radii revisited." Dalton Transactions 
-  21 (2008): 2832-2838.](@cite 10.1039/B801115J)
source
MIToS.PDB.vanderwaalsradiusConstant

van der Waals radius in Å from the Additional file 1 of Bickerton et al.

References

- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. "Comprehensive, 
+  21 (2008): 2832-2838.](@cite 10.1039/B801115J)
source
MIToS.PDB.vanderwaalsradiusConstant

van der Waals radius in Å from the Additional file 1 of Bickerton et al.

References

- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. "Comprehensive, 
   atomic-level characterization of structurally characterized protein-protein 
   interactions: the PICCOLO database." BMC bioinformatics 
-  12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)
source

Macros

MIToS.PDB.@atomsMacro

@atoms ... model ... chain ... group ... residue ... atom ...

These return a vector of PDBAtoms with the selected subset of atoms from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the select_atoms function instead.

source
MIToS.PDB.@residuesMacro

@residues ... model ... chain ... group ... residue ...

These return a new vector with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the select_residues function instead.

source
MIToS.PDB.@residuesdictMacro

@residuesdict ... model ... chain ... group ... residue ...

This macro returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the residuesdict function instead.

source

Methods and functions

Base.angleMethod

angle(a::Coordinates, b::Coordinates, c::Coordinates)

Angle (in degrees) at b between a-b and b-c

source
Base.anyMethod

any(f::Function, a::PDBResidue, b::PDBResidue, criteria::Function)

Test if the function f is true for any pair of atoms between the residues a and b. This function only tests atoms that return true for the function criteria.

source
Base.anyMethod

any(f::Function, a::PDBResidue, b::PDBResidue)

Test if the function f is true for any pair of atoms between the residues a and b

source
MIToS.PDB.CAmatrixMethod

Returns a matrix with the x, y and z coordinates of the Cα with best occupancy for each PDBResidue of the ATOM group. If a residue doesn't have a Cα, its Cα coordinates are NaNs.

source
MIToS.PDB.center!Method

center!(A::AbstractMatrix{Float64})

Takes a set of points A as an NxD matrix (N: number of points, D: dimension). Translates A in place so that its centroid is at the origin of coordinates

source
MIToS.PDB.centeredcoordinatesFunction

Returns a Matrix{Float64} with the centered coordinates of all the atoms in residues. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.

source
MIToS.PDB.centeredresiduesFunction

Returns a new Vector{PDBResidue} with the PDBResidues having centered coordinates. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.

source
MIToS.PDB.change_coordinatesFunction

change_coordinates(residue::PDBResidue, coordinates::AbstractMatrix{Float64}, offset::Int=1)

Returns a new PDBResidue with (x,y,z) from a coordinates AbstractMatrix{Float64}. You can give an offset indicating in which matrix row the (x,y,z) coordinates of the residue start.

source
MIToS.PDB.change_coordinatesMethod

change_coordinates(residues::AbstractVector{PDBResidue}, coordinates::AbstractMatrix{Float64})

Returns a new Vector{PDBResidues} with (x,y,z) from a coordinates Matrix{Float64}

source
MIToS.PDB.contactMethod

contact(a::Coordinates, b::Coordinates, limit::AbstractFloat)

It returns true if the distance is less or equal to the limit. It doesn't call sqrt because it does squared_distance(a,b) <= limit^2.

source
MIToS.PDB.contactMethod

contact(A::PDBResidue, B::PDBResidue, limit::AbstractFloat; criteria::String="All")

Returns true if the residues A and B are at contact distance (limit). The available distance criteria are: Heavy, All, CA, CB (CA for GLY)

source
MIToS.PDB.contactMethod

contact(residues::Vector{PDBResidue}, limit::AbstractFloat; criteria::String="All")

If contact takes a Vector{PDBResidue}, it returns a matrix with all the pairwise comparisons (contact map).

source
MIToS.PDB.covalentMethod

Returns true if the distance between atoms is less than the sum of the covalentradius of each atom.

source
MIToS.PDB.distanceMethod

distance(residues::Vector{PDBResidue}; criteria::String="All")

If distance takes a Vector{PDBResidue} returns a PairwiseListMatrix{Float64, false} with all the pairwise comparisons (distance matrix).

source
MIToS.PDB.download_alphafold_structureMethod
download_alphafold_structure(uniprot_accession::String; format::Type{T}=MMCIFFile) where T<:FileFormat

This function downloads the structure file (PDB or mmCIF) for a given UniProt Accession from AlphaFoldDB. The uniprot_accession parameter specifies the UniProt Accession of the protein, e.g. "P00520". The format parameter specifies the file format to download, with the default being mmCIF, i.e. MMCIFFile. You can set format to PDBFile if you want to download a PDB file.

source
MIToS.PDB.downloadpdbMethod
downloadpdb(pdbcode::String; format::Type{T} = MMCIFFile, filename, baseurl, kargs...)

It downloads a gzipped PDB file from the PDB database. It requires a four-character pdbcode. Its default format is MMCIFFile (mmCIF) and it uses the baseurl "http://www.rcsb.org/pdb/files/". filename is the path/name of the output file. This function calls MIToS.Utils.download_file that calls Downloads.download. So, you can use keyword arguments, such as headers, from that function.

source
MIToS.PDB.findatomsMethod

findatoms(res::PDBResidue, atom::String)

Returns an index vector of the atoms with the given atom name.

source
MIToS.PDB.findheavyMethod

Returns a list with the index of the heavy atoms (all atoms except hydrogen) in the PDBResidue

source
MIToS.PDB.getCAMethod

Returns the Cα with best occupancy in the PDBResidue. If the PDBResidue has no Cα, missing is returned.

source
MIToS.PDB.getpdbdescriptionMethod

Access general information about a PDB entry (e.g., Header information) using the GraphQL interface of the PDB database. It parses the JSON answer into a JSON3.Object that can be used as a dictionary.

source
MIToS.PDB.hydrogenbondMethod

This function only works if there are hydrogens in the structure. The criteria for a hydrogen bond are:

  • d(Ai, Aj) < 3.9Å
  • d(Ah, Aacc) < 2.5Å
  • θ(Adon, Ah, Aacc) > 90°
  • θ(Adon, Aacc, Aacc-antecedent) > 90°
  • θ(Ah, Aacc, Aacc-antecedent) > 90°

Where Ah is the donated hydrogen atom, Adon is the hydrogen bond donor atom, Aacc is the hydrogen bond acceptor atom and Aacc-antecedent is the atom antecedent to the hydrogen bond acceptor atom.

source
MIToS.PDB.ionicMethod

There's an ionic interaction if a cationic and an anionic atoms are at 6.0 Å or less.

source
MIToS.PDB.is_aminoacidMethod
is_aminoacid(residue::PDBResidue)
-is_aminoacid(residue_id::PDBResidueIdentifier)

This function returns true if the PDB residue is an amino acid residue. It checks if the residue's three-letter name exists in the MIToS.Utils.THREE2ONE dictionary, and returns false otherwise.

source
MIToS.PDB.isresidueMethod
 isresidue(res; model=All, chain=All, group=All, residue=All)

This function tests if a PDBResidue has the indicated model, chain, group and residue names/numbers. You can use the type All (default value) to avoid filtering that level.

source
MIToS.PDB.kabschMethod

kabsch(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})

This function takes two sets of points, A (reference) and B as NxD matrices, where D is the dimension and N is the number of points. Assumes that the centroids of A and B are at the origin of coordinates. You can call center! on each matrix before calling kabsch to center the matrices at (0.0, 0.0, 0.0). Rotates B so that rmsd(A,B) is minimized. Returns the rotation matrix. You should do B * RotationMatrix to get the rotated B.

source
MIToS.PDB.mean_coordinatesMethod

Calculates the average/mean position of each atom in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return a weighted mean. When an AbstractVector{PDBResidue} is used, if the keyword argument calpha is false the mean position is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).

source

Macros

MIToS.PDB.@atomsMacro

@atoms ... model ... chain ... group ... residue ... atom ...

These return a vector of PDBAtoms with the selected subset of atoms from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the select_atoms function instead.

source
MIToS.PDB.@residuesMacro

@residues ... model ... chain ... group ... residue ...

These return a new vector with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the select_residues function instead.

source
MIToS.PDB.@residuesdictMacro

@residuesdict ... model ... chain ... group ... residue ...

This macro returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.

DEPRECATED: This macro is deprecated. Use the residuesdict function instead.

source

Methods and functions

Base.angleMethod

angle(a::Coordinates, b::Coordinates, c::Coordinates)

Angle (in degrees) at b between a-b and b-c

source
Base.anyMethod

any(f::Function, a::PDBResidue, b::PDBResidue, criteria::Function)

Test if the function f is true for any pair of atoms between the residues a and b. This function only tests atoms that return true for the function criteria.

source
Base.anyMethod

any(f::Function, a::PDBResidue, b::PDBResidue)

Test if the function f is true for any pair of atoms between the residues a and b

source
MIToS.PDB.CAmatrixMethod

Returns a matrix with the x, y and z coordinates of the Cα with best occupancy for each PDBResidue of the ATOM group. If a residue doesn't have a Cα, its Cα coordinates are NaNs.

source
MIToS.PDB.center!Method

center!(A::AbstractMatrix{Float64})

Takes a set of points A as an NxD matrix (N: number of points, D: dimension). Translates A in place so that its centroid is at the origin of coordinates

source
MIToS.PDB.centeredcoordinatesFunction

Returns a Matrix{Float64} with the centered coordinates of all the atoms in residues. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.

source
MIToS.PDB.centeredresiduesFunction

Returns a new Vector{PDBResidue} with the PDBResidues having centered coordinates. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.

source
MIToS.PDB.change_coordinatesFunction

change_coordinates(residue::PDBResidue, coordinates::AbstractMatrix{Float64}, offset::Int=1)

Returns a new PDBResidue with (x,y,z) from a coordinates AbstractMatrix{Float64}. You can give an offset indicating in which matrix row the (x,y,z) coordinates of the residue start.

source
MIToS.PDB.change_coordinatesMethod

change_coordinates(residues::AbstractVector{PDBResidue}, coordinates::AbstractMatrix{Float64})

Returns a new Vector{PDBResidues} with (x,y,z) from a coordinates Matrix{Float64}

source
MIToS.PDB.contactMethod

contact(a::Coordinates, b::Coordinates, limit::AbstractFloat)

It returns true if the distance is less or equal to the limit. It doesn't call sqrt because it does squared_distance(a,b) <= limit^2.

source
MIToS.PDB.contactMethod

contact(A::PDBResidue, B::PDBResidue, limit::AbstractFloat; criteria::String="All")

Returns true if the residues A and B are at contact distance (limit). The available distance criteria are: Heavy, All, CA, CB (CA for GLY)

source
MIToS.PDB.contactMethod

contact(residues::Vector{PDBResidue}, limit::AbstractFloat; criteria::String="All")

If contact takes a Vector{PDBResidue}, it returns a matrix with all the pairwise comparisons (contact map).

source
MIToS.PDB.covalentMethod

Returns true if the distance between atoms is less than the sum of the covalentradius of each atom.

source
MIToS.PDB.distanceMethod

distance(residues::Vector{PDBResidue}; criteria::String="All")

If distance takes a Vector{PDBResidue} returns a PairwiseListMatrix{Float64, false} with all the pairwise comparisons (distance matrix).

source
MIToS.PDB.download_alphafold_structureMethod
download_alphafold_structure(uniprot_accession::String; format::Type{T}=MMCIFFile) where T<:FileFormat

This function downloads the structure file (PDB or mmCIF) for a given UniProt Accession from AlphaFoldDB. The uniprot_accession parameter specifies the UniProt Accession of the protein, e.g. "P00520". The format parameter specifies the file format to download, with the default being mmCIF, i.e. MMCIFFile. You can set format to PDBFile if you want to download a PDB file.

source
MIToS.PDB.downloadpdbMethod
downloadpdb(pdbcode::String; format::Type{T} = MMCIFFile, filename, baseurl, kargs...)

It downloads a gzipped PDB file from the PDB database. It requires a four-character pdbcode. Its default format is MMCIFFile (mmCIF) and it uses the baseurl "http://www.rcsb.org/pdb/files/". filename is the path/name of the output file. This function calls MIToS.Utils.download_file that calls Downloads.download. So, you can use keyword arguments, such as headers, from that function.

source
MIToS.PDB.findatomsMethod

findatoms(res::PDBResidue, atom::String)

Returns an index vector of the atoms with the given atom name.

source
MIToS.PDB.findheavyMethod

Returns a list with the index of the heavy atoms (all atoms except hydrogen) in the PDBResidue

source
MIToS.PDB.getCAMethod

Returns the Cα with best occupancy in the PDBResidue. If the PDBResidue has no Cα, missing is returned.

source
MIToS.PDB.getpdbdescriptionMethod

Access general information about a PDB entry (e.g., Header information) using the GraphQL interface of the PDB database. It parses the JSON answer into a JSON3.Object that can be used as a dictionary.

source
MIToS.PDB.hydrogenbondMethod

This function only works if there are hydrogens in the structure. The criteria for a hydrogen bond are:

  • d(Ai, Aj) < 3.9Å
  • d(Ah, Aacc) < 2.5Å
  • θ(Adon, Ah, Aacc) > 90°
  • θ(Adon, Aacc, Aacc-antecedent) > 90°
  • θ(Ah, Aacc, Aacc-antecedent) > 90°

Where Ah is the donated hydrogen atom, Adon is the hydrogen bond donor atom, Aacc is the hydrogen bond acceptor atom and Aacc-antecedent is the atom antecedent to the hydrogen bond acceptor atom.

source
MIToS.PDB.ionicMethod

There's an ionic interaction if a cationic and an anionic atoms are at 6.0 Å or less.

source
MIToS.PDB.is_aminoacidMethod
is_aminoacid(residue::PDBResidue)
+is_aminoacid(residue_id::PDBResidueIdentifier)

This function returns true if the PDB residue is an amino acid residue. It checks if the residue's three-letter name exists in the MIToS.Utils.THREE2ONE dictionary, and returns false otherwise.

source
MIToS.PDB.isresidueMethod
 isresidue(res; model=All, chain=All, group=All, residue=All)

This function tests if a PDBResidue has the indicated model, chain, group and residue names/numbers. You can use the type All (default value) to avoid filtering that level.

source
MIToS.PDB.kabschMethod

kabsch(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})

This function takes two sets of points, A (reference) and B as NxD matrices, where D is the dimension and N is the number of points. Assumes that the centroids of A and B are at the origin of coordinates. You can call center! on each matrix before calling kabsch to center the matrices at (0.0, 0.0, 0.0). Rotates B so that rmsd(A,B) is minimized. Returns the rotation matrix. You should do B * RotationMatrix to get the rotated B.

source
MIToS.PDB.mean_coordinatesMethod

Calculates the average/mean position of each atom in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return a weighted mean. When an AbstractVector{PDBResidue} is used, if the keyword argument calpha is false the mean position is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).

source
MIToS.PDB.modelled_sequencesMethod
modelled_sequences(residue_list::AbstractArray{PDBResidue,N}; 
     model::Union{String,Type{All}}=All, chain::Union{String,Type{All}}=All, 
-    group::Union{String,Regex,Type{All}}=All) where N

This function returns an OrderedDict where each key is a named tuple (containing the model and chain identifiers), and each value is the protein sequence corresponding to the modelled residues in those chains. Therefore, the obtained sequences do not contain missing residues. All modelled residues are included by default, but those that don't satisfy specified criteria based on the model, chain, or group keyword arguments are excluded. One-letter residue names are obtained from the MIToS.Utils.THREE2ONE dictionary for all residue names that return true for is_aminoacid.

source
MIToS.PDB.picationMethod

There's a Π-Cation interaction if a cationic and an aromatic atoms are at 6.0 Å or less

source
MIToS.PDB.proximitymeanMethod

proximitymean calculates the proximity mean/average for each residue as the average score (from a scores list) of all the residues within a certain physical distance to a given amino acid. The score of that residue is not included in the mean unless you set include to true. The default values are 6.05 for the distance threshold/limit and "Heavy" for the criteria keyword argument. This function allows calculating pMI (proximity mutual information) and pC (proximity conservation) as in Buslje et al.

References

source
MIToS.PDB.query_alphafolddbMethod
query_alphafolddb(uniprot_accession::String)

This function queries the AlphaFoldDB API to retrieve structure information for a given uniprot_accession, e.g. "P00520". This function returns the structure information as a JSON3.Object.

source
MIToS.PDB.residuepairsmatrixMethod

It creates a NamedArray containing a PairwiseListMatrix where each element (column, row) is identified with a PDBResidue from the input vector. You can indicate the value type of the matrix (default to Float64), if the list should have the diagonal values (default to Val{false}) and the diagonal values (default to NaN).

source
MIToS.PDB.residuesMethod

The residues function for AbstractArray{PDBResidue,N} is deprecated. Use the select_residues function instead. So, residues(residue_list, model, chain, group, residue) becomes select_residues(residue_list; model=model, chain=chain, group=group, residue=residue).

source
MIToS.PDB.residuesdictMethod
 residuesdict(residue_list; model=All, chain=All, group=All, residue=All)

This function returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues. The residues are selected using the keyword arguments model, chain, group and residue. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.rmsdMethod

rmsd(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})

Return RMSD between two sets of points A and B, given as NxD matrices (N: number of points, D: dimension).

source
MIToS.PDB.rmsdMethod

rmsd(A::AbstractVector{PDBResidue}, B::AbstractVector{PDBResidue}; superimposed::Bool=false)

Returns the Cα RMSD value between two PDB structures: A and B. If the structures are already superimposed between them, use superimposed=true to avoid a new superimposition (superimposed is false by default).

source
MIToS.PDB.rmsfMethod

Calculates the RMSF (Root Mean-Square-Fluctuation) between an atom and its average position in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return the root weighted mean-square-fluctuation around the weighted mean structure. When a Vector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).

source
MIToS.PDB.select_atomsMethod
select_atoms(residue_list; model=All, chain=All, group=All, residue=All, atom=All, alt_id=All, charge=All)

This function returns a vector of PDBAtoms with the selected subset of atoms from a list of residues. The atoms are selected using the keyword arguments model, chain, group, residue, atom, alt_id, and charge. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.select_residuesMethod
select_residues(residue_list; model=All, chain=All, group=All, residue=All)

This function returns a new vector with the selected subset of residues from a list of residues. You can use the keyword arguments model, chain, group and residue to select the residues. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.squared_distanceMethod

squared_distance(A::PDBResidue, B::PDBResidue; criteria::String="All")

Returns the squared distance between the residues A and B. The available criteria are: Heavy, All, CA, CB (CA for GLY)

source
MIToS.PDB.superimposeFunction
Asuper, Bsuper, RMSD = superimpose(A, B, matches=nothing)

This function takes A::AbstractVector{PDBResidue} (reference) and B::AbstractVector{PDBResidue}. Translates A and B to the origin of coordinates, and rotates B so that rmsd(A,B) is minimized with the Kabsch algorithm (using only their α carbons). Returns the rotated and translated versions of A and B, and the RMSD value.

Optionally provide matches which iterates over matched index pairs in A and B, e.g., matches = [(3, 5), (4, 6), ...]. The alignment will be constructed using just the matching residues.

source
MIToS.PDB.vanderwaalsMethod

Test if two atoms or residues are in van der Waals contact using: distance(a,b) <= 0.5 + vanderwaalsradius[a] + vanderwaalsradius[b]. It returns distance <= 0.5 if the atoms aren't in vanderwaalsradius.

source
MIToS.PDB.vanderwaalsclashMethod

Returns true if the distance between the atoms is less than the sum of the vanderwaalsradius of the atoms. If the atoms aren't on the list (e.g. OXT), the vanderwaalsradius of the element is used. If there is no data in the dict, distance 0.0 is used.

source
MIToS.Utils.parse_fileMethod

parse_file(pdbml, ::Type{PDBML}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)

Reads a LightXML.XMLDocument representing a pdb file. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of all residues. If not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.parse_fileMethod

parse_file(io, ::Type{MMCIFFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)

Parse an mmCIF file and returns a list of PDBResidues. Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of residues. Group can be "ATOM" or "HETATM". If those keyword arguments are not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom, and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.parse_fileMethod

parse_file(io, ::Type{PDBFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, occupancyfilter=false)

Reads a text file of a PDB entry. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of all residues. Group can be "ATOM" or "HETATM". If not set, all residues are returned. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.print_fileFunction

print_file(io, res, format::Type{PDBFile}) print_file(res, format::Type{PDBFile})

Print a PDBResidue or a vector of PDBResidues in PDB format.

source
+ group::Union{String,Regex,Type{All}}=All) where N

This function returns an OrderedDict where each key is a named tuple (containing the model and chain identifiers), and each value is the protein sequence corresponding to the modelled residues in those chains. Therefore, the obtained sequences do not contain missing residues. All modelled residues are included by default, but those that don't satisfy specified criteria based on the model, chain, or group keyword arguments are excluded. One-letter residue names are obtained from the MIToS.Utils.THREE2ONE dictionary for all residue names that return true for is_aminoacid.

source
MIToS.PDB.picationMethod

There's a Π-Cation interaction if a cationic and an aromatic atoms are at 6.0 Å or less

source
MIToS.PDB.proximitymeanMethod

proximitymean calculates the proximity mean/average for each residue as the average score (from a scores list) of all the residues within a certain physical distance to a given amino acid. The score of that residue is not included in the mean unless you set include to true. The default values are 6.05 for the distance threshold/limit and "Heavy" for the criteria keyword argument. This function allows calculating pMI (proximity mutual information) and pC (proximity conservation) as in Buslje et al.

References

source
MIToS.PDB.query_alphafolddbMethod
query_alphafolddb(uniprot_accession::String)

This function queries the AlphaFoldDB API to retrieve structure information for a given uniprot_accession, e.g. "P00520". This function returns the structure information as a JSON3.Object.

source
MIToS.PDB.residuepairsmatrixMethod

It creates a NamedArray containing a PairwiseListMatrix where each element (column, row) is identified with a PDBResidue from the input vector. You can indicate the value type of the matrix (default to Float64), if the list should have the diagonal values (default to Val{false}) and the diagonal values (default to NaN).

source
MIToS.PDB.residuesMethod

The residues function for AbstractArray{PDBResidue,N} is deprecated. Use the select_residues function instead. So, residues(residue_list, model, chain, group, residue) becomes select_residues(residue_list; model=model, chain=chain, group=group, residue=residue).

source
MIToS.PDB.residuesdictMethod
 residuesdict(residue_list; model=All, chain=All, group=All, residue=All)

This function returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues. The residues are selected using the keyword arguments model, chain, group and residue. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.rmsdMethod

rmsd(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})

Return RMSD between two sets of points A and B, given as NxD matrices (N: number of points, D: dimension).

source
MIToS.PDB.rmsdMethod

rmsd(A::AbstractVector{PDBResidue}, B::AbstractVector{PDBResidue}; superimposed::Bool=false)

Returns the Cα RMSD value between two PDB structures: A and B. If the structures are already superimposed between them, use superimposed=true to avoid a new superimposition (superimposed is false by default).

source
MIToS.PDB.rmsfMethod

Calculates the RMSF (Root Mean-Square-Fluctuation) between an atom and its average position in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return the root weighted mean-square-fluctuation around the weighted mean structure. When a Vector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).

source
MIToS.PDB.select_atomsMethod
select_atoms(residue_list; model=All, chain=All, group=All, residue=All, atom=All, alt_id=All, charge=All)

This function returns a vector of PDBAtoms with the selected subset of atoms from a list of residues. The atoms are selected using the keyword arguments model, chain, group, residue, atom, alt_id, and charge. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.select_residuesMethod
select_residues(residue_list; model=All, chain=All, group=All, residue=All)

This function returns a new vector with the selected subset of residues from a list of residues. You can use the keyword arguments model, chain, group and residue to select the residues. You can use the type All (default value) to avoid filtering at a particular level.

source
MIToS.PDB.squared_distanceMethod

squared_distance(A::PDBResidue, B::PDBResidue; criteria::String="All")

Returns the squared distance between the residues A and B. The available criteria are: Heavy, All, CA, CB (CA for GLY)

source
MIToS.PDB.superimposeFunction
Asuper, Bsuper, RMSD = superimpose(A, B, matches=nothing)

This function takes A::AbstractVector{PDBResidue} (reference) and B::AbstractVector{PDBResidue}. Translates A and B to the origin of coordinates, and rotates B so that rmsd(A,B) is minimized with the Kabsch algorithm (using only their α carbons). Returns the rotated and translated versions of A and B, and the RMSD value.

Optionally provide matches which iterates over matched index pairs in A and B, e.g., matches = [(3, 5), (4, 6), ...]. The alignment will be constructed using just the matching residues.

source
MIToS.PDB.vanderwaalsMethod

Test if two atoms or residues are in van der Waals contact using: distance(a,b) <= 0.5 + vanderwaalsradius[a] + vanderwaalsradius[b]. It returns distance <= 0.5 if the atoms aren't in vanderwaalsradius.

source
MIToS.PDB.vanderwaalsclashMethod

Returns true if the distance between the atoms is less than the sum of the vanderwaalsradius of the atoms. If the atoms aren't on the list (e.g., OXT), the vanderwaalsradius of the element is used. If there is no data in the dict, distance 0.0 is used.

source
MIToS.Utils.parse_fileMethod

parse_file(pdbml, ::Type{PDBML}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)

Reads a LightXML.XMLDocument representing a pdb file. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of all residues. If not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.parse_fileMethod

parse_file(io, ::Type{MMCIFFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)

Parse an mmCIF file and returns a list of PDBResidues. Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of residues. Group can be "ATOM" or "HETATM". If those keyword arguments are not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom, and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.parse_fileMethod

parse_file(io, ::Type{PDBFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, occupancyfilter=false)

Reads a text file of a PDB entry. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of all residues. Group can be "ATOM" or "HETATM". If not set, all residues are returned. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.

source
MIToS.Utils.print_fileFunction

print_file(io, res, format::Type{PDBFile}) print_file(res, format::Type{PDBFile})

Print a PDBResidue or a vector of PDBResidues in PDB format.

source
diff --git a/dev/Pfam/index.html b/dev/Pfam/index.html index 27b6c648..ba6ded85 100644 --- a/dev/Pfam/index.html +++ b/dev/Pfam/index.html @@ -109,4 +109,4 @@ using ROCAnalysis # You need to load ROCAnalysis to use the AUC function -AUC(ZMIp, cmap)
0.7514388233747185
+AUC(ZMIp, cmap)
0.7514388233747185
diff --git a/dev/Pfam_API/index.html b/dev/Pfam_API/index.html index cc919839..95213302 100644 --- a/dev/Pfam_API/index.html +++ b/dev/Pfam_API/index.html @@ -1,4 +1,4 @@ -Pfam · MIToS

Pfam

MIToS.PfamModule

The Pfam module defines functions to measure the protein contact prediction performance of information measures between column pairs from a Pfam MSA.

Features

  • Read and download Pfam MSAs
  • Obtain PDB information from alignment annotations
  • Map between sequence/alignment residues/columns and PDB structures
  • Measure of AUC (ROC curve) for contact prediction of MI scores
using MIToS.Pfam
source

Contents

Types

Constants

Macros

Methods and functions

MIToS.Pfam.downloadpfamMethod

It downloads a gzipped Stockholm alignment from InterPro for the Pfam family with the given pfamcode.

By default, it downloads the full Pfam alignment. You can use the alignment keyword argument to download the seed or the uniprot alignment instead. For example, downloadpfam("PF00069") will download the full alignment for the PF00069 Pfam family, while downloadpfam("PF00069", alignment="seed") will download the seed alignment of the family.

The extension of the downloaded file is .stockholm.gz by default; you can change it using the filename keyword argument, but the .gz at the end is mandatory.

source
MIToS.Pfam.getcontactmasksMethod

This function takes a msacontacts or its list of contacts contact_list with 1.0 for true contacts and 0.0 for non-contacts (NaN or other numbers for missing values). Returns two BitVectors, the first with trues where contact_list is 1.0 and the second with trues where contact_list is 0.0. These are useful for AUC calculations.

source
MIToS.Pfam.getseq2pdbMethod

Generates from a Pfam msa a Dict{String, Vector{Tuple{String,String}}}. Keys are sequence IDs and each value is a list of tuples containing PDB code and chain.

julia> getseq2pdb(msa)
+Pfam · MIToS

Pfam

MIToS.PfamModule

The Pfam module defines functions to measure the protein contact prediction performance of information measures between column pairs from a Pfam MSA.

Features

  • Read and download Pfam MSAs
  • Obtain PDB information from alignment annotations
  • Map between sequence/alignment residues/columns and PDB structures
  • Measure of AUC (ROC curve) for contact prediction of MI scores
using MIToS.Pfam
source

Contents

Types

Constants

Macros

Methods and functions

MIToS.Pfam.downloadpfamMethod

It downloads a gzipped Stockholm alignment from InterPro for the Pfam family with the given pfamcode.

By default, it downloads the full Pfam alignment. You can use the alignment keyword argument to download the seed or the uniprot alignment instead. For example, downloadpfam("PF00069") will download the full alignment for the PF00069 Pfam family, while downloadpfam("PF00069", alignment="seed") will download the seed alignment of the family.

The extension of the downloaded file is .stockholm.gz by default; you can change it using the filename keyword argument, but the .gz at the end is mandatory.

source
MIToS.Pfam.getcontactmasksMethod

This function takes a msacontacts or its list of contacts contact_list with 1.0 for true contacts and 0.0 for non-contacts (NaN or other numbers for missing values). Returns two BitVectors, the first with trues where contact_list is 1.0 and the second with trues where contact_list is 0.0. These are useful for AUC calculations.

source
MIToS.Pfam.getseq2pdbMethod

Generates from a Pfam msa a Dict{String, Vector{Tuple{String,String}}}. Keys are sequence IDs and each value is a list of tuples containing PDB code and chain.

julia> getseq2pdb(msa)
 Dict{String,Array{Tuple{String,String},1}} with 1 entry:
-  "F112_SSV1/3-112" => [("2VQC","A")]
source
MIToS.Pfam.msacolumn2pdbresidueMethod

msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)

This function returns a OrderedDict{Int,String} with MSA column numbers on the input file as keys and PDB residue numbers ("" for missings) as values. The mapping is performed using SIFTS. This function needs correct ColMap and SeqMap annotations. This checks correspondence of the residues between the MSA sequence and SIFTS (It throws a warning if there are differences). Missing residues are included if the keyword argument missings is true (default: true). If the keyword argument strict is true (default: false), throws an Error, instead of a Warning, when residues don't match. If the keyword argument checkpdbname is true (default: false), throws an Error if the three letter name of the PDB residue isn't the MSA residue. If you are working with a downloaded Pfam MSA without modifications, you should read it using generatemapping=true and useidcoordinates=true. If you don't indicate the path to the siftsfile used in the mapping, this function downloads the SIFTS file in the current folder. If you don't indicate the Pfam accession number (pfamid), this function tries to read the AC file annotation.

source
MIToS.Pfam.msacontactsFunction

This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:

  1. The first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.
  2. The second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.

msacontacts returns a PairwiseListMatrix{Float64,false} of 0.0 and 1.0 where 1.0 indicates a residue contact. Contacts are defined with an inter residue distance less or equal to distance_limit (default to 6.05) angstroms between any heavy atom. NaN indicates a missing value.

source
MIToS.Pfam.msaresiduesMethod

This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:

  1. The first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.
  2. The second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.

msaresidues returns an OrderedDict{Int,PDBResidue} from input column number (ColMap) to PDBResidue. Residues on inserts are not included.

source
+ "F112_SSV1/3-112" => [("2VQC","A")]
source
MIToS.Pfam.msacolumn2pdbresidueMethod

msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)

This function returns a OrderedDict{Int,String} with MSA column numbers on the input file as keys and PDB residue numbers ("" for missings) as values. The mapping is performed using SIFTS. This function needs correct ColMap and SeqMap annotations. This checks correspondence of the residues between the MSA sequence and SIFTS (It throws a warning if there are differences). Missing residues are included if the keyword argument missings is true (default: true). If the keyword argument strict is true (default: false), throws an Error, instead of a Warning, when residues don't match. If the keyword argument checkpdbname is true (default: false), throws an Error if the three letter name of the PDB residue isn't the MSA residue. If you are working with a downloaded Pfam MSA without modifications, you should read it using generatemapping=true and useidcoordinates=true. If you don't indicate the path to the siftsfile used in the mapping, this function downloads the SIFTS file in the current folder. If you don't indicate the Pfam accession number (pfamid), this function tries to read the AC file annotation.

source
MIToS.Pfam.msacontactsFunction

This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:

  1. The first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.
  2. The second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.

msacontacts returns a PairwiseListMatrix{Float64,false} of 0.0 and 1.0 where 1.0 indicates a residue contact. Contacts are defined with an inter residue distance less or equal to distance_limit (default to 6.05) angstroms between any heavy atom. NaN indicates a missing value.

source
MIToS.Pfam.msaresiduesMethod

This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:

  1. The first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.
  2. The second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.

msaresidues returns an OrderedDict{Int,PDBResidue} from input column number (ColMap) to PDBResidue. Residues on inserts are not included.

source
diff --git a/dev/References/index.html b/dev/References/index.html index a9c9f52c..b8bcf0dc 100644 --- a/dev/References/index.html +++ b/dev/References/index.html @@ -1,2 +1,2 @@ -References · MIToS

References

[1]
D. J. Zea, D. Anfossi, M. Nielsen and C. Marino-Buslje. MIToS.jl: mutual information tools for protein sequence analysis in the Julia language. Bioinformatics 33, 564–565 (2017).
[2]
U. Hobohm, M. Scharf, R. Schneider and C. Sander. Selection of representative protein data sets. Protein Science 1, 409–417 (1992).
[3]
C. M. Buslje, J. Santos, J. M. Delfino and M. Nielsen. Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information. Bioinformatics 25, 1125–1131 (2009).
[4]
S. F. Altschul, T. L. Madden, A. A. Schäffer, J. Zhang, Z. Zhang, W. Miller and D. J. Lipman. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic acids research 25, 3389–3402 (1997).
[5]
S. D. Dunn, L. M. Wahl and G. B. Gloor. Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction. Bioinformatics 24, 333–340 (2008).
[6]
S. Velankar, J. M. Dana, J. Jacobsen, G. van Ginkel, P. J. Gane, J. Luo, T. J. Oldfield, C. O’Donovan, M.-J. Martin and G. J. Kleywegt. SIFTS: Structure Integration with Function, Taxonomy and Sequences resource. Nucleic Acids Research 41, D483-D489 (2012).
[7]
P. Stothard. The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28, 1102–1104 (2000).
[8]
B. J. Grant, A. P. Rodrigues, K. M. ElSawy, J. A. McCammon and L. S. Caves. Bio3d: an R package for the comparative analysis of protein structures. Bioinformatics 22, 2695–2696 (2006).
[9]
W. Perks. Some observations on inverse probability including a new indifference rule. Journal of the Institute of Actuaries 73, 285–334 (1947).
[10]
S. Trybula. Some problems of simultaneous minimax estimation. The Annals of Mathematical Statistics 29, 245–253 (1958).
[11]
H. Jeffreys. An invariant form for the prior probability in estimation problems. Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186, 453–461 (1946).
[12]
C. Marino Buslje, E. Teppa, T. Di Doménico, J. M. Delfino and M. Nielsen. Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification. PLoS computational biology 6, e1000978 (2010).
+References · MIToS

References

[1]
D. J. Zea, D. Anfossi, M. Nielsen and C. Marino-Buslje. MIToS.jl: mutual information tools for protein sequence analysis in the Julia language. Bioinformatics 33, 564–565 (2017).
[2]
U. Hobohm, M. Scharf, R. Schneider and C. Sander. Selection of representative protein data sets. Protein Science 1, 409–417 (1992).
[3]
C. M. Buslje, J. Santos, J. M. Delfino and M. Nielsen. Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information. Bioinformatics 25, 1125–1131 (2009).
[4]
S. F. Altschul, T. L. Madden, A. A. Schäffer, J. Zhang, Z. Zhang, W. Miller and D. J. Lipman. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic acids research 25, 3389–3402 (1997).
[5]
S. D. Dunn, L. M. Wahl and G. B. Gloor. Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction. Bioinformatics 24, 333–340 (2008).
[6]
S. Velankar, J. M. Dana, J. Jacobsen, G. van Ginkel, P. J. Gane, J. Luo, T. J. Oldfield, C. O’Donovan, M.-J. Martin and G. J. Kleywegt. SIFTS: Structure Integration with Function, Taxonomy and Sequences resource. Nucleic Acids Research 41, D483-D489 (2012).
[7]
P. Stothard. The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28, 1102–1104 (2000).
[8]
B. J. Grant, A. P. Rodrigues, K. M. ElSawy, J. A. McCammon and L. S. Caves. Bio3d: an R package for the comparative analysis of protein structures. Bioinformatics 22, 2695–2696 (2006).
[9]
W. Perks. Some observations on inverse probability including a new indifference rule. Journal of the Institute of Actuaries 73, 285–334 (1947).
[10]
S. Trybula. Some problems of simultaneous minimax estimation. The Annals of Mathematical Statistics 29, 245–253 (1958).
[11]
H. Jeffreys. An invariant form for the prior probability in estimation problems. Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186, 453–461 (1946).
[12]
C. Marino Buslje, E. Teppa, T. Di Doménico, J. M. Delfino and M. Nielsen. Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification. PLoS computational biology 6, e1000978 (2010).
diff --git a/dev/SIFTS/index.html b/dev/SIFTS/index.html index 0c596af3..9288be73 100644 --- a/dev/SIFTS/index.html +++ b/dev/SIFTS/index.html @@ -164,4 +164,4 @@ "But there are ", sum([res.missing for res in sifts_1jqz]), " missing residues in the PDB file.", - )But there are 10 missing residues in the PDB file. + )But there are 10 missing residues in the PDB file. diff --git a/dev/SIFTS_API/index.html b/dev/SIFTS_API/index.html index 28cf2bc4..0be7063e 100644 --- a/dev/SIFTS_API/index.html +++ b/dev/SIFTS_API/index.html @@ -1,5 +1,5 @@ -SIFTS · MIToS

SIFTS

MIToS.SIFTSModule

The SIFTS module of MIToS allows you to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes it easy to assign PDB residues to UniProt/Pfam positions. Given the fact that pairwise alignments can lead to misleading associations between residues in both sequences, SIFTS offers a more reliable association between sequence and structure residue numbers.

Features

  • Download and parse SIFTS XML files
  • Store residue-level mapping in Julia
  • Easy generation of OrderedDicts between residues numbers
using MIToS.SIFTS
source

Contents

Types

MIToS.SIFTS.SIFTSResidueType

A SIFTSResidue object stores the SIFTS residue level mapping for a residue. It has the following fields that you can access at any moment for query purposes:

- `PDBe` : A `dbPDBe` object, it's present in all the `SIFTSResidue`s.
+SIFTS · MIToS

SIFTS

MIToS.SIFTSModule

The SIFTS module of MIToS allows you to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes it easy to assign PDB residues to UniProt/Pfam positions. Given the fact that pairwise alignments can lead to misleading associations between residues in both sequences, SIFTS offers a more reliable association between sequence and structure residue numbers.

Features

  • Download and parse SIFTS XML files
  • Store residue-level mapping in Julia
  • Easy generation of OrderedDicts between residues numbers
using MIToS.SIFTS
source

Contents

Types

MIToS.SIFTS.SIFTSResidueType

A SIFTSResidue object stores the SIFTS residue level mapping for a residue. It has the following fields that you can access at any moment for query purposes:

- `PDBe` : A `dbPDBe` object, it's present in all the `SIFTSResidue`s.
 - `UniProt` : A `dbUniProt` object or `missing`.
 - `Pfam` : A `dbPfam` object or `missing`.
 - `NCBI` : A `dbNCBI` object or `missing`.
@@ -12,4 +12,4 @@
 - `Ensembl` : An array of `dbEnsembl` objects.
 - `missing` : It's `true` if the residue is missing, i.e. not observed, in the structure.
 - `sscode` : A string with the secondary structure code of the residue.
-- `ssname` : A string with the secondary structure name of the residue.
source
MIToS.SIFTS.dbEnsemblType

dbEnsembl stores the residue (gene) accession id, the transcript, translation and exon ids in Ensembl as strings, together with the residue number and name using the UniProt coordinates.

source
MIToS.SIFTS.dbSCOP2BType

dbSCOP2B stores the residue id, number, name and chain in SCOP2B as strings. SCOP2B is expansion of SCOP2 domain annotations at superfamily level to every PDB with same UniProt accession having at least 80% SCOP2 domain coverage.

source

Constants

Macros

Methods and functions

MIToS.SIFTS.downloadsiftsMethod
downloadsifts(pdbcode::String; filename::String, source::String="https")

Download the gzipped SIFTS XML file for the provided pdbcode. The downloaded file will have the default extension .xml.gz. While you can change the filename, it must include the .xml.gz ending. The source keyword argument is set to "https" by default. Alternatively, you can choose "ftp" as the source, which will retrieve the file from the EBI FTP server at ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/. However, please note that using "https" is highly recommended. This option will download the file from the EBI PDBe server at https://www.ebi.ac.uk/pdbe/files/sifts/.

source
MIToS.SIFTS.siftsmappingMethod

Parses a SIFTS XML file and returns an OrderedDict between residue numbers of two databases with the given identifiers. A chain can be specified (All by default). If missings is true (default) all the residues are used, even if they don’t have coordinates in the PDB file.

source
MIToS.Utils.parse_fileMethod

parse_file(document::LightXML.XMLDocument, ::Type{SIFTSXML}; chain=All, missings::Bool=true)

Returns a Vector{SIFTSResidue} parsed from a SIFTSXML file. By default, parses all the chains and includes missing residues.

source
+- `ssname` : A string with the secondary structure name of the residue.
source
MIToS.SIFTS.dbEnsemblType

dbEnsembl stores the residue (gene) accession id, the transcript, translation and exon ids in Ensembl as strings, together with the residue number and name using the UniProt coordinates.

source
MIToS.SIFTS.dbSCOP2BType

dbSCOP2B stores the residue id, number, name and chain in SCOP2B as strings. SCOP2B is expansion of SCOP2 domain annotations at superfamily level to every PDB with same UniProt accession having at least 80% SCOP2 domain coverage.

source

Constants

Macros

Methods and functions

MIToS.SIFTS.downloadsiftsMethod
downloadsifts(pdbcode::String; filename::String, source::String="https")

Download the gzipped SIFTS XML file for the provided pdbcode. The downloaded file will have the default extension .xml.gz. While you can change the filename, it must include the .xml.gz ending. The source keyword argument is set to "https" by default. Alternatively, you can choose "ftp" as the source, which will retrieve the file from the EBI FTP server at ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/. However, please note that using "https" is highly recommended. This option will download the file from the EBI PDBe server at https://www.ebi.ac.uk/pdbe/files/sifts/.

source
MIToS.SIFTS.siftsmappingMethod

Parses a SIFTS XML file and returns an OrderedDict between residue numbers of two databases with the given identifiers. A chain can be specified (All by default). If missings is true (default) all the residues are used, even if they don’t have coordinates in the PDB file.

source
MIToS.Utils.parse_fileMethod

parse_file(document::LightXML.XMLDocument, ::Type{SIFTSXML}; chain=All, missings::Bool=true)

Returns a Vector{SIFTSResidue} parsed from a SIFTSXML file. By default, parses all the chains and includes missing residues.

source
diff --git a/dev/Scripts/index.html b/dev/Scripts/index.html index 456dad4a..2f4a1e33 100644 --- a/dev/Scripts/index.html +++ b/dev/Scripts/index.html @@ -386,4 +386,4 @@ Bioinformatics Unit Leloir Institute Foundation -Av. Patricias Argentinas 435, CP C1405BWE, Buenos Aires, Argentina +Av. Patricias Argentinas 435, CP C1405BWE, Buenos Aires, Argentina diff --git a/dev/Utils_API/index.html b/dev/Utils_API/index.html index aed8b1d3..8373e76b 100644 --- a/dev/Utils_API/index.html +++ b/dev/Utils_API/index.html @@ -1,14 +1,14 @@ -Utils · MIToS

Utils

MIToS.UtilsModule

The Utils has common utils functions and types used in other modules.

using MIToS.Utils
source

Contents

Types

MIToS.Utils.AllType

All is used instead of MIToS 1.0 "all" or "*", because it's possible to dispatch on it.

source
MIToS.Utils.FileFormatType

FileFormat is used to define special parse_file (called by read_file) and print_file (called by write_file) methods for different file formats.

source

Constants

MIToS.Utils.THREE2ONEConstant

THREE2ONE is a dictionary that maps three-letter amino acid residue codes (String) to their corresponding one-letter codes (Char). The dictionary is generated by parsing components.cif file from the Protein Data Bank.

julia> using MIToS.Utils
+Utils · MIToS

Utils

MIToS.UtilsModule

The Utils has common utils functions and types used in other modules.

using MIToS.Utils
source

Contents

Types

MIToS.Utils.AllType

All is used instead of MIToS 1.0 "all" or "*", because it's possible to dispatch on it.

source
MIToS.Utils.FileFormatType

FileFormat is used to define special parse_file (called by read_file) and print_file (called by write_file) methods for different file formats.

source

Constants

MIToS.Utils.THREE2ONEConstant

THREE2ONE is a dictionary that maps three-letter amino acid residue codes (String) to their corresponding one-letter codes (Char). The dictionary is generated by parsing components.cif file from the Protein Data Bank.

julia> using MIToS.Utils
 
 julia> one_letter_code = THREE2ONE["ALA"]
-'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
source

Macros

Methods and functions

MIToS.Utils.check_fileMethod

Returns the filename. Throws an ErrorException if the file doesn't exist, or a warning if the file is empty.

source
MIToS.Utils.download_fileMethod

download_file uses Downloads.jl to download files from the web. It takes the file url as first argument and, optionally, a path to save it. Keyword arguments are directly passed to Downloads.download.

julia> using MIToS.Utils
+'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)
source

Macros

Methods and functions

MIToS.Utils.check_fileMethod

Returns the filename. Throws an ErrorException if the file doesn't exist, or a warning if the file is empty.

source
MIToS.Utils.download_fileMethod

download_file uses Downloads.jl to download files from the web. It takes the file url as first argument and, optionally, a path to save it. Keyword arguments are directly passed to Downloads.download.

julia> using MIToS.Utils
 
 julia> download_file("https://www.uniprot.org/uniprot/P69905.fasta", "seq.fasta")
-"seq.fasta"
source
MIToS.Utils.get_n_wordsMethod

get_n_words{T <: Union{ASCIIString, UTF8String}}(line::T, n::Int) It returns a Vector{T} with the first n (possible) words/fields (delimited by space or tab). If there are more than n words, the last word returned contains the final words and the delimiters. The length of the returned vector is n or less (if the number of words is less than n). This is used for parsing the Stockholm format.

julia> using MIToS.Utils
+"seq.fasta"
source
MIToS.Utils.get_n_wordsMethod

get_n_words{T <: Union{ASCIIString, UTF8String}}(line::T, n::Int) It returns a Vector{T} with the first n (possible) words/fields (delimited by space or tab). If there are more than n words, the last word returned contains the final words and the delimiters. The length of the returned vector is n or less (if the number of words is less than n). This is used for parsing the Stockholm format.

julia> using MIToS.Utils
 
 julia> get_n_words("#=GR O31698/18-71 SS    CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH", 3)
 3-element Vector{String}:
  "#=GR"
  "O31698/18-71"
- "SS    CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH"
source
MIToS.Utils.list2matrixMethod

Returns a square symmetric matrix from the vector vec. side is the number of rows/columns. The diagonal is not included by default, set to true if there are diagonal elements in the list.

source
MIToS.Utils.matrix2listMethod

Returns a vector with the part ("upper" or "lower") of the square matrix mat. The diagonal is not included by default.

source
MIToS.Utils.read_fileMethod

read_file(pathname, FileFormat [, Type [, … ] ] ) -> Type

This function opens a file in the pathname and calls parse_file(io, ...) for the given FileFormat and Type on it. If the pathname is an HTTP or FTP URL, the file is downloaded with download in a temporary file. Gzipped files should end on .gz.

source
MIToS.Utils.select_elementMethod

Selects the first element of the vector. This is useful for unpacking one element vectors. Throws a warning if there are more elements. element_name is element by default, but the name can be changed using the second argument.

source
MIToS.Utils.write_fileMethod

write_file{T<:FileFormat}(filename::AbstractString, object, format::Type{T}, mode::ASCIIString="w")

This function opens a file with filename and mode (default: "w") and writes (print_file) the object with the given format. Gzipped files should end on .gz.

source
+ "SS CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH"
source
MIToS.Utils.list2matrixMethod

Returns a square symmetric matrix from the vector vec. side is the number of rows/columns. The diagonal is not included by default, set to true if there are diagonal elements in the list.

source
MIToS.Utils.matrix2listMethod

Returns a vector with the part ("upper" or "lower") of the square matrix mat. The diagonal is not included by default.

source
MIToS.Utils.read_fileMethod

read_file(pathname, FileFormat [, Type [, … ] ] ) -> Type

This function opens a file in the pathname and calls parse_file(io, ...) for the given FileFormat and Type on it. If the pathname is an HTTP or FTP URL, the file is downloaded with download in a temporary file. Gzipped files should end on .gz.

source
MIToS.Utils.select_elementMethod

Selects the first element of the vector. This is useful for unpacking one element vectors. Throws a warning if there are more elements. element_name is element by default, but the name can be changed using the second argument.

source
MIToS.Utils.write_fileMethod

write_file{T<:FileFormat}(filename::AbstractString, object, format::Type{T}, mode::ASCIIString="w")

This function opens a file with filename and mode (default: "w") and writes (print_file) the object with the given format. Gzipped files should end on .gz.

source
diff --git a/dev/index.html b/dev/index.html index c3a46cfd..53d46f3c 100644 --- a/dev/index.html +++ b/dev/index.html @@ -1,4 +1,4 @@ Home · MIToS
MIToS MIToS

A Julia Package to Analyze Protein Sequences, Structures, and Evolutionary Information

Modules

MIToS tools are separated into different modules for different tasks.

  • MSA: This module defines multiple functions and types for dealing with Multiple Sequence Alignments (MSAs) and their annotations. It also includes facilities for sequence clustering and shuffling, among others.
  • PDB: This module defines types and methods to work with protein structures from different sources, such as the Protein Data Bank (PDB) or AlphaFold DB. It includes functions to superpose structures, measure the distance between residues, and much more.
  • Information: This module defines residue contingency tables and methods on them to estimate information measures. This allows measuring evolutionary information on MSA positions. It includes functions to estimate corrected mutual information (ZMIp, ZBLMIp) between MSA columns, as well as conservation estimations using Shannon entropy and the Kullback-Leibler divergence.
  • SIFTS: This module allows access to SIFTS residue-level mapping of UniProt, Pfam, and other databases with PDB entries.
  • Pfam: This module uses the previous modules to work with Pfam MSAs. It also has useful parameter optimization functions to be used with Pfam alignments.
  • Utils: MIToS also has a Utils module with common utility functions and types used in different modules of this package.

Citation

If you use MIToS [1], please cite:

Diego J. Zea, Diego Anfossi, Morten Nielsen, Cristina Marino-Buslje; MIToS.jl: mutual information tools for protein sequence analysis in the Julia language, Bioinformatics, Volume 33, Issue 4, 15 February 2017, Pages 564–565, https://doi.org/10.1093/bioinformatics/btw646

Older MIToS versions

You can change the MIToS version of the documentation at the bottom left of this site—the oldest version available there is MIToS 2.0. If you are using MIToS v1 with a pre-1.0 version of Julia, please read this older documentation instead.

Acknowledgments

MIToS was initially developed at the Structural Bioinformatics Unit of the Fundación Instituto Leloir (FIL) in Argentina. Its development now continues at the Molecular Assemblies and Genome Integrity group of the Institute for Integrative Biology of the Cell (I2BC) in France.

We want to thank all contributors who have helped improve MIToS. We also thank the Julia community and all the MIToS users for their feedback and support.

FIL and I2BC logos -FIL and I2BC logos
+FIL and I2BC logos diff --git a/dev/inf_entropy.png b/dev/inf_entropy.png index 6abf71f539d8a3ab22fae680596a08874a831f4f..1899b49a5b8d7a97aa98e64d3abdd4a165ef56a2 100644 GIT binary patch literal 17625 zcmdUXcRbhq|E{)BS(OnYWskDTUS(uUHjzEc&TNPzsf3c5nZ1QjR0xs1Rg#@e_PN~O zGk)iB&iS45JAa+yemp++jnC)p{eHck<9c4#^}2)9ROAj3(GU?35FApFm(?I3*sViA zu#1y$H-7TOSnmV=wcqrzoGiii&Oh-LsbK^JCkYf}rLK5HPY<{oP*} zWqGnoHYhheeWJ_Q&wfm=IZk3_T;zL~vAf_Ud4?bpssy#PDOSpy*RQ4j?Wa0f4QTce z5U9v7;gu%ny(%n9v&4HrCcgUA1Y+k za?;*@`g8Yraz+ByM_1VC_^n#Tz9mZ7xdxbch{l@AXbAHwGCB-Z`b>47_4C{Q&+jA; ztrfX7pOl|?TUA+7lAwk+r0{0{smz>@A7x7(8*&@HOPi9!NNcO znUk+=TPa1ZCvErz)}5}#*SmM3LPA2t%-r1B%C7{=%E|Fbmf3rkB&E35AK4reDPx9Q#@w`sl7`BQR_b#!%?ru#|~^u0FM1Q=y6Uv8+c zSCElOjOEx`Y*VcXJmT)|?&qi+U}Dh}Rdj2?qBM2j6cdw^lhYaHsOOQ9CFTtgalUIo zr1|*0bz4eLPme&_(U3zyqy#j^kvBiKC$PCO9}U~~uM0PPd&8vmG2Ib*(QB7CHa5P- zNo@C6#&z^d8W|Z`Sp2T_zcKnXChvOf^4K>SR!VW-4UfThx3;%dswS&E&CSidy}g^} ztG3n}YHAV_f^Mw*8Sq;9V~e!ZnPhiwh4oH!(3W zH@`?oeb&_f3j1k)dFE&H^G<@b?eflfZ$+aeGOe*ws1x3&ZmmWjzg8^s=~<{)_QtTj~_oyPCl()lq2)v;ZFQdd{kG_U)^kL~SGd``dQJ(^~=l3);cn@ z$%B&KzSU7y4yhe}la`iyz4+jL=>QWNYyKiVMzY-m?mpX_E485v2#x0YdTKGx#g!qy zD(BI!_}TVypX1!z+`R)w{`3@Bs;H>g*i0aqN*sq&2!n!y*MA1{L`O#}Dk>Tn8gg)O zG{=anWKVl}c%-GJ{rde|SVRPI8cNM~iJAP(n>Rt9ei9$&*3r>1c;_*fkyAOpxX8)D z(bmyX;WYB4z_NM&;S(=nVw4FF5);1?ad$v;1qTNoI&?@(RFsfWzu2z6v9Z6%_Bi9O zrO7UCZtjth5&tJIUc68UJJZwM{cB{T$hPax-??F@pmfAiX=y1|s4Z2M#b4BSqx=2) z?;nFGe*E~sp`E=rJL`cJKO(WgYum}Gw_W%2)R50g46@4FdK~LDJvG(Z*0#C1sbAv2 zMM;F4;4v&~Pf@1TyY-{`etmuYzJ2?!6n-0Xfhj4!AJd7%iu-uZewXT@5_X&N`1vsi z5gIMwXQZsmB`VtA+}zyXzp=Hsjz}#0I{TwWE`&zF`M1(1@6SuO5vV6lKb7}g9f|Dk z@4uR*M<$IB_{IL~*RQ3K}Y;CxUhv)cyScuyzP^4rxivv1kRH(_7>Q3z^_Lc0 zuYJ5IXyW8Fk7`m>RAh`Bnl2d<_n3X((Xq6;FpB*5Szk^b&&tik!v?amvokU>UXI|z zMc*bTn@daY7IP&&bjYy8VE~=wQi-Fzz0X(+W1h~`%uJPKbL>9iW9>J89YA5RXrw{` z8)=AaqsuZZ7sN_5HgbxIi3tm@wZGAH&CI?b?@#bv9e?4^pFel*+zC6QJTX*Nm6w-i zuWRLyduw3^I3kSqNcXCvhuifi%!Bf zx|4y=f}~>E14Ytdpscj4j9l8lz@W@^(rta&o~Hg+ec1TXZ}IW*yY}oSl}6P^1X9w| 
zZ~bn1wXxydV}0rF-MfU02o(wzm3Q8&P6I)?zi-kD+I3?|mj~R3&;uN+{SU6Ktt}aE z4*$3yP4M^Q_{2nOaNzSR>_-XrE`A=&(l6P6h-PtprV5ph?)(i&hl_M{bZ##PuTTWZ zYP@+q?X{Y%^5W{%tJn#TwZ(D7IsT}>vg`k$Rf^pdl+4mzJg`$CexdH1IH;;tEFB{w?(*Y3*FOy=Unj1qu0F_3SrF<4-!>gu|L z(0Vohn)N<1DDJ$FP|2f5kI+8X&It-e)DC|OqeqD^bsjY;bs9l>FCM!#;Nt2U6BFY& zP<^XJc4sIIOq_Zv56mzRw;jzkfzdT*h^g`T+ZEH-uvt8<)^GIz4o zHheMjGJAM_g;7XIh_kcv_e$`DK)H(-KcTYpOo!44q>CS@9T?dBmMF*c^Z_Xo zc@Vw0cil3j`3e;Ug$yzNw19PM5>Msd>V}4lq$F845hQpjF?wHys<9>tU76;Wt}bn3 zV+m&FkBBU!`%;j>og3Da1mZX8>FE>YLg~-n_`W#amXV%bjGkiGb52aG0=uGHU~%Zg zc{NqlnU(~ZIKQo1KE=!EUkC8DZ(nk1s->A(cW-Z}`peqkEL4Lp&)IesGT*!*Q@7xE zbDX24Wn@$oJ9&`r-yeYpW0d#7&!5k*vws;F80hYPA-*g}c+<@daWI>vo|vK%<2l}% zJTYOTM^8#R+axrUX92aEk!~!99`sWNQknVTd`f&Rq495 zrAcBUA}43(PzDM5qeqtpH{Hft%~8}QCMJ;kr!GA@>E9SDwvMLKi<|fA(mKy9Op3&{A*#-&>cc{d@-)3aknwkB2!YK2nzjR@ugQL{Pb*$w_ErXw#@nIT% ztKrWf_wV1A2|VoT<|gj`$3RKxQALHAPpsoa`y14{GN1M0u&_2H<}K0K@bIJS{F_ft z$t}+eZGZo8uRe^$Z>dB3(xprPloB~vS^xX@&yr9aJ9dnoUWFq^C-1d(j!{>R@o~bv z2!wn44>9?Z&~>Kam`efHBV{%g)Z!he@VQ#qRJC1G!WmHrKBX|*VFCMx3akC?Z`lFMasv=x7Hq*&(BH5tzWEU zXc$v*hWl7>Z3t}%usVu5ors%K^!c-A&pxFSA|$v?nr}J!_-vMINMZY{YsMM@y0;cf z4VW6yX}*34G&*B=n|gh$ds9$9)(T`m@;N8e8t8ly}VpRY#*Sm>&=@A3JPyd zksUj}ZCr8|jmmW?_?0ii%=fR+ejuP)tS(N=r|7a&^Tp$HmR9qNv!~+3Du& ztf{SCl9$(0=E_n22D4dvd;4KJ;oQ@%BxQ!!(HAe$Qd4UWPxv4d0SpGp-9zb$*8#hx zQqJ-51wVWUKwMNJW$cMtBpt2*;jkfy`T=Br>!BbUM3*)QQ7biyum_4VUj zIYann1p~+7GBM1`3=9l+B%ee__oHqC^HE5nKuWlbwW!1h17P>`_JRQr-ap{bmJ;&t zp`x#EWpMCiPtP^<0jsZd&CONbtN9@zO=ud&IdxjH^o8z7zUNi)@bJKLi;OL!#2w?( z`!zZFDkdf?HC1~h9{qMGR@gQD)hm4mhioJ*gWpC%c6KFT<&!6*J@p)8GG0titv3Fm z3(R3w!YhEYnAft><;#uCt8s& z3cVPk8JZk|`IVsk>*i>F`n9n2U8RzmnhO;QWkx}>z^MRAeLzcYs7tA;aS`F+hE=`; z0P7z={z8$qEBg8Kr(SK`Y5!RdQDx;O_vCu_1i$&TjEpW}KMRUPzq7z{E!U_5ek`oU zy%*SZfux2%FfmKV>PnckZjc(&Q$S@t35lWi?{AYb9Sa^|f9Ja?%*>pGGJf&m#p2>( zK*_w~;#ed@lz^?p*J!q5!Rw3TG6$w2WDX2)^YBPG{d$1@QNbl-`8E2DW@@X{c?=x2 zh>;V*E>eA5A0F*@u+V?Ju&PP5sPfkuoeUQ9-*|s@Wj8@QY z3=vao--mkPytT2KvPwViA$o<<*BFg 
z$Zwm@D)dS;wQ-#{7$;DOGB2FFow;+Zusg_yya8Ep>LnW6_O`Z#WiaXeSN0OEDp)6DJb^o1EDEtfjf@@qj zb?Vf~leZ7s*N_w4pE4-=6&I}Wi|(F7WO#V}nRiANp2|`DL8L#YrlOxbQRAzA_>jo$ z9x8;3gHaDx*tVFA`H}nhw*U6QxZC@Wl%W9s7#^01MCrt`V2P*4$5%0b0N=6;ixS)p z9r^JCXyL=-AE0l1z(@#_ym*?z| z_T*nQHXiEfDRrIf9ZUi)V18i04V2W@#s#{Q%1lpN<{AkZ z5fM{8eL%Zo(&cW`DBa-W%6)|IP)--H_Z<-g!tYF7ko*<`lHvUM^I!m;M??_q-7PJT z_yZL|r>u#QSWC-XPSXAxO@A}8TRgwbwDj%UcPkkL1o5)j+1UVl2MGyZoHJL%D1n-M zHOD9`F)=JIPQrDf9Wxgd4Gq1p%L6*QG6z~A2NEKpvYW&Er=6@F9E{vIwKA@TAf^BV zk%O-=`x_WUS5){oIptqr2a1p3w|*8F7?_o1Z)TPRj%X^r4|Qw@ybHUGVPP@KeOwNq z;l1`=Xvn(&!yEd^a&_j*m(+B>Es$my+JHzgMx)JRbjElv*^*F$8ADk~3F+*D%mtqF z-dyU!dyM@yi-Ln2fP8@654i4pE$u|W3*H1<pIRdccXe=9$i{zDLx2(J?WWO;P5K zj>d6`U7|(b59biHX!QFTM(g(GCIm=4TBM^ZU77AXj!EOdFWcTMljjlgq6azq4(bmb$n9)5nkfC_b;}*`DRQuyTzrCc6S1-CPRkDj<(0r8sbR%z<2Z#pkEWPlsuZ%(6-Sk9@c#HGvv)fQht>OgP;>Z<}-H&e1pyg9M4^ z^%md8Bw{xv%9n|mNYyb)10?sQPH>#~nc-0PhL${fwT>Bh0# zBLQCi1O)G`agqNgUk`m4Hhg=6o0RZ5-2nmuRuXa(nY1)HgQ^JNiK+7-cx>WLHcfR>$Q`6VzUm(=bP-wZXKr8u_F+**Fga$InHK`#X zA@N+AK+(I{fRJQ3dQ`ys?<^RfeFqK*=|$u1A+IYf{gI_pPSVoV;T+nSBPb{+z#}~c zKb(=#horEzodjIz1(mLnhEF`~`tIe=#>NKDZgFYpw7;yJoY$>eAZN&>m6VkF2L`U2 zno=^H!lNa=nW1K6Gz02ce8sUoXBJ6jTrX#MUs<Ny zT)uOuh?2`HNkLgzS*v5ktZYAXHIjU%#2h?$5P>`ZME>s90_IIwCNlkm{Co|N%bl*6 zWFhvTVfH)&Hh!0JF4(O?T~pJ`vFXhF1*>swk_%ku2c%A*K(QofK0+%0b?*$zT=H^rdC#4D7#dnqP@rGxbed5X6bwIqAuy7-xcIkk z-vA=3osl3dK%& zg2ei|Jn*DNMOp^(*nZ{Fw$i?X$nRbMe&GK9_**?mrbeHI023;6xm^SVaVf{15JJcS z=OFgqesGY{G*7385!EL3jX6jfg4_SdHUGVS)_Gg@#j|Hlpj3YSGAo@2iHlgT@?26( zxPuGx-(Nc;&A<>>>9Y<+XnCT;7DJew5X!#0rE%!v|8{nuJp`IyGAQ~j54z_C>K3jls=Ty z^xCy+kbVpdln6^bmu!%bBJO{b2v^ax&~L%(byy`p2xN`5k>&T6o3)Xdl>Ey>vw~B#_mK! 
zMp8%<5fPD7Q22Oz)0|d9D125Q;*<9C^Fw;`Dp^^vGJfppS^<&i?(Dp#e5N z0=RbR@rl*Jj2NU-lOlE(69wwhG^Zk=$;`gk^EXX_%RsD84pnWVYSEm&v=e2TL$Yr{ zzvwl%y0}De8>YYYz{jH~KmQ+S&;PzM{U86Wvc|KZ|KbJs_cZVS_?7?fuVzafgd(W% zpEGkGv$UaW2gB7I4L3^ED(iGdrbj_Ry3l+vc>*n&HH1BTwz0Vhc4J4ii-_nXBqJ9O zB@M8C#I=WjV5dwbCnIiA%2QKP^6!(>fZ2Er$rA%7pz{eK2ZQ0gsDdpz6R!walwQo% z9FcJO)2~}Ri!26>+7Q9TCnyLYy9BZk!fHFA6cUU-BD75I#gixB$H(8mL_qkhrR9dR z^JOKaD{9wvN~_e;^{YU6n9r^oW32r>I@+ZdX7b|8moHG(@=k*ih3aHD(3g{zb}h1@ zp+VwIdngMf(YQ1cgz*`S~xyvsv0PY1GUKdP4SL_;hzS_1w9 zOBHg^NMa)*u7W#NSAQB2af#>7+B!6@!wlj|ggq5rCD2%NavX<00Y1jxSW@;Ea_9#~ zdj955SqlpbNy+!0Lue1~h(;h(P*1x$ItDNY@2~`_GaA5B-n|oE)-HgIw`c~YRmbPnV8*uk{cmgg>~gN^dU8(F%D(+9dK%Se*MecUE{TjIhAjXl-AR;F2AxE4(f8K7Dz;>X9 z7j%)P8|A~5#c@q@c!T215>eVulLw*Uekk83{0%m)o)$Am^D<~+Nu#?0?0G6A?JWHcusf!E=2w0q$E}-5&9@|E-rlgNVsUJ0#d%d#0MF zYQqVQB_$;V$zz;*?T)mm4O>*bOb8KkxZ=08V|Prb-dssX)lg**_w}4!W0uHN%~Vfk zC~$%wCWwN?sL~q-6JgIqtK-Kbp=!T+`I3ztX?Oecvbv?e-{z7w^ihmGd=#x$$AOvhtt3Y%EP2``3R(tsHVOPiPsd@_YqtbM) z9rHGMkE0at>)4o?ej#jVV9y|Vqy)-x)4BZd6)nn2E+IjIQy@6E+rWUQKg`!qrmJ9= zDou$>e?clHa|)@SeR*=hLv*JGEYwSMiGDK8i%-&#(N@*e467Zk`07(bY8sy{O~|Iz zF{%||la^j+)A7*3--Jp$&Uo@eQXdb+(}&M?2jp0!C4v~7^Lx5|&*6@deLQjD;Z1G9 zYo#FtGp$!Cwaz{Ja<1U&{=y4|xfa(*Y6w3jy({c)tW$&1>>$NV9v%?^+|ubOIoMI? 
z9^N&_P0KqVK0ro$gD#`pQ}UO%RDel$MO*dVySsN_0{xRR38`h{`NyF$>T&UQVUvWA zhr6n)U;mzab4ya*J}!Q0+4j=Xt^=}+ao0dWqiEV?O!`UMTy<#K^)yS@J#5m-Fw1!S z-Me?_8gJ6mJIfRY4kWQjeAw@k3>6YOvH(B7&A@r<@7iTpHd$7h?<+6iY{;@-r2@<9 z=g4y>w2U9WVfT8e!}PSyr<;?GE@p>RqNdI}jiCW-kgH=Arbq?~qeOUzJT}*g!otGx z_oy>Jm0$e%i5Rkg`GME+8gKX+850c4J(rT~rG5?%gDuHHcyWr0S3+_q+WOSubo$I0 zm~5&hOY-h3Dkxyi=s=HhJ2_){{rbY7FvC(0z}xR%zjArt6U)rZ9MoEEO}@-fz_Sx| zv+65^n227zcp=Y_8XOwxx4Gt;V^l#&Nrl-{^KTlTG#_8V#o%N6_UQsrNqI(VNQbN(4kja zmKWb#S%aECHDyOkOx$~?G6gmP4Duy59g5itgFyb^jhK%Go3C37QD7#=v(= z1Oyz=IwU|)fkUqR&sq8S7UqU)caXsZ4U|_!xdVtFGvd?a_2|AZ8cqIf?mlr=?bV}4 zBnJ)z0^MRqk5HeJWhSrJA`SioSD%ZE%LZl*_>wd1{U_@)wP;NajkEaSItpE--M18Tg4garIR;2@)ps>F)c z3)&!h3wgP?nh=FuT_qSXP@92yta{UiuEQJv*>l&fUD(i~&u$C9l_@B~Ffr=s=x`cW zMjzW5O0bNBgM&MM2~7AX&pn0ee&?C1dzItFeSSB-{70pJTUy$&A$7t1PjpzA=06~j z_!uW42}#n()QmOoSZwSdo_xSxy{Z;8#c?jZkGzdY)amJI2$P)JKM;NJXHpOW0>jaT zWO?P25g-44V891VWD^$|8k2BPpyILw@m!jK8B1yku)UL_r!7hz!(o5a_? zy}pRQejmZUe6q6R|rB3*b&x$5Z2W@8123{edhKC@SD=J`xHBhMdGiO*OTX+O@T{mO$ak z6gPbOE4;e;`=dXXVthgwtJ6x$&MwZ+FY}mVs^JAy4e}oVnxv*NPQnkQY2i))fh_JR zywf!T{|$$vVHje}D4axo6ynA5Zu+oQ|GRjeTcUPhNI{mNp|f`J#kxgc>W%QBWpr?pjUC zz-5?`fg5jT>XW<&-b=sF0DPq`76eIGyi``E%2g*J;iZsQPirmo<2nIQ0sxK#RLvU||V_G3ojH*Txr?r7uQu z8v?d->i>WSf?Y#(*;#2$NVN?-RY?sP#=XVkRFdAq5f*_`xjx(iM+G#5b0mPkbm@9SXf6#$G+XWd{#dlH^yzABN?8YZ18o2 zhM$s}+U2kA_xu!A_zwd5^P?SS z=nEl7Bb16L_K6h_q!ieXdo)^j3yU?P!Bjz^9x}x%-odqJ!FXOd2fZ(unViKP=!Ix| zb~T}NB7#6*NJQwbLKz8*knY*~()1+Qiq$b9A$u2v-4qqSAZQ@`2w!`D8>(A}fqIac z1l#4ymr+?EpVn*T78G1}e9&q47gZNTf>G`CbP7u!E7U|hf9a)fq0cHd8wG9VetxpH zwuXp;(B6hcQP6R)|KrmxspURMoUl%wS&=d`H-8xwWn*jG7HO9VmI>0mwFz7}SboHH z+ow&kCxo_Aew#ggU07FKU0|~A>Pi1xSClgmx2bLnXt=mg+{}kS0Yz3ySTwn@-6+n= zRCasnK5>RC!l-G^f&TPmS@s%jC9p1Ro`3)TO-f2a@rOO^En7|YE4xNqeUWIioc)O( z4G=R-cBlu^|9m0bX!*@8u6uPIP;Vx}mLdcX9X{-ES8Y;N&;ny>cD7OTC6soUlhMjw zym|#05U@bdyxY}lhUN5noJw&u{P-0}y+gEukfoggVqG+qk1okcF=gIIWCN*)xO&04 z(UfovrWmLg-9(_h=yvrC4fz3NqP(MTK`+Z0ztl~hIbML4ydKDI?%?pXIEqw5M@OF4!Ol+UGAx;54*hSzqVa3h 
z&9JOYpA!;_ocv~EVxnH4jlz|-}8VO5mgu{}*45}0l4+ezlW9hjP*f3DOqzMtjv z%F&7&Kkk;5c_O!8o$2*_X8&A`m6C9;Q^>RRr0z>TD+BI)7L9*lnc1;9ff7Up3C7uy z7x|V?q`enRhb%}&Muu3xSJlRRa74th^^ERD)E)2vpZfY$p6M@@QKbOGr9_w@lf1sRhUV2O3{BlG z-|?_jqsuCI5#*4Q3YIh2x*vK(3H_>5+!Zv?5#P0nckh6k z#sa0|>RyLH37e8RxqttD>4e->=4!B@twz57PhgG-lKuJn_Zj^U9UbAQO*}X?0zp@3 zT>&DfHEbrR|Lbr-0%=4wANlUZx^&ol!FB$Z0&pB4PL6TaU`xxB!Q1ssO&x7*iH1us z=!HCb)R}DvkYQnOzlqQCk%Yy@ieLZy2rveoHgwhjoT&J7aPNrljqkT(Vg_J+LsHwD zno6(FB%_0kkAHI@M!&%A!hCSvZR!O?8wk@Q#VsTv4a_VolarHu6 z$2w_(gf0 zsa*d9#u*aDz`I*~Tnqv>?I`tQPS>u*S5<8zQc-V*M@Qv|h3vY)J?{h^P{QipzrYCU z7Y?O`=x{yBg#Ob5{B#N#Av+S5>?fLRW~6oocX7#dVjQBhK|f^|m=fLJJM)6SY)JnV;_j!7HR z!8~*%moc?pUcre$Vg3QXpUg6TFFl5%g6ErcM{21|w!->r>O2=eh#_n!CM?rpiW zxw$d~?&SsiHQL5L7inRW1aKIOv}uW#f*}Q|4`*Q?TxRyqb8>Quik9GR86F;nH2n_B z<8MAyintvD)Wyzj3MR8a66$K-a%jkOCr*s@m;k3@O29IqcL8w893Xlk&kO_%5rUte zABud=)ZW6T69GElWL0X!{`VYQ;qUohM3T-6vp-}I)!-@4bZdi(fLFsZ&Zl$lshhxp zk&g?8r=|>?UsU{N6BgD}Rc*!@o>s}ukR|)ViPhuA z-g!5ENGE@U#ljCIO5A6y?yfEP{)KOTU<3*>3jWdum43 z572|K3c$E##Np?8Elf>+0-^6Xrw|CmH_%t6NJJhUB`5zmIq3!GYw^dLnp5Y`zpej# zZ1wjmM|eZq_}6U|u|uc)(N;M`NUb0?OI;8Z6N9Q1KgBvR57wH$r*aL}#{S=01m6$4 zMBMp(`oe{t+FCic*wz;>sBm-%m_4RaNnV~x%(D=wgyBCYX9f--b8R>rXe4~!f$qi+ zGcqRO$F{l@3cRNXy^2k&79tq{Q9Mj+4<0<|i!P^+|B*t-9VoU*+yI zzN_=*%UQ_!TV7t6OAqYdPeV-&-=4}V!H~hs^f#!EFnS&5Hi%!>%QM}_xMOAolDlT9 z0SCk=fH)dT4-U6rfp6cw4Z9NPa>vTm&j4hNjdmD=T<$6;GCj5HEv&Dpk&%;wzt{pt zxd1qpBz~hoVez4xwtf2ss}=n@GeQwEjv8K^?8<=k1yC&<{J1L@mY1;S!?ULlP4t|3~vWiQ#11Opq$XbW#c~NGYb1FPN#)G zf3Ea`_d{djqr)vUAQ2<3HF|0lX6{ z1hGGwR6|W|79;PEnn0YaK^%&Ri^D0u1h9k3n%VOnZ3D+g+P{4}%gXwIR9cP~$FYJQ zJu+kY83c4 zrajoL>+0m;jgYPb{R7}Hi~|g(FGs*^j=GQP4YuMO2~gGxP$}HH`AJA$U@Dkr;MNrJ zm_5lT%Xv(~`|k~KydWn)#JgQhjfSAOwsPO1$QP873Y!7iL z!Ebeb1l{z!h{*AyM{$Toh=(UfylJlo%jvJ)yP|>Lp)i?mS9R;gfdByn91op`4v2*a zkBwCsnqOIQzj+huHI)L3AY2*aPLh(6SUj-FHh*r`Jz-otP}cbjaT-kO1ebIV5zY~S zZAxjtc(10euArop!~^h-T8-&V#u#II9BIDOh@#s-eEdleW9|Y&pFW*tWtA`XU!J=cWQpZI472ElD)&$22rR8;Xd!W;zw7mi~xg3fE3}U_;I_%+4 
zpJZiam10GEz%>5&Hi#aBn0J}imwpfx)%%=H6BJcjTU(DX0JnK0a3~fWK3s8+P@Gx- zCVk!Da^}a5U1i1%H5Wr7GdY6Eea| z|LN1G`~h8z{tfg*(hm|7hr!7qML0whgx}Y`-&qq&%TeS;)g*SO;XV#3>gJn`L81vD zNw^QF8zIk(aRt*;s=;P;wKUE+9Sf!%AJ1D{bSeGH$uBgDeJ<9+-0W1PU@10U=pOb@60 z%`7eXNMs7GG;vk=Zt6j}|7yd=a_W?fs%j40hjs_W3~);pmu8)VQWM)|6$mXD zL6L|C=UQ4?(ngPA`~j2qDk1`OTNCyygh9fH^O$NBzXI4hEv+Riual|}U?qh#D9w+i zbAU`JlOuv@DJfUg)fvfy#J=?6agy0s@59elxidNqUQuXiPSFBm+?aGHHo zWPqnVJv~zm$Qg`5%cqs|>{6Segx&v~*qNx*`Iwlb<>W?dKF54(ZN=8C-9BnNGCX{W zln^xh$YTW=)l1NEF*G)yawA~<0|VQ*fdg(gAG8>0am_Fr3+m5DO)7a35bSl^`Tqi7r=ql_psDZPvj;-YC~Ucj`~F^& z_m4jTm;1_=wuT0|kL}2paLm}K^1OU}BG7g*tfG8U2f$j3Q!Swn9{la9+PXMnp}_nM z{>bUUidWfhVRQM%E7>Hp`_64r4R!UK6YXqkR{^FA^78>-!g&{&z6?pD5f{@evlBI!CXs$ePpvAiDi+$Gan0@+21fqT6hld0 zXJs{wE+9^Yamc2}H)#h7^bp^Mymx$|s^SuL0B$!JOcxX_kRyy{I}>7#O0Vse;u3Z) zHdfZOqE%2&_PvD(>qI0ZW;T-o?0tC)Sa5$*&-}}H&=69Mt5+`>;}WeIwQv>e92&!k zqXNhpU1jd2Mu}icfw4~eQ_<44CCD8(M8ixmQr_*w`3Yq81xb1qPSJ;shnqiP$GzaBfQX<6o5!L_V zMRUxXV7C)1E+uQQRk+!}CuMJs$A_T@n|!KIa*^hKK($U{{9a7-0?lQx3?aIAenm=n z;=~Cv$7dDZtIukO^XOPnUlL!x#w+;gz9L(e$gALJU%!5hSqFkanag-}tde|kD|!v+ z8ovzF$fnNp2L-hmsKhs-;BiB`1OFTV{oehI=0gF;|DYh{^=s4X z*V{^+Eqk~?vSX}@j?PR>Y^(I~>X9194KkfNwJ<-AX&OMK=ZG}ZQ!@(-JP62;4s{&I zi-8u2djKl|k`JJyty!8q{4)r83yaoDbuf|7gFE~Cy+LhL*-}m%2_n6%kCnz3PFsOo y5i_aYYaW;=kHdkgN=n`sc#+Oe2otup7_NvOjM((}bp(PPfx<-<*=*@+cmESOGiHYX literal 16075 zcmc(G2RPUL|E{)3q(KUiRI(Du&KF5`W-_v~%ibDNl2F+R*@W!uN|L=-LWN}Sz0Uo8 ze%D#o|NlSd{I7Ga>wKT<`TctI^%?K?YuvBPUo{<8u=*a2F$wIpPPxbkJeL#gq=B{&)E$}%?yy=V|LKZMUE4zK z-cu!4$am?hskM7;t@Zn4Ec|Vbvn~~_tFO0u{$eyGCg#?c$2v~qKT^KZ(FN$!W@)~g zWc!d-RHVfgoLWpanfmUXs?*-ri;Fg1$LV(c@18j#v51`H-qJNqO-(7OM@4DNG`^B= z^9 z%ZaagG2q!VVcWrPAMfvojAuEFwYhywm|<$E-%G>k@O0VC+?++(Ncf%S|9zeQ?Xq&4 zA0mn9Oys!vl0n!qs~{(f+Oz!PhswVFeLINP(><0y!9KsAdDVnt#^f1cpsJEhTvJMq;&v6)24^Hbn$;@nPXLnlh4Q~J5swW>`yiY}a{J5~ycU~c(;@uuk|KJ7h z{pnmdXfWADZ4)0D)PE$6UC3>5ydg?xU|^v8)W+J9nB5<_Xi*PyM`&9+%F=XSn(W<0eq?=lAYegHhdGGl)TwrD&y7_FC#NN!-M`+5xDg~IM1+N( 
z`uXK#XD?3mZ_QLOcA8eUCMmkPxpiJ3m5`9=%+$-I_Fh?8v6l@A37PCKkKX+YHy-hD zE#F+VASd_3EA#&y;C<;6b@J7C>_a*ByN&nf0euGL+8L)YNo^9wL)6H)reJWwbEXIfaYw-McqUBlC@r6DN}-3$>b> z8shiM7s>B-#>PVr_fVSmT`T+W;nJy7kB(faHp+fX$(Wd!SYyN!{I$Egdt_u}y!E`{ zS9YBu)r9){JNA4pu+hUaBWQ*jqeDYN%G_73vrx;hQ0Mz=vRnU({LXI~FkQvyRaaBf z(zLAS&a>}=NnKd=%hS=9?i#cv%167;${-dm9NJ6L#IBy9?K1zjc|X12;GaLMYipU> zg|-Mbr>VY;mEq`@FJAaiGWHcaP5t=sqpeNV%WEszbJds1cDNxbKy>w6b#=^ttM`3r4shg{- zx~l5mu`#Uw%;v@#cCNmi#;~Ob>R{{jH2$#eq3cu&CO@IxXSHEUPMLdxVo;QGE7WN zR2tkhGaEtB_2gMdALF`_JvcK^B_$<=h)7b1YH4onF0?bXwVlP5yt4S)meQJKST#F0 z*Z5lCBo*oB&!1P9rfkg2j?&OLtuNpGZi3u2ta={BXQAG@&e$ojW06$l`t^o4B5ALu zgMxx8D=W{bzvDiCeg#p*DB}7mFtBN#)OHeHkwWbcZIFyLWHgxRJN0t)}L%wm1iih0VNdsRDB z8^Ym-687W?uBer7c~s<+4Cz@;&P1!-lnf>AE4G9u!NKhaer9H70|SFZ#pummr~a&s z0<%_DLBUl71#{3D7MA!)4l(!T>AM48>!XDDMMXUazWO7E$;rvjojG&DJ3c9?>dP0o zFz(n%h6r_!B~oLa?p(9he4G9^K|!B5dp>>o zR9IM8;X`4#l#%T1z0<;C%(}Pe00B4HvyWD(%2Y-{f%EKHmsRX(lHy*b-Me=;H#gfS zjpDyqSy{h-|E{a6D;>a)Fj3`yOj>aSL5O}^(cT_R_51hlmN@BvZ{MzJRlho`^0(7X zNvSE2NrIF~P*4!rLP<_u+t|p(&aPTke^p6|ib+yM#jYh@20N{pdt145i}6WH$_T2( zIyT*Frjo{OX)+3R0`+gK+;yOxpOoL~`{2MpYmTYn&Mp%y*K32<0EbT{ypM(eK%?$Ubxh4DIR~vcG#~n_kK{&0s7O~00RsnuCE2` zOpx|?y_r~v$6X4N%t5x>#tj()s@@nFHTDIEhLZ1m(9&{!HCH7`!Sda629edSEJL=# z*@idt%G^`dIcs{J(yxE!6!u(S{*H)GziDc0e3*ttI^b}5>fMKRh{+QtZq59ze7v9j zt(R%?E!xEF>{*Jk#}4-PJGYDXT8$p|K6>=1kke$zonHhkEiG|zag={_6u_6PtgP8R zoLpS|=g+6)?O_N=!q0>wzBV-ligYfs&6i6nDk=(C-=y8JYHsPcZb;!@#l0`OQDgJ& zj`VhEIM=vZ+^enY1ig{q;pFe-z*Vo7BF&|{RgMxy>i}=~X(jXz~Bl&Z< z3X6(FUFVbEzWx01Svg4Q%xh5(L8kclcos1+ zul3~_v~JYrvuDrF>#FdCkE83c6)&>AS^^l>@+mGZE-&9|ij^>}3sY88TOhvqLc0di z4H;$kr|zhree(VLpOF7z>(dph^Z7(X*_yOx*Dl1MsgY3|y|6HY4dMyG5`{g@x9TCn zEbJ_B6xw5o>gq&6e?!>$3VgnP{qv_j5^#*0kI%TuT~J7f676QZE9?IK`$){pjEuzE zjrH{qPMsp_UO^8J5664HN@ujQ3@T&9A0FY*iqFjS8g8IhRBSw}l321hFhAAb-`(x< zF~nn4-`vy}--GEl3oi2V?%ThAbINN=mUN`Abj_*X!_?3a2q#WBH<&6L z`psOrcHhO;R<4o~hTbm3F4Bk{XZ{8dl+v<3CV_`Kz?F zG2HITGui`(4=(#KTfk%%XVMEWH 
zKR*)}AWF7xUp#;F#%Nlmu&^+Y^3q`SODyH$SZ9EGPpVqlF@Eby0s=Z8?)*YT=ROV$ z3`Aft3Og%thDu9GHH*C6wqv)mt1C5uor9yBBHN^%&XsS2xSQ$e003VUEG|2d-Ey8mO)lmTU%~ys1FM(tK&$^ zDG`z0PoHAN&s~gt{rdRLn?1&Qoc$j%wuOEvv-)vLZc24zV6w1 zs!!HBi|!qJ9do8PpWc?4nOSgfFefJ`upm7UBswy}HDYSw5`j5r=dN8hZr-%%&Q9pq z?dj>M=ec4mF?=vNX?G+01&mffC#AC$J>1>9;^O4BXh}*=du}b5I)pa8Vq;@F!lnB< zIQY~dvWE{J20ifCz)XZ0EEkwOr9E&A~sCpB28cDj%uW zKPm6x>RRUbY#;BFM~`-uj!Niocb*~P%?#8Ls_Gt#%r*BtSwZG=V{YtZ@CnSJ0F<#3 zoGFM32ZUtD$a?_z*HKaORlCc=kqb!Qz^%w(tuu!%-d-?md}GyH1bckLj7WIH2cd_tAXw`}{A?hVgYWrm(NCO_i&$_M+9yr?K{C zBqyIad)7>j;{5sZ_AOT7av@=1w1*DmV(MDWPlPw2)JVpC37s zWUP{~$7*hBY7LM>6lrd5?mW_?sHn)=+8V9t;>C+S1vaL|Y@D15^70d1StpCpF3%hy zbKO=q5c$?f=4 zvLV_f&H&QSQDPqMrqIeUk-vJCRA^)oqkNIm6eqPWgP7;|*jO>D3*oA^_7B7wwi2W5 zEene@r8A>p_Qg8Ij(~RW-#1lOA_+WDa(OS1k&|<2f4F66Xor3y<#z=5NrHv?{Jw)n zj)c5;@mZnBV|n@)roTi$I|Segl$gqFPrSuum=S@m2zdv*i`1zU&-qUIK7{ z>iS$6p)pzv)6(7kG7s5n*DS59kUl-RW;wS$J^1RWvdn_Imi3*R59Q*zI zdR6*Pj6Rnyf3zvz0Gj1;)r;|1TLcE+5)g1f^W*E8>M7v5aN)w)vobetuHg-lH*WJ+ z7Xq`h5mU>{%b+D5d3#%;?l;AV3ptLH?%gW@`XGIY-?M-i({ao0o|qtkXhqCg;!(gb zoLot`kI2E>?PUVwMU4&*4>vI}nVFfHr*o8-ug3)3Cino?w_lkZ;?gbofU<-owYRsA zzdq8Qrm?m(rFgpch_B?lH8s#OCV3(pK(8Me8h(da8wF4yO7H*y`-rdV{e!uYt{>2f zKr-xQLK+}x=l;~ctR2d0$OUK<(Eq$sP{SMx@$N>BdBeUKOH51i(`I<`;NkmjV!Y#D zOGrqjjw3{dsv}0?d*(NH|N8X{^8^i-uDydpZ=OYH&C{9!?WC91wMINX`dxGSnS$r9 zG#>c+^(#0XAjiur)V`GlDXKLJso2EJlpLO08%s9}tn0sho1UACQ|WpC#UShX^ZjeD z{!qD8#`gep*WB+cu;~Y`IY>Y?Bes*A&owpI@+IWuFO5#C#_uC5q1#9U?v(X|PR+dAX{H7YBIYJ$(rr__9D z<=eG$Cq^lgo_s6d3r9!C{DOk$1+VSfx05mf1tDEP5Xs2Pi@40j&r-Mx+W(#Y+k8?` zJVKi@X)gQ#p{}m3tCWB7$a_Zj>HAk%Fc}-JCGiKoHB=9-`h+cRS)3PD$~`C=J@xnR zZ4(o9U0ns6wKw7Rk)A0?PqR3ym4G8k!7 z*4)~9j-7qqzJ14ejGhDqHKWCLS0|3;yUhKGiHX51*1`meB*dX$3v&f}zL?jhyK7_; zN(LAlpnA<%rFVMeo+t{S27oa!-c8TXtG=->MycA|+&n4${56kp4Zw7_i>%MSquCfA zQM*UR#&Y(UaImxg<>f~`K=W7tegeM299xD`i9K`}@4AaY3o#7#2P3&+wCKioP9we} zx;B1+h9;EbX1@E{L;)%;S-dO4%nb`Ks-ycC585X9gs+xPdnOV?f z5EuU26`~+cAt%E`*o&rIw`FgC|Y1KECXo-rVx_7vl~Sj9^YqpE4@ 
z>CcY~*aBF8cka$M-dJ5Ar=)DGu13DnOz5eo%mLR0JbhZ^I1zD`Wfj~Z`qfp^tlZoZ ztSQJAJPYOVOP3WNQ4=-+KG7@>9X@RJ@|Bp38xY8$Y? z&hHCIyq9)d@&EVv_I)AS?j?TVt^+00uCuGehKGkIBGZnDBaUKsr^>g=a&uLCAFaze zIPlSM{&u$jH@&p9q!IqAWG~qJn6v4cSzz9iT~YDe?xe*yX@LF4^iY(ROx|$d-@oU- z;rRb8cm2PV1pY6+&c{w#TT?^plj7iVuzd#!Nz>cWW8Oqki<|q7tt}5XcbJG+-Tz`#aI;)9j&LW{W>Coj)B1w%odSIp&%vg$~9B* z^xQ;oN|FnIt8^wJM~j5y9;}($J;#!?U zGtMB1z!Wcd2@#&WdBd>L2?PV^&xE;>@FXNZ1UinekN&c|Xz!b*S#(h{w zl51~nZmh6&9sp-n_Wpe!)$hqk&!tI_SyxC8_*H@}b#ijD`MsUEcq$$b@1WPCCxa?W zQjG2cP=oHXmr2BJF*iS7x75`^UA^P!;d8Z#YPO`%7b0xscC8dZMRRL6_Z9cp7sul zj1&?O7#bcvNC3%UWNeHI%|-YIz=-SwGZdMrPR!w??xw@k)YK4s#4y_suBxl619NnT z9zTAZ=fVZOytjnd54J;OYi-rl)y>UWu~Yj|TXN}+csITg!HoUGhvL|6A2oG#b2GE7 z%zTJ}n3y^aAN2{UHUf>aquA(MAQP%g+SH#UAmi^ai}n27m4B z;=(5zj?Ju(D|7P(eM2mLpNW+7Rpbz4#qY>UY!*iJ`mpnc;N|fh2U}b7uLi^%*4ptv zi7hxWA)z@5Abb83ks1Y=1YHJPw~CAD0Vu^$O8qCaLCvMt`agTtk*U{jzkvJYCKq^Q zC}YP=9{uNJ=Kq)RpIabE+88aAG)ax;6j@T=qYoCONET1E^y4Ny#BP%Me`(wQiJNM2 z%@kMzhdXFsf#fO0HMu&OKcfSrPO_oukNgKY_&*Bi|MSgG>zC7!kTBR2y#W8x;4^1t zhh!Z>iBCr=zD##N=TB-el=|COA{K1esC{3&c!4=a&|x&bL>Pq2sZ*5|6>)$-@;Yx; z)vl$s>kyMMZguj2b=Uh(U%nW1z1Ih{x~|J`{P_6P6eB&oh{2a97|{479uaSmTqb@A zY!{FX1|&8vb~ZMxa?cI?8G&qkIELsyO)^Y*y@MkwQLx@vuOV*1ij~`2e2N;{w`^4={ z1vWqU+A$SVL+*VNjNgoS_(Gbm?yI<((Bf3E=@g~PhHizV>V0(Qkmqu{Y-vw)y>)U{yCLM4!whcx}&J52xGNdUSKiSzOYXPmd+ zzlAPJF^W4MBw*42TTh4QeSb_DjHJIYFZG7RU~l@Cjy}mp-@RC)270nM#l=Bc-WS%wxI8m+t1TUyXkPWl5Zxx|Hjy!}uwuv$T=2$!1 zs2b#8WWp6<2C4w}I65YV$|PrGrU+Pl&t>HpNsmt`wyu4yN9mnfj1qa16|7hWWiK=fe0y|v#=DOwLdvRYRHO`K)mn& zocUw2tm&Izyt%y(BN-o_c$XP^n2S|DLRD2YeiGl0Im<*k79f2``6OD@o`1)oGWJ*> z6L66q@$EJGh1%@s;{w&xQi|bax$W9@5-IY3Zf5yc4!Wg1q~hP&L2~cUqWYd+3r|S@ z$-1dRl#RWgbna{;iE$|%B0fn|$$t_D_uW5H$HgYTr9nPfY|dX8VQmk)8E;<|_EwTW550RG>cI8&FYc=o--WG4Nvt*s5JddS(t z-1jFi8i0P|;knNG^6-%(SEQs45Ll?IL8BtZv=a9n-dCYr?pcb4zL_>SJWTW`)N4?a zToysNPfojs9Xq^Fg7hFA9orE^qnx)HH=8~IYH%0@=;c>FVCmoHdGh3tt{{=h;CSREovP-nG=-23eOlIOJWbSN$W`=0{XCHM`c^ 
zw^cVbRwb&(VMJFY?i1PMH3fx?;!i~4tQNJw60lds`ufePmDpO|f-4gTbg}f1E&#f{sXz=P-H&@AVx7n{&r8>#J$gZthDF6s zl>XA4pJ=&%8l&}c60WkCgV%+s(0vMJ2UE1e(pFVgTzh-FFoA_f>YR=aRLFsi%v(PC zD2E;~@`r+LeHw>bPrb6;L3lDNo99Rn(l~R7E$S+Zn(d?4R0-{S<363F6V|mjX_%s| z%#_N@e-Snq zM#%H$3hB5Yb(W^3kGt@}di@sal#!C! z1;wqi(`Mxwu1DM%#^x}EW%sh#Z5a`dN*4#t9D1tZ@Oe`*WnKHM6T!msp~5v;*@gLe zj2LvSzO$#(2L|46s76y8*~t zn1cN2E`Eca%$tXu4LHW1F?rEnkT(_M#`^ss+rmVX3Hs5I5sITcU%q_t)@ezUkHmn` z(d&hvrc+^f4-;FonAhasAj;1rHZig2s)1kc?iUHW%zo+UP?wgLM$=Kfe*MMc$B&Ep zdZEzeSornI8J{N9*Rz^3)k{J;&yBiH$T?d6*Of6nq+QL_MyN%E11!x$c6_ z5t+=500?PH79n8%B1V5>qYcBTdPlcvR)&>Oe6HIgAbt4YjlI zA*^!4Sc#UrdrmZ4;INdPD4KRc88I+4H0BAPZPGR-d&JDd#B}f9Bgc9`7R-$ow>5?F#t5k z4GSZq`PArNXe(V^nuI4o&0FOkWnz`US|=nX=40u^*8T6|dzf}*Wo1R9^(-DjsDJm4 z9j1n%bL4}zF7EE`y1H+Gbql}7C7hq0^z-!vO>><4*5$~HR^YR8A}DlAoSV~c@H^2 zazkVQaBH;7ABc~JkbM;JWr*1SZ6fO2qwTfT)o``QCJ-ffVC501*Wl&GXMg`TSoLqq zGL**ngke2|V_lERd+MR6$ExM`Twnb#xbhAjdZBAq+{2b)__nYWl zQ&e1tS=-=?U~Mph7}8#0q0Y1E-)u`!*;k>e zu8yJI$H%ATBDHV4F}ar_RDI0w#zscm`|Ln{P*Mg&SYebNacrooYwPS(l$STAH9=AF zSe=ih*pHV)@E>`%x^Z{#+W}1JS(Q@&6vnk&W_^iJ3~n%sIqt3JwXp#2;9;^y za;#%L1ygW*YUNIPx%Jpo?92?t+z4J73IUOM!|e7ABSCD-AE=oqdhJb3KWCir_?A0- z^kJpQGp;=%B2pTXvF35$=+U!Or0eVJq)Z^(_wU~iCAEfv)<@v(Z^?h0CK?)WKF!## z=@r?_NK2dIv7jk32VDimq8}=sQ+Ol1^6N1pw*Mf3RW58Af(R6Wv;62n)xbT#5t5`? 
zIXPpUnMAsKFO$1yH2skE?c2+#hOSBlVw=l`zrXm-f3$`F1MHnD-So;j1dua~K_(jD zgMsoOsnonW%U7PKMg3qpDl{r89Xx}H$(3kG#zv;5X}Q%kHE;V5u~7gQ!*7X5y7}(= z5@m1x_|fMPJ>}@O+1V|hK8YuMLKDLzgiz#hnKeT0fx6AL{59n}K5psQgORfaJ_+MB zQ&ZE($jBV8(X1XXgezDbOmPT>5zw6FC=CcKksv(9dhd0(hnDC=HDS}@^E%tMOMJQE zpkHOpL-yRfJd4c}*N9%e{QQfhwKF-Ujo9RT_@9oPTPTe$v68XsiMF;HnovCUDm3&o z3(Gk66ABN2R>q4byZai>uNdjGBSHTwEr1x_cCFWepp)Sar^ce zCKeHq4G8AgaU`sH_iVolA@==;4`Ci!c564av}T=4kWnyn4(&!|B@UURH(^u(-SDNd z5-yT_%atkNSf#L#kguW~oSf?2n>AwV-hpr5G7GL4-MRA~JTzWtFB7Oa*hixU9n9_I zR@Xh@8@;}1-Q3aP+5XYtFgX2juKjBX@tj>J-G>D(T?z{i&*J}x^d0RhExr~B!vkyp zEmg4A<$gl!!)Mj>BB3m_q_h;`IsmF9k=+85#nz*Df(kX}6dT)9QRA(L;4_r5b2>Wm zq{?Rw?b@{qR!k1vO(7zl*jk^Xtl;(z92_(XmjfI~{P7V=YSln*q1|Ju&)jw3VGq6XR#}-Wo`P`( z+F~9E^gVm_2tGI`DEJ+Ar;AswUIlmB1w9cG6(|FnA0J{{DnOr{H=pk^nD8=FfpsSf18!z-gz?-h1gxZ7%um{OD1&3Hew3ry?5$pyu++%^& zM$kQA^Mf#K52|2mE_$nJh@6J=wV0Qv$9g4=9_WTBcnuKGP>JVfgCU57MMQwC*@9&d z4={>y_+7N3nEm9*3V1zG6p_Sv@84hYSmo8}gW;&+F=cBN?s%iXIw~Zj17s0M^rhdG z06-kQC`pMK*&ZYj%++Z7emj5R|1F$E?#tD45O$9VF&X7@YxMAT=Fv&iu!~+lHQ&`*A`BY}Syi0MnQ5 z?!rHo?lwtqDL}=9SGy?k6blR7nJ=gSMk;jQ?VZ;8@SLjcV}E}+v=bnJJf)r+urTJs~l1yiN`_SA9oK{3^ zL`>;&otM+-nk@;n(0@m*FWzp785|r$H-X{Qx+g!~*!ePwZTP%!iq!9AZjsLBD5@!PL;$m|rnrdoua%+K)X$;P88ko%)L$ z&K0vl!NlHw7evBUtUQSPs*d$=g zyroCXYGS1l^E^&PbqgVAVIc&!zH4?WtDaszlvT8amL;uHS3agekOHNy3vZp?p75TZ z>~#Y#0}jxx zwA??z<;QaQR{jRZ3jYj{I&9!xOTA7!;m^tdcm7OFc);oaG$T!meHzGQk5@G$wD4V0|uaTAG@$0pF*3@#<9@6j7+fXwtN_wCnJCqXd=T`SlqZXqN`;yy}^{6F+|j1_j-P z-|PDXHb$-CTDI(QdV1++v@<9N0b-k7)z!XK1RMX5z`&3Db*k&@C)(2>BzNGiZW|fp zqFx<1dF8dR%Zt9UtmNdsz?bNSbeR7@i-FEVF*7wVAWS(V8Q_IMT;uI1$jF$zF9`{? 
zZW0T*(wWgpyI7P6!_V6exp9O2X^ zYQwPPztbsR1VjdJPDQ%YI}nB&OAIuCTwwo(XPjXzBZ3H&p`$)W@g-zX2$vqQj>E_e z${P?BQ|>(~=9ksh{bi*Fm1GE?pFeN;N@jz3M$(|j-Ro1agRBP07#6wX@P7b4VYKe% zkRZJQHyjfD!NZ3D`0%md6c0HW+34K)ZMri4PGuww+3(-qZch!aC?`ro^J<0KlVfAD z<*(3FtSl{Gzj>peheeKoo2j(%5V09R@>uO^Y;5f6`T)KEkI-}4^A9}5JBtP(x_Sq~ zSunI4j=eA*V=|8tDB+^f(Jx*QqG3ELE-b`LJs0gQuwi5}f~{q|2;N*6xHDIm!NKCs zHL`!d?O(LRk)rS2-Ov|Wl5m=U)%zKPNQ4z5($dgSg0VHSh+fz^&?DIMwQMM7`|4!6 zR^D|^*b^cehl&0j6WOAqJdZU;$GsZ^Kar{XDt;6@VeI!<>^9}ofI0pI(Iyk5GmE!9 zNPwUIsLxq;b{1~{l1F{}F+(6Yb9(fcgJ_Q(3pg(DLT3qnC+|QQAzG5)Rv`NB(i#1+ zXhbtNJq3yC&!#iUkE5b`pLxZ(8G2JeXGZYLCbYotxruuag`PWiPIx2RLGagic0%^# zpV)%Qa{!4*_UJ7XMfmXnn*nd+btmBbfVsW0PgM5=m0Xk{(UKt|5@mJ4bKQ9{D>PIC z%Y^hiRz3iGmR6p*6rgV^TiJp z#P;`Yog~(t0|ZcKq)db0r;kK+oBR)=i@#sewlAURfx#I&lG)>gIMIZc)gwY7HW z{IV#bT^b`a1)A&~?d={pPO8uCT;(6tP(LC-JUT^1YHMrzx86+6^;Y9tX$KMlHJzS{ zDkdcbi_i#L3K6(@2f~u+qksEa`}hjzhKPuWyuJz^ASl>a!G7ov_&f3D-KX-bIf;7M z5hibjumtebBUTg1ux^C|u;E|eA3%IqGne5c9CM+fjZr5hI3F`MHAR5|c*R71o|QEb zxih#`#tFX|dP{Ok9}H!pN(?ozauN~`b)_22n-4sk7-TZS~YSj&*RrA;rPLAuPO(bvwu^2lR-e6Ih1!wla@b^unVRN zGBVW=kVjYax0!#AR-W#FiU+Q|rM(?6*cx01j+`kD)C1Fky#b>B%6I$r?SBc45YEo$ z)gRC|HV%IIvO+ZYDhn#)xC&j?cuai+rZ8B}(}a~SFf)TCn5zF1W!9bwbl!gKon{(h z-%s)_Oyt2$xkgeDQHzILX8FLg&Tn*V=PulJoK$U?IZ@jyG`9;`n$>g6e_mtKi_? 
z)dE=bT0()W4<0;dH{Nv~R1#($R2sk}9^!dLFp-CGJTQ7|{qdXn7qvrp{qH%Zlsb2T zBJpRfBKwr9EX>~MtZy#eeNcwJhXyi^g`y@9_ujPO_>g!!m`B7H7+ZH>a5J;BLQTZ7tYJea0JKri23w-&+?cO3_v6qUX`qB<9%5gPuY9lXz zT(!hD;S0kJr2EgEIB_q0IP-A8g>V%ds|R-IS^xv2d6>4Sh-Z&* zuIuJlaoRIFkHzuuvNBPoFPI1wL#}OO7`i6!{m@t_nL3q{GJ-b-!GRNw{cj3az zNg6tU9y&S<;Xueup@8utE~0|^fqs6q*oeQf_H9U*GdFkWWWzzbni~`Iv8M;X*xI^b zcn!*l;g-AJzx;RideQL`uq4?fwo7qR*9Xs;!lv!&rnu+h^Z#RO$Q|c+%_VB zIp|w+v+@ed4HT<@D#JZWASZCXET^OdbI1BaN>L@>b}R?Zih$byVnET)7CQJZnv;VA zX6^clij!PiihxA{pHWs^TwJj7xHe8P;S3~rvYIHGBS#QcIbbG{B{h4b&atsU*Mm&f ztslr2+6G?ty1M!mE-5a2*xBuEZQn)$l za@JF)MBJB^WMyk{v)0x*HJkLbv^8X$w98YzfBf5FI71N|`|J1b&-29K`W^`3CNq_NFg#2#zLLFNGExeX8H8I8{tI4W BV^#nF diff --git a/dev/search_index.js b/dev/search_index.js index 993dd044..da4cbcb7 100644 --- a/dev/search_index.js +++ b/dev/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"@info \"Scripts docs\"","category":"page"},{"location":"Scripts/#MIToS'-Scripts","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"The MIToS_Scripts.jl package offers a set of easy-to-use scripts for command-line execution without requiring Julia coding. It includes several scripts designed for various bioinformatics tasks, such as measuring estimating residue conservation and inter-residue coevolution, calculating distances between residues in a protein structure, and more.","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Pages = [\"Scripts.md\"]\nDepth = 4","category":"page"},{"location":"Scripts/#Installation","page":"MIToS' Scripts","title":"Installation","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"To install MIToS_Scripts.jl, you only need Julia 1.9 or later installed on your system. 
Executing julia in the terminal to open the Julia REPL, and finally, run the following command:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using Pkg\nPkg.add(url = \"https://github.com/MIToSOrg/MIToS_Scripts.jl\")","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Then, you can get the location of the installed scripts by running the following command:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using MIToS_Scripts\nscripts_folder = joinpath(pkgdir(MIToS_Scripts), \"scripts\")","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"You can run them from that location. Alternatively, you can add the location to your PATH environment variable, or copy the scripts to a folder already in your PATH to run them from anywhere.","category":"page"},{"location":"Scripts/#Usage","page":"MIToS' Scripts","title":"Usage","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"You can execute each provided script from your command line. 
For example, to run the Buslje09.jl script—if you are located in the folder where it is the scripts—use:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"julia Buslje09.jl input_msa_file","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Refer to the documentation of each script for specific usage instructions; you can access it by running the script with the --help or -h flag:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"julia Buslje09.jl -h","category":"page"},{"location":"Scripts/#Scripts","page":"MIToS' Scripts","title":"Scripts","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using Pkg\nproject_folder = \"MIToS_Scripts_Project\"\nisdir(project_folder) || mkdir(project_folder)\nPkg.activate(project_folder)\nPkg.add(url=\"https://github.com/MIToSOrg/MIToS_Scripts.jl\")\nusing MIToS_Scripts\nscripts_folder = joinpath(pkgdir(MIToS_Scripts), \"scripts\")","category":"page"},{"location":"Scripts/#Buslje09.jl","page":"MIToS' Scripts","title":"Buslje09.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Buslje09.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#BLMI.jl","page":"MIToS' Scripts","title":"BLMI.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"BLMI.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide\n","category":"page"},{"location":"Scripts/#Conservation.jl","page":"MIToS' Scripts","title":"Conservation.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' 
Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Conservation.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#DownloadPDB.jl","page":"MIToS' Scripts","title":"DownloadPDB.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"DownloadPDB.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#Distances.jl","page":"MIToS' Scripts","title":"Distances.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Distances.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#MSADescription.jl","page":"MIToS' Scripts","title":"MSADescription.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"MSADescription.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#PercentIdentity.jl","page":"MIToS' Scripts","title":"PercentIdentity.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"PercentIdentity.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#AlignedColumns.jl","page":"MIToS' Scripts","title":"AlignedColumns.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, 
\"AlignedColumns.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#SplitStockholm.jl","page":"MIToS' Scripts","title":"SplitStockholm.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"SplitStockholm.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"@info \"MSA API docs\"","category":"page"},{"location":"MSA_API/#MSA","page":"MSA","title":"MSA","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"MIToS.MSA","category":"page"},{"location":"MSA_API/#MIToS.MSA","page":"MSA","title":"MIToS.MSA","text":"The MSA module of MIToS has utilities for working with Multiple Sequence Alignments of protein Sequences (MSA).\n\nFeatures\n\nRead and write MSAs in Stockholm, FASTA, A3M, PIR, or Raw format\nHandle MSA annotations\nEdit the MSA, e.g. delete columns or sequences, change sequence order, shuffling...\nKeep track of positions and annotations after modifications on the MSA\nDescribe a MSA, e.g. 
mean percent identity, sequence coverage, gap percentage...\n\nusing MIToS.MSA\n\n\n\n\n\n","category":"module"},{"location":"MSA_API/#Contents","page":"MSA","title":"Contents","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Pages = [\"MSA_API.md\"]\nDepth = 2","category":"page"},{"location":"MSA_API/#Types","page":"MSA","title":"Types","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"MSA_API/#MIToS.MSA.AbstractAlignedObject","page":"MSA","title":"MIToS.MSA.AbstractAlignedObject","text":"MIToS MSA and aligned sequences (aligned objects) are subtypes of AbstractMatrix{Residue}, because MSAs and sequences are stored as Matrix of Residues.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractAlignedSequence","page":"MSA","title":"MIToS.MSA.AbstractAlignedSequence","text":"A MIToS aligned sequence is an AbstractMatrix{Residue} with only 1 row/sequence.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractMultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.AbstractMultipleSequenceAlignment","text":"MSAs are stored as Matrix{Residue}. It's possible to use a NamedResidueMatrix{Array{Residue,2}} as the most simple MSA with sequence identifiers and column names.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractSequence","page":"MSA","title":"MIToS.MSA.AbstractSequence","text":"A MIToS (unaligned) sequence is an AbstractMatrix{Residue} with only 1 row/sequence.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AlignedSequence","page":"MSA","title":"MIToS.MSA.AlignedSequence","text":"An AlignedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence. 
The NamedArray stores the sequence name and original column numbers as Strings.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedAlignedSequence","page":"MSA","title":"MIToS.MSA.AnnotatedAlignedSequence","text":"This type represent an aligned sequence, similar to AlignedSequence, but It also stores its Annotations.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedMultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.AnnotatedMultipleSequenceAlignment","text":"This type represent an MSA, similar to MultipleSequenceAlignment, but It also stores Annotations. This annotations are used to store residue coordinates (i.e. mapping to UniProt residue numbers).\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedSequence","page":"MSA","title":"MIToS.MSA.AnnotatedSequence","text":"An AnnotationSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence and its Annotations. The NamedArray stores the sequence name and original position numbers as Strings.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Annotations","page":"MSA","title":"MIToS.MSA.Annotations","text":"The Annotations type is basically a container for Dicts with the annotations of a multiple sequence alignment. Annotations was designed for storage of annotations of the Stockholm format.\n\nMIToS also uses MSA annotations to keep track of:\n\nModifications of the MSA (MIToS_...) as deletion of sequences or columns.\nPositions numbers in the original MSA file (column mapping: ColMap)\nPosition of the residues in the sequence (sequence mapping: SeqMap)\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Clusters","page":"MSA","title":"MIToS.MSA.Clusters","text":"Data structure to represent sequence clusters. 
The sequence data itself is not included.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.GappedAlphabet","page":"MSA","title":"MIToS.MSA.GappedAlphabet","text":"This type defines the usual alphabet of the 20 natural residues and a gap character.\n\njulia> using MIToS.MSA\n\njulia> GappedAlphabet()\nGappedAlphabet of length 21. Residues : res\"ARNDCQEGHILKMFPSTWYV-\"\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.MultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.MultipleSequenceAlignment","text":"This MSA type include a NamedArray wrapping a Matrix of Residues. The use of NamedArray allows to store sequence names and original column numbers as Strings, and fast indexing using them.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.NoClustering","page":"MSA","title":"MIToS.MSA.NoClustering","text":"Use NoClustering() to avoid the use of clustering where a Clusters type is needed.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.ReducedAlphabet","page":"MSA","title":"MIToS.MSA.ReducedAlphabet","text":"ReducedAlphabet allows the construction of reduced residue alphabets, where residues inside parenthesis belong to the same group.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> ab[Residue('K')]\n2\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Residue","page":"MSA","title":"MIToS.MSA.Residue","text":"Most of the MIToS design is created around the Residue bitstype. It has representations for the 20 natural amino acids, a value representing insertions and deletions (GAP, '-') and one representing unknown, ambiguous and non standard residues (XAA, 'X'). Each Residue is encoded as an integer number, with the same bit representation and size than a Int. 
This allows fast indexing operation of probability or frequency matrices.\n\nResidue creation and conversion\n\nCreation and conversion of Residues should be treated carefully. Residue is encoded as a 32 or 64 bits type similar to Int, to get fast indexing using Int(x::Residue). Int simply calls reinterpret without checking if the residue is valid. Valid residues have integer values in the closed interval [1,22]. convert from Int and Char always returns valid residues, however it's possible to find invalid residues (they are shown using the character '�') after the creation of uninitialized Residue arrays (i.e. using Array). You can use zeros, ones or rand to get initialized Residue arrays with valid residues. Conversions to and from Chars changes the bit representation and allows the use of the usual character representation of residues and amino acids. This conversions are used in IO operations and always return valid residues. In conversions from Char, lowercase letters, '*', '-' and '.' are translated to GAP, letters representing the 20 natural amino (ARNDCQEGHILKMFPSTWYV) acids are translated to their corresponding Residue and any other character is translated to XAA. 
Since lowercase letters and dots are translated to gaps, Pfam MSA insert columns are converted to columns full of gaps.\n\njulia> using MIToS.MSA\n\njulia> alanine = Residue('A')\nA\n\njulia> Char(alanine)\n'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)\n\njulia> for residue in res\"ARNDCQEGHILKMFPSTWYV-X\"\n println(residue, \" \", Int(residue))\n end\nA 1\nR 2\nN 3\nD 4\nC 5\nQ 6\nE 7\nG 8\nH 9\nI 10\nL 11\nK 12\nM 13\nF 14\nP 15\nS 16\nT 17\nW 18\nY 19\nV 20\n- 21\nX 22\n\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.ResidueAlphabet","page":"MSA","title":"MIToS.MSA.ResidueAlphabet","text":"Abstract type to define residue alphabet types.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.UngappedAlphabet","page":"MSA","title":"MIToS.MSA.UngappedAlphabet","text":"This type defines the usual alphabet of the 20 natural residues, without the gap character.\n\njulia> using MIToS.MSA\n\njulia> UngappedAlphabet()\nUngappedAlphabet of length 20. Residues : res\"ARNDCQEGHILKMFPSTWYV\"\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#Constants","page":"MSA","title":"Constants","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"MSA_API/#MIToS.MSA.GAP","page":"MSA","title":"MIToS.MSA.GAP","text":"GAP is the Residue representation on MIToS for gaps ('-', insertions and deletions). Lowercase residue characters, dots and '*' are encoded as GAP in conversion from Strings and Chars. This Residue constant is encoded as Residue(21).\n\n\n\n\n\n","category":"constant"},{"location":"MSA_API/#MIToS.MSA.WeightTypes","page":"MSA","title":"MIToS.MSA.WeightTypes","text":"The WeightTypes type is the same as Union{Weights,NoClustering,Clusters}. This type is used to represent weights. 
Most of the functions taking the weights kerword argument in the Information module accept instances of WeightTypes.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.XAA","page":"MSA","title":"MIToS.MSA.XAA","text":"XAA is the Residue representation for unknown, ambiguous and non standard residues. This Residue constant is encoded as Residue(22).\n\n\n\n\n\n","category":"constant"},{"location":"MSA_API/#Macros","page":"MSA","title":"Macros","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"MSA_API/#MIToS.MSA.@res_str-Tuple{Any}","page":"MSA","title":"MIToS.MSA.@res_str","text":"The MIToS macro @res_str takes a string and returns a Vector of Residues (sequence).\n\njulia> using MIToS.MSA\n\njulia> res\"MIToS\"\n5-element Vector{Residue}:\n M\n I\n T\n -\n S\n\n\n\n\n\n","category":"macro"},{"location":"MSA_API/#Methods-and-functions","page":"MSA","title":"Methods and functions","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"MSA_API/#Base.isvalid-Tuple{Type{MIToS.MSA.Residue}, MIToS.MSA.Residue}","page":"MSA","title":"Base.isvalid","text":"isvalid(res::Residue)\n\nIt returns true if the encoded integer is in the closed interval [1,22].\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#Base.names-Tuple{MIToS.MSA.ReducedAlphabet}","page":"MSA","title":"Base.names","text":"It returns the name of each group. 
The name is a string with the one letter code of each residue that belong to the group.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> names(ab)\n8-element Vector{String}:\n \"AILMV\"\n \"RHK\"\n \"NQST\"\n \"DE\"\n \"FWY\"\n \"C\"\n \"G\"\n \"P\"\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#Base.rand-Tuple{Random.AbstractRNG, Random.SamplerType{MIToS.MSA.Residue}}","page":"MSA","title":"Base.rand","text":"It chooses from the 20 natural residues (it doesn't generate gaps).\n\njulia> using MIToS.MSA\n\njulia> using Random\n\njulia> Random.seed!(1); # Reseed the random number generator.\n\njulia> rand(Residue)\nR\n\njulia> rand(Residue, 4, 4)\n4×4 Matrix{Residue}:\n E D D A\n F S K K\n M S I M\n Y F E D\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.adjustreference","page":"MSA","title":"MIToS.MSA.adjustreference","text":"Creates a new matrix of residues. This function deletes positions/columns of the MSA with gaps in the reference (first) sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.adjustreference!","page":"MSA","title":"MIToS.MSA.adjustreference!","text":"It removes positions/columns of the MSA with gaps in the reference (first) sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.annotate_modification!-Tuple{MIToS.MSA.Annotations, String}","page":"MSA","title":"MIToS.MSA.annotate_modification!","text":"Annotates on file annotations the modifications realized by MIToS on the MSA. It always returns true, so It can be used in a boolean context.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.annotations-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.annotations","text":"The annotations function returns the Annotations of an annotated MSA or aligned sequence. 
If the object is not annotated, it returns an empty Annotations object.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.column_index-Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}} where AT, AbstractString}","page":"MSA","title":"MIToS.MSA.column_index","text":"column_index(msa, col_name)\n\nReturn the index (integer position) of the column with name col_name in the MSA msa. A KeyError is thrown if the column name does not exist. If col_name is an integer, the same integer is returned without checking if it is a valid index.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columngapfraction-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.columngapfraction","text":"Fraction of gaps per column/position on the MSA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnname_iterator-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.columnname_iterator","text":"columnname_iterator(msa)\n\nIt returns an iterator that returns the column names of the msa. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnnames-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.columnnames","text":"columnnames(msa)\n\nIt returns a Vector{String} with the sequence names/identifiers. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. 
Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnpairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"MSA","title":"MIToS.MSA.columnpairsmatrix","text":"Initialize an empty PairwiseListMatrix for a pairwise measure in sequence pairs. It uses the sequence names if they are available, otherwise it uses the actual sequence numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.coverage-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.coverage","text":"Coverage of the sequences with respect of the number of positions on the MSA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.delete_annotated_modifications!-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.delete_annotated_modifications!","text":"Deletes all the MIToS annotated modifications\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.deletefullgapcolumns!","page":"MSA","title":"MIToS.MSA.deletefullgapcolumns!","text":"Deletes columns with 100% gaps, this columns are generated by inserts.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtercolumns!","page":"MSA","title":"MIToS.MSA.filtercolumns!","text":"filtercolumns!(msa, mask[, annotate::Bool=true])\n\nIt allows to filter MSA or aligned sequence columns/positions using a AbstractVector{Bool} mask. 
Annotations are updated if annotate is true (default).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtercolumns!-Tuple{MIToS.MSA.Annotations, Any}","page":"MSA","title":"MIToS.MSA.filtercolumns!","text":"filtercolumns!(data::Annotations, mask)\n\nIt is useful for deleting column annotations (creating a subset in place).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtercolumns-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.filtercolumns","text":"It's similar to filtercolumns! but for an AbstractMatrix{Residue}\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtersequences!","page":"MSA","title":"MIToS.MSA.filtersequences!","text":"filtersequences!(msa, mask[, annotate::Bool=true])\n\nIt allows to filter msa sequences using a AbstractVector{Bool} mask (It removes sequences with false values). AnnotatedMultipleSequenceAlignment annotations are updated if annotate is true (default).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtersequences!-Tuple{MIToS.MSA.Annotations, Vector{String}, AbstractVector{Bool}}","page":"MSA","title":"MIToS.MSA.filtersequences!","text":"filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})\n\nIt is useful for deleting sequence annotations. ids should be a list of the sequence names and mask should be a logical vector.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtersequences-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.filtersequences","text":"It's similar to filtersequences! but for an AbstractMatrix{Residue}\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gapfraction-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.gapfraction","text":"It calculates the fraction of gaps on the Array (alignment, sequence, column, etc.). 
This function can take an extra dimension argument for calculation of the gap fraction over the given dimension.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gapstrip!","page":"MSA","title":"MIToS.MSA.gapstrip!","text":"This functions deletes/filters sequences and columns/positions on the MSA on the following order:\n\nRemoves all the columns/position on the MSA with gaps on the reference (first) sequence.\nRemoves all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps).\nRemoves all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.gapstrip-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.gapstrip","text":"Creates a new matrix of Residues (MSA) with deleted sequences and columns/positions. The MSA is edited in the following way:\n\nRemoves all the columns/position on the MSA with gaps on the reference (first) sequence\nRemoves all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps)\nRemoves all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps)\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getannotcolumn","page":"MSA","title":"MIToS.MSA.getannotcolumn","text":"getannotcolumn(ann[, feature[,default]])\n\nIt returns per column annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotfile","page":"MSA","title":"MIToS.MSA.getannotfile","text":"getannotfile(ann[, feature[,default]])\n\nIt returns per file annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotresidue","page":"MSA","title":"MIToS.MSA.getannotresidue","text":"getannotresidue(ann[, seqname, 
feature[,default]])\n\nIt returns per residue annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotsequence","page":"MSA","title":"MIToS.MSA.getannotsequence","text":"getannotsequence(ann[, seqname, feature[,default]])\n\nIt returns per sequence annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getcolumnmapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.getcolumnmapping","text":"It returns a Vector{Int} with the original column number of each column on the actual MSA. The mapping is annotated in the ColMap file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.\n\nNOTE: When the MSA results from vertically concatenating MSAs using vcat, the column map annotations from the constituent MSAs (such as 1_ColMap, 2_ColMap, etc.) are not returned. Instead, the column numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gethcatmapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.gethcatmapping","text":"It returns a vector of numbers from 1 to N for each column that indicates the source MSA. The mapping is annotated in the \"HCat\" file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.\n\nNOTE: When the MSA results from vertically concatenating MSAs using vcat, the \"HCat\" annotations from the constituent MSAs are renamed as \"1_HCat\", \"2_HCat\", etc. In that case, the MSA numbers referenced in the column names are provided. 
To access the original annotations, utilize the getannotfile function.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getnamedict-Tuple{MIToS.MSA.ReducedAlphabet}","page":"MSA","title":"MIToS.MSA.getnamedict","text":"It takes a ResidueAlphabet and returns a dictionary from group name to group position.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> getnamedict(ab)\nOrderedCollections.OrderedDict{String, Int64} with 8 entries:\n \"AILMV\" => 1\n \"RHK\" => 2\n \"NQST\" => 3\n \"DE\" => 4\n \"FWY\" => 5\n \"C\" => 6\n \"G\" => 7\n \"P\" => 8\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getresidues-Tuple{Matrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.getresidues","text":"getresidues allows you to access the residues stored inside an MSA or aligned sequence as a Matrix{Residue} without annotations nor column/row names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getresiduesequences-Tuple{Matrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.getresiduesequences","text":"getresiduesequences returns a Vector{Vector{Residue}} with all the MSA sequences without annotations nor column/sequence names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getsequence","page":"MSA","title":"MIToS.MSA.getsequence","text":"getsequence takes an MSA and a sequence number or identifier and returns an aligned sequence object. If the MSA is an AnnotatedMultipleSequenceAlignment, it returns an AnnotatedAlignedSequence with the sequence annotations. From a MultipleSequenceAlignment, It returns an AlignedSequence object. 
If an Annotations object and a sequence identifier are used, this function returns the annotations related to the sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getsequencemapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, String}","page":"MSA","title":"MIToS.MSA.getsequencemapping","text":"It returns the sequence coordinates as a Vector{Int} for an MSA sequence. That vector has one element for each MSA column. If the number if 0 in the mapping, there is a gap in that column for that sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getweight-Tuple{MIToS.MSA.NoClustering, Int64}","page":"MSA","title":"MIToS.MSA.getweight","text":"getweight(c[, i::Int])\n\nThis function returns the weight of the sequence number i. getweight should be defined for any type used for frequencies!/frequencies in order to use his weigths. If i isn't used, this function returns a vector with the weight of each sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.hobohmI-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.hobohmI","text":"Sequence clustering using the Hobohm I method from Hobohm et al.\n\nReferences\n\nHobohm, Uwe, et al. 
\"Selection of representative protein data sets.\" Protein Science 1.3 (1992): 409-417.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.join_msas-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, MIToS.MSA.AnnotatedMultipleSequenceAlignment, Any}","page":"MSA","title":"MIToS.MSA.join_msas","text":"join_msas(msa_a::AnnotatedMultipleSequenceAlignment, \n msa_b::AnnotatedMultipleSequenceAlignment, \n pairing; \n kind::Symbol=:outer, \n axis::Int=1)::AnnotatedMultipleSequenceAlignment\n\njoin_msas(msa_a::AnnotatedMultipleSequenceAlignment, \n msa_b::AnnotatedMultipleSequenceAlignment, \n positions_a, \n positions_b; \n kind::Symbol=:outer, \n axis::Int=1)::AnnotatedMultipleSequenceAlignment\n\nJoin two Multiple Sequence Alignments (MSAs), msa_a and msa_b, based on specified matching positions or names. The function supports two formats: one takes a pairing argument as a list of correspondences, and the other takes positions_a and positions_b as separate lists indicating matching positions or names in each MSA. This function allows for various types of join operations (:inner, :outer, :left, :right) and can merge MSAs by sequences (axis 1) or by columns (axis 2).\n\nParameters:\n\nmsa_a::AnnotatedMultipleSequenceAlignment: The first MSA.\nmsa_b::AnnotatedMultipleSequenceAlignment: The second MSA.\npairing: An iterable where each element is a pair of sequence or column positions (Ints) or names (Strings) to match between msa_a and msa_b. For example, it can be a list of two-element tuples or pairs, or and OrderedDict.\npositions_a, positions_b: Separate lists of positions or names in msa_a and msa_b, respectively.\nkind::Symbol: Type of join operation. 
Default is :outer.\naxis::Int: The axis along which to join (1 to match sequences, 2 to match columns).\n\nReturns:\n\nAnnotatedMultipleSequenceAlignment: A new MSA resulting from the join operation.\n\nBehavior and Sequence Ordering:\n\nThe order of sequences or columns in the resulting MSA depends on the kind of join operation and the order of elements in the pairing or positions_a and positions_b lists.\n\nFor :inner joins, the function returns an MSA containing only those sequences/columns that are paired in both msa_a and msa_b. The order of elements in the output MSA follows the order in the pairing or position lists.\nFor :outer joins, the output MSA includes all sequences/columns from both msa_a and msa_b. Unpaired sequences/columns are filled with gaps as needed. The sequences/columns from msa_a are placed first. If the pairing or position lists are sorted, the output MSA columns and sequences will keep the same order as in the inputs. That's nice for situations such as profile alignments where the order of columns is important. If the pairing or position lists are not sorted, then the order of sequences/columns in the output MSA is not guaranteed to be the same as in the inputs. In particular, the matched sequences or columns will be placed first, followed by the unmatched ones.\nFor :left joins, all sequences/columns from msa_a are included in the output MSA keeping the same order as in msa_a. Sequences/columns from msa_b are added where matches are found, with gaps filling the unmatched positions.\nFor :right joins, the output MSA behaves like :left joins but with roles of msa_a and msa_b reversed.\n\nWarning: When using Dict for pairing, the order of elements might not be preserved as expected. Dict in Julia does not maintain the order of its elements, which might lead to unpredictable order of sequences/columns in the output MSA. 
To preserve order, it is recommended to use an OrderedDict or a list of Pairs objects.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.meanpercentidentity","page":"MSA","title":"MIToS.MSA.meanpercentidentity","text":"Returns the mean of the percent identity between the sequences of a MSA. If the MSA has 300 sequences or less, the mean is exact. If the MSA has more sequences and the exact keyword is false (defualt), 44850 random pairs of sequences are used for the estimation. The number of samples can be changed using the second argument. Use exact=true to perform all the pairwise comparison (the calculation could be slow).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.namedmatrix-Tuple{MIToS.MSA.AbstractResidueMatrix}","page":"MSA","title":"MIToS.MSA.namedmatrix","text":"The namedmatrix function returns the NamedResidueMatrix{Array{Residue,2}} stored in an MSA or aligned sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.ncolumns-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.ncolumns","text":"ncolumns returns the number of MSA columns or positions.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.ncolumns-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.ncolumns","text":"ncolumns(ann::Annotations) returns the number of columns/residues with annotations. 
This function returns -1 if there is not annotations per column/residue.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.nsequences-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.nsequences","text":"nsequences returns the number of sequences on the MSA.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Tuple{Any, Any, Any}","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(seq1, seq2, threshold)\n\nComputes quickly if two aligned sequences have a identity value greater than a given threshold value. Returns a boolean value. Positions with gaps in both sequences doesn't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Tuple{Any, Any}","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(seq1, seq2)\n\nCalculates the fraction of identities between two aligned sequences. The identity value is calculated as the number of identical characters in the i-th position of both sequences divided by the length of both sequences. Positions with gaps in both sequences doesn't count to the length of the sequences. Positions with a XAA in at least one sequence aren't counted.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Union{Tuple{AbstractMatrix{MIToS.MSA.Residue}}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}}} where T","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(msa[, out::Type=Float64])\n\nCalculates the identity between all the sequences on a MSA. You can indicate the output element type with the last optional parameter (Float64 by default). 
For a MSA with a lot of sequences, you can use Float32 or Flot16 in order to avoid the OutOfMemoryError().\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentsimilarity","page":"MSA","title":"MIToS.MSA.percentsimilarity","text":"Calculates the similarity percent between two aligned sequences. The 100% is the length of the aligned sequences minus the number of columns with gaps in both sequences and the number of columns with at least one residue outside the alphabet. So, columns with residues outside the alphabet (other than the specially treated GAP) aren't counted to the protein length. Two residues are considered similar if they below to the same group in a ReducedAlphabet. The alphabet (third positional argument) by default is:\n\nReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\nThe first group is composed of the non polar residues (AILMV), the second group is composed of polar residues, the third group are positive residues, the fourth group are negative residues, the fifth group is composed by the aromatic residues (FWY). C, G and P are considered unique residues.\n\nOther residue groups/alphabets:\n\nSMS (Sequence Manipulation Suite) Ident and Sim (Stothard Paul. 2000):\n\nReducedAlphabet(\"(GAVLI)(FYW)(ST)(KRH)(DENQ)P(CM)\")\n\nStothard P (2000) The Sequence Manipulation Suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28:1102-1104.\n\nBio3D 2.2 seqidentity (Grant, Barry J., et al. 2006):\n\nReducedAlphabet(\"(GA)(MVLI)(FYW)(ST)(KRH)(DE)(NQ)PC\")\n\nReferences\n\nStothard, Paul. \"The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences.\" Biotechniques 28.6 (2000): 1102-1104.\nGrant, Barry J., et al. 
\"Bio3d: an R package for the comparative analysis of protein structures.\" Bioinformatics 22.21 (2006): 2695-2696.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.percentsimilarity-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.percentsimilarity","text":"Calculates the similarity percent between all the sequences on a MSA. You can indicate the output element type with the out keyword argument (Float64 by default). For an MSA with a lot of sequences, you can use out=Float32 or out=Flot16 in order to avoid the OutOfMemoryError().\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.printmodifications-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.printmodifications","text":"Prints MIToS annotated modifications\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.rename_sequences!-Union{Tuple{T}, Tuple{AT}, Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}, Vector{T}}} where {AT, T<:AbstractString}","page":"MSA","title":"MIToS.MSA.rename_sequences!","text":"rename_sequences!(msa, newnames::Vector{T}) where {T<:AbstractString}\nrename_sequences!(msa, old2new::AbstractDict)\nrename_sequences!(msa, old2new::Pair...)\n\nRename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. 
The function modifies the msa in place and returns it.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.rename_sequences-Tuple{Any, Any}","page":"MSA","title":"MIToS.MSA.rename_sequences","text":"rename_sequences(msa, newnames::Vector{T}) where {T<:AbstractString}\nrename_sequences(msa, old2new::AbstractDict)\nrename_sequences(msa, old2new::Pair...)\n\nRename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function returns a new MSA with the sequences renamed without modifying the original MSA.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.residue2three-Tuple{MIToS.MSA.Residue}","page":"MSA","title":"MIToS.MSA.residue2three","text":"This function returns the three letter name of the Residue.\n\njulia> using MIToS.MSA\n\njulia> residue2three(Residue('G'))\n\"GLY\"\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.residuefraction-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.residuefraction","text":"It calculates the fraction of residues (no gaps) on the Array (alignment, sequence, column, etc.). 
This function can take an extra dimension argument for calculation of the residue fraction over the given dimension\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequence_id-Tuple{Union{MIToS.MSA.AbstractAlignedSequence, MIToS.MSA.AbstractSequence}}","page":"MSA","title":"MIToS.MSA.sequence_id","text":"sequence_id(seq::Union{AbstractSequence,AbstractAlignedSequence})\n\nIt returns the sequence identifier of a sequence object.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequence_index-Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}} where AT, AbstractString}","page":"MSA","title":"MIToS.MSA.sequence_index","text":"sequence_index(msa, seq_name)\n\nReturn the index (integer position) of the sequence with name seq_name in the MSA msa. A KeyError is thrown if the sequence name does not exist. If seq_name is an integer, the same integer is returned without checking if it is a valid index.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencename_iterator-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.sequencename_iterator","text":"sequencename_iterator(msa)\n\nIt returns an iterator that returns the sequence names/identifiers of the msa.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencenames-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT<:AbstractArray","page":"MSA","title":"MIToS.MSA.sequencenames","text":"sequencenames(msa)\n\nIt returns a Vector{String} with the sequence 
names/identifiers.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencepairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"MSA","title":"MIToS.MSA.sequencepairsmatrix","text":"Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column mapping (column number in the input MSA file) if it’s available, otherwise it uses the actual column numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.setannotcolumn!","page":"MSA","title":"MIToS.MSA.setannotcolumn!","text":"setannotcolumn!(ann, feature, annotation)\n\nIt stores per column annotation (1 char per column) for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotfile!","page":"MSA","title":"MIToS.MSA.setannotfile!","text":"setannotfile!(ann, feature, annotation)\n\nIt stores per file annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotresidue!","page":"MSA","title":"MIToS.MSA.setannotresidue!","text":"setannotresidue!(ann, seqname, feature, annotation)\n\nIt stores per residue annotation (1 char per residue) for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotsequence!","page":"MSA","title":"MIToS.MSA.setannotsequence!","text":"setannotsequence!(ann, seqname, feature, annotation)\n\nIt stores per sequence annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setreference!","page":"MSA","title":"MIToS.MSA.setreference!","text":"It puts the sequence i (name or position) as reference (first sequence) of the MSA. 
This function swaps the sequences 1 and i.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.shuffle_msa!-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.shuffle_msa!","text":"shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)\n\nIn-place version of shuffle_msa. It randomly permute residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows to shuffle only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.shuffle_msa-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.shuffle_msa","text":"shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)\n\nIt randomly permute residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows to shuffle only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default). 
To shuffle in-place, see shuffle_msa!.\n\njulia> using MIToS.MSA\n\njulia> using Random\n\njulia> msa = hcat(res\"RRE\",res\"DDK\", res\"G--\")\n3×3 Matrix{Residue}:\n R D G\n R D -\n E K -\n\njulia> Random.seed!(42);\n\njulia> shuffle_msa(msa, dims=1, fixedgaps=true)\n3×3 Matrix{Residue}:\n G D R\n R D -\n E K -\n\njulia> Random.seed!(42);\n\njulia> shuffle_msa(msa, dims=1, fixedgaps=false)\n3×3 Matrix{Residue}:\n G D R\n R - D\n E K -\n\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.stringsequence-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.stringsequence","text":"stringsequence(seq)\nstringsequence(msa, i::Int)\nstringsequence(msa, id::String)\n\nIt returns the selected sequence as a String.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.swapsequences!-Tuple{Matrix{MIToS.MSA.Residue}, Int64, Int64}","page":"MSA","title":"MIToS.MSA.swapsequences!","text":"It swaps the sequences on the positions i and j of an MSA. Also it's possible to swap sequences using their sequence names/identifiers when the MSA object as names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.three2residue-Tuple{String}","page":"MSA","title":"MIToS.MSA.three2residue","text":"It takes a three letter residue name and returns the corresponding Residue. If the name isn't in the MIToS dictionary, a XAA is returned.\n\njulia> using MIToS.MSA\n\njulia> three2residue(\"ALA\")\nA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.Utils.parse_file","page":"MSA","title":"MIToS.Utils.parse_file","text":"parse_file(io, format[, output; generatemapping, useidcoordinates, deletefullgaps])\n\nThe keyword argument generatemapping (false by default) indicates if the mapping of the sequences (\"SeqMap\") and columns (\"ColMap\") and the number of columns in the original MSA (\"NCol\") should be generated and saved in the annotations. 
If useidcoordinates is true (default: false) the sequence IDs of the form \"ID/start-end\" are parsed and used for determining the start and end positions when the mappings are generated. deletefullgaps (true by default) indicates if columns 100% gaps (generally inserts from a HMM) must be removed from the MSA.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#Random.shuffle","page":"MSA","title":"Random.shuffle","text":"It's like shuffle but in-place. When a Matrix{Residue} or a AbstractAlignedObject (sequence or MSA) is used, you can indicate if the gaps should remain their positions using the last boolean argument.\n\nDEPRECATED: This method is deprecated. Use shuffle_msa instead.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#Random.shuffle!","page":"MSA","title":"Random.shuffle!","text":"It's like Random.shuffle. When a Matrix{Residue} is used, you can indicate if the gaps should remain their positions using the last boolean argument. The previous argument should be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling residues in a column.\n\nDEPRECATED: This method is deprecated. Use shuffle_msa! instead.\n\n\n\n\n\n","category":"function"},{"location":"Information/","page":"Information","title":"Information","text":"CurrentModule = MIToS.Information","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information docs\"","category":"page"},{"location":"Information/#Module-Information","page":"Information","title":"Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). 
This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from these counts. Also, includes methods for applying corrections to those tables, e.g. pseudocounts and pseudo frequencies. Finally, Information allows to use these probabilities and counts to estimate information measures and other frequency based values.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information # to load the Information module","category":"page"},{"location":"Information/#Features","page":"Information","title":"Features","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Estimate multi dimensional frequencies and probability tables from sequences, MSAs, etc...\nCorrection for small number of observations\nCorrection for data redundancy on a MSA\nEstimate information measures\nCalculate corrected mutual information between residues","category":"page"},{"location":"Information/#Contents","page":"Information","title":"Contents","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Pages = [\"Information.md\"]\nDepth = 4","category":"page"},{"location":"Information/#Counting-residues","page":"Information","title":"Counting residues","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"MIToS Information module defines a multidimensional ContingencyTable type and two types wrapping it, Frequencies and Probabilities, to store occurrences or probabilities. The ContingencyTable type stores the contingency matrix, its marginal values and total. These types are parametric, taking three ordered parameters:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"T : The type used for storing the counts or probabilities, e.g. Float64. 
It's possible to use BigFloat if more precision it's needed.\nN : It's the dimension of the table and should be an Int.\nA : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"note: Note\nContingencyTable can be used for storing probabilities or counts. The wrapper types Probabilities and Frequencies are mainly intended to dispatch in methods that need to know if the matrix has probabilities or counts, e.g. shannon_entropy. In general, the use of ContingencyTable is recommended over the use of Probabilities and Frequencies.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In this way, a matrix for storing pairwise probabilities of residues (without gaps) can be initialized using:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nPij = ContingencyTable(Float64, Val{2}, UngappedAlphabet())","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"[High level interface] It is possible to use the functions frequencies and probabilities to easily calculate the frequencies of sequences or columns of a MSA, where the number of sequences/columns determine the dimension of the resulting table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA # to use res\"...\" to create Vector{Residue}\n\ncolumn_i = res\"AARANHDDRDC-\"\ncolumn_j = res\"-ARRNHADRAVY\"\n# Nij[R,R] = 1 1 = 2\n\nNij = frequencies(column_i, column_j)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can use sum to get the stored total:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"sum(Nij) # There are 12 
Residues, but 2 are gaps","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Contingency tables can be indexed using Int or Residues:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Nij[2, 2] # Use Int to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Nij[Residue('R'), Residue('R')] # Use Residue to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"warning: Warning\nThe number makes reference to the specific index in the table e.g [2,2] references the second row and the second column. The use of the number used to encode the residue to index the table is dangerous. The equivalent index number of a residue depends on the used alphabet and Int(Residue('X')) will be always out of bounds.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Indexing with Residues works as expected. It uses the alphabet of the contingency table to find the index of the Residue.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA\n\nalphabet = ReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\ncolumn_i = res\"AARANHDDRDC-\"\ncolumn_j = res\"-ARRNHADRAVY\"\n# Fij[R,R] = 1 1 1 = 3 # RHK\n\nFij = frequencies(column_i, column_j, alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Fij[Residue('R'), Residue('R')] # Use Residue to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"The function getcontingencytable allows to access the wrapped ContingencyTable in a Frequencies object. You can use it, in combination with normalize to get a contingency table of probabilities. 
The result can be wrapped inside a Probabilities object:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Probabilities(normalize(getcontingencytable(Fij)))","category":"page"},{"location":"Information/#Example:-Plotting-the-probabilities-of-each-residue-in-a-sequence","page":"Information","title":"Example: Plotting the probabilities of each residue in a sequence","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Similar to the frequencies function, the probabilities function can take at least one sequence (vector of residues) and returns the probabilities of each residue. Optionally, the keyword argument alphabet could be used to count some residues in the same cell of the table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"probabilities(res\"AARANHDDRDC\", alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Here, we are going to use the probabilities function to get the residue probabilities of a particular sequence from UniProt.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"use the getsequence function, from the MSA module, to get the sequence from a FASTA downloaded from UniProt.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information # to use the probabilities function\nusing MIToS.MSA # to use getsequence on the one sequence FASTA (canonical) from UniProt\nseq = read_file(\"http://www.uniprot.org/uniprot/P29374.fasta\", FASTA) # Small hack: read the single sequence as a MSA\nprobabilities(seq[1, :]) # Select the single sequence and calculate the probabilities","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: Plots\"\nusing 
Plots\ngr(size=(600,300))\nusing MIToS.Information # to use the probabilities function\nusing MIToS.MSA # to use getsequence on the one sequence FASTA (canonical) from UniProt\nseq = read_file(\"http://www.uniprot.org/uniprot/P29374.fasta\", FASTA) # Small hack: read the single sequence as a MSA\nPa = probabilities(seq[1,:]) # Select the single sequence and calculate the probabilities","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using Plots # We choose Plots because it's intuitive, concise and backend independent\ngr(size = (600, 300))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can plot together with the probabilities of each residue in a given sequence, the probabilities of each residue estimated with the BLOSUM62 substitution matrix. That matrix is exported as a constant by the Information module as BLOSUM62_Pi.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"bar(1:20, [Pa BLOSUM62_Pi], lab = [\"Sequence\" \"BLOSUM62\"], alpha = 0.5)\npng(\"inf_plotfreq.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/#Low-count-corrections","page":"Information","title":"Low count corrections","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Low number of observations can lead to sparse contingency tables, that lead to wrong probability estimations. It is shown in Buslje et al. [3] that low-count corrections, can lead to improvements in the contact prediction capabilities of the Mutual Information. 
The Information module has available two low-count corrections:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Additive Smoothing(Image: ); the constant value pseudocount described in Buslje et al. [3].\nBLOSUM62 based pseudo frequencies of residues pairs, similar to Altschul et al. [4].","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nfiltercolumns!(msa, columngapfraction(msa) .< 0.5) # delete columns with 50% gaps or more\n\ncolumn_i = msa[:, 1]\ncolumn_j = msa[:, 2]","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"If you have a preallocated ContingencyTable you can use frequencies! to fill it, this prevent to create a new table as frequencies do. However, you should note that frequencies! adds the new counts to the pre existing values, so in this case, we want to start with a table initialized with zeros.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nconst alphabet = ReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\nNij = ContingencyTable(Float64, Val{2}, alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies!(Nij, column_i, column_j)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In cases like the above, where there are few observations, it is possible to apply a constant pseudocount to the counting table. This module defines the type AdditiveSmoothing and the correspond fill! and apply_pseudocount! 
methods to efficiently add or fill with a constant value each element of the table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"apply_pseudocount!(Nij, AdditiveSmoothing(1.0))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"[High level interface.] The frequencies and frequencies! functions have a pseudocounts keyword argument that can take an AdditiveSmoothing value to easily calculate occurrences with pseudocounts. Also their alphabet keyword argument can be used to change the default alphabet.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies(column_i, column_j, pseudocounts = AdditiveSmoothing(1.0), alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"To use the conditional probability matrix BLOSUM62_Pij in the calculation of pseudo frequencies G for the pair of residues a, b, the real frequencies/probabilities p_ab should be calculated first. The observed probabilities are then used to estimate the pseudo frequencies.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"G_{ab} = \\sum_{cd} p_{cd} \\cdot BLOSUM62( a | c ) \\cdot BLOSUM62( b | d )","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Finally, the probability P of each pair of residues a, b between the columns i, j is the weighted mean between the observed frequency p and BLOSUM62-based pseudo frequency G, where α is generally the number of clusters or the number of sequences of the MSA and β is an empiric weight value. 
β was determined to be close to 8.512.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"P_{ab} = \\frac{\\alpha \\cdot p_{ab} + \\beta \\cdot G_{ab}}{\\alpha + \\beta}","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"This could be easily achieved using the pseudofrequencies keyword argument of the probabilities function. That argument can take a BLOSUM_Pseudofrequencies object that is created with α and β as first and second argument, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Pij = probabilities(\n column_i,\n column_j,\n pseudofrequencies = BLOSUM_Pseudofrequencies(nsequences(msa), 8.512),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can also use apply_pseudofrequencies! on a previously filled probability contingency table, i.e. apply_pseudofrequencies!(Pij, BLOSUM_Pseudofrequencies(α, β))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"warning: Warning\nBLOSUM_Pseudofrequencies can only be applied to normalized/probability tables with UngappedAlphabet.","category":"page"},{"location":"Information/#Correction-for-data-redundancy-in-a-MSA","page":"Information","title":"Correction for data redundancy in a MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"A simple way to reduce redundancy in a MSA without losing sequences, is clusterization and sequence weighting. The weight of each sequence should be 1/N, where N is the number of sequences in its cluster. The Clusters type of the MSA module stores the weights. This vector of weights can be extracted (with the getweight function) and used by the frequencies and probabilities functions with the keyword argument weights. 
Also it's possible to use the Clusters as second argument of the function frequencies!.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"clusters = hobohmI(msa, 62) # from MIToS.MSA","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies(msa[:, 1], msa[:, 2], weights = clusters)","category":"page"},{"location":"Information/#Estimating-information-measures-on-an-MSA","page":"Information","title":"Estimating information measures on an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"The Information module has a number of functions defined to calculate information measures from Frequencies and Probabilities:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"shannon_entropy : Shannon entropy (H)\nmarginal_entropy : Shannon entropy (H) of the marginals\nkullback_leibler : Kullback-Leibler (KL) divergence\nmutual_information : Mutual Information (MI)\nnormalized_mutual_information : Normalized Mutual Information (nMI) by Entropy\ngap_intersection_percentage\ngap_union_percentage","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Information measure functions take optionally the base as a keyword argument (default: ℯ). 
You can set base=2 to measure information in bits.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA\n\nNi = frequencies(res\"PPCDPPPPPKDKKKKDDGPP\") # Ni has the count table of residues in this low complexity sequence\n\nH = shannon_entropy(Ni) # returns the Shannon entropy in nats (base e)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"H = shannon_entropy(Ni, base = 2) # returns the Shannon entropy in bits (base 2)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Information module defines special iteration functions to easily and efficiently compute a measure over a MSA. In particular, mapcolfreq! and mapseqfreq! map a function that takes a table of Frequencies or Probabilities. The table is filled in place with the counts or probabilities of each column or sequence of a MSA, respectively. mapcolpairfreq! and mapseqpairfreq! 
are similar, but they fill the table using pairs of columns or sequences, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"This functions take three positional arguments: the function f to be calculated, the msa and table of Frequencies or Probabilities.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"After that, this function takes some keyword arguments:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"weights (default: NoClustering()) : Weights to be used for table counting.\npseudocounts (default: NoPseudocount()) : Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()) : Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true) : Indicates if the function should be applied to pairs containing the same sequence or column.\ndiagonalvalue (default to zero) : The value that fills the diagonal elements of the table if usediagonal is false.","category":"page"},{"location":"Information/#Example:-Estimating-*H(X)*-and-*H(X,-Y)*-over-an-MSA","page":"Information","title":"Example: Estimating H(X) and H(X, Y) over an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"In this example, we are going to use mapcolfreq! and mapcolpairfreq! 
to estimate the Shannon entropy (shannon_entropy) of MSA columns H(X) and the joint entropy H(X, Y) of column pairs, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: Entropy\"\nusing Plots\ngr()","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"We are going to count residues to estimate the Shannon entropy. The shannon_entropy estimation is performed over a reused Frequencies object. The result will be a vector containing the values estimated over each column without counting gaps (UngappedAlphabet).","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nHx = mapcolfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{1}, UngappedAlphabet())),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"If we want the joint entropy between column pairs, we need to use a bidimensional table of Frequencies and mapcolpairfreq!.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Hxy = mapcolpairfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In the above examples, we indicate the type of each occurrence in the counting and the probability table to use. Also, it's possible for some measures, such as entropy and mutual information, to estimate the values only with the count table (without calculating the probability table). 
Estimating measures only with a ResidueCount table, when this is possible, should be faster than using a probability table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Time_Pab = map(1:100) do x\n time = @elapsed mapcolpairfreq!(\n shannon_entropy,\n msa,\n Probabilities(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n )\nend\n\nTime_Nab = map(1:100) do x\n time = @elapsed mapcolpairfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n )\nend\n\nusing Plots\ngr()\n\nhistogram(\n [Time_Pab Time_Nab],\n labels = [\"Using ResidueProbability\" \"Using ResidueCount\"],\n xlabel = \"Execution time [seconds]\",\n)\n\npng(\"inf_entropy.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/#Corrected-Mutual-Information","page":"Information","title":"Corrected Mutual Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"MIToS ships with two methods to easily calculate corrected mutual information. The first is the algorithm described in Buslje et al. [3]. This algorithm can be accessed through the buslje09 function and includes:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Low count correction using AdditiveSmoothing\nSequence weighting after a hobohmI clustering [2]\nAverage Product Correction (APC) proposed by Dunn et al. [5], through the APC! function that takes a MI matrix.\nZ score correction using the functions shuffle_msa! 
from the MSA module and zscore from the PairwiseListMatrices package.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"buslje09","category":"page"},{"location":"Information/#MIToS.Information.buslje09","page":"Information","title":"MIToS.Information.buslje09","text":"buslje09 takes a MSA and calculates a Z score and a corrected MI/MIp as described on Busjle et al. 2009.\n\nkeyword argument, type, default value and descriptions:\n\n - lambda Float64 0.05 Low count value\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for clustering\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 100 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n - alphabet ResidueAlphabet UngappedAlphabet() Residue alphabet to be used\n\nThis function returns:\n\n - Z score\n - MI or MIp\n\n\n\n\n\n","category":"function"},{"location":"Information/","page":"Information","title":"Information","text":"The second, implemented in the BLMI function, has the same corrections that the above algorithm, but use BLOSUM62 pseudo frequencies. This function is slower than buslje09 (at the same number of samples), but gives better performance (for structural contact prediction) when the MSA has less than 400 clusters after a Hobohm I at 62% identity.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"BLMI","category":"page"},{"location":"Information/#MIToS.Information.BLMI","page":"Information","title":"MIToS.Information.BLMI","text":"BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described on Busjle et al. 
2009 but using using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.\n\nKeyword argument, type, default value and descriptions:\n\n - beta Float64 8.512 β for BLOSUM62 pseudo frequencies\n - lambda Float64 0.0 Low count value\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 50 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n\nThis function returns:\n\n - Z score (ZBLMI)\n - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\n\n\n\n\n\n","category":"function"},{"location":"Information/#Example:-Estimating-corrected-MI-from-an-MSA","page":"Information","title":"Example: Estimating corrected MI from an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: MI\"\nusing Plots\ngr()","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\nusing MIToS.Information\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\nZMIp, MIp = buslje09(msa)\nZMIp","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZBLMIp, BLMIp = BLMI(msa)\nZBLMIp","category":"page"},{"location":"Information/#Visualize-Mutual-Information","page":"Information","title":"Visualize Mutual Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"You can use the function of the Plots package to 
visualize the Mutual Information (MI) network between residues. As an example, we are going to visualize the MI between residues of the Pfam domain PF18883. The heatmap is the simplest way to visualize the values of the Mutual Information matrix.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using Plots\ngr()\n\nheatmap(ZMIp, yflip = true)\npng(\"inf_heatmap.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZMIp is a Z score of the corrected MIp against its distribution on a random MSA (shuffling the residues in each sequence), so pairs with highest values are more likely to co-evolve. Here, we are going to use the top 1% pairs of MSA columns.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using PairwiseListMatrices # to use getlist\nusing Statistics # to use quantile\n\nthreshold = quantile(getlist(ZMIp), 0.99)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZMIp[ZMIp. !occursin(r\"_SULIY\", x), sequencenames(msa)) # an element of mask is true if \"_SULIY\" is not in the name","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"filtersequences!(msa, mask) # deletes all the sequences where mask is false\n\nsequencenames(msa)","category":"page"},{"location":"MSA/#Example:-Exporting-a-MSA-for-freecontact-(part-I)","page":"MSA","title":"Example: Exporting a MSA for freecontact (part I)","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The most simple input for the command line tool freecontact(Image: ) (if you don't want to set --mincontsep) is a Raw MSA file with a reference sequence without insertions or gaps. 
This is easy to get with MIToS using read_file (deletes the insert columns), setreference! (to choose a reference), adjustreference! (to delete columns with gaps in the reference) and write_file (to save it in Raw format) functions.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nfile_name = \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/test/data/PF09645_full.stockholm\"\nmsa = read_file(file_name, Stockholm)\nmsa_coverage = coverage(msa)\nmaxcoverage, maxindex = findmax(msa_coverage)\nsetreference!(msa, maxindex[1]) # the sequence with the highest coverage\nadjustreference!(msa)\nwrite_file(\"tofreecontact.msa\", msa, Raw)\nprint(read_file(\"tofreecontact.msa\", String)) # display output file","category":"page"},{"location":"MSA/#Column-and-sequence-mappings","page":"MSA","title":"Column and sequence mappings","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Inserts in a Stockholm MSA allow to access the full fragment of the aligned sequences. Using this, combined with the sequence names that contain coordinates used in Pfam, you can know what is the UniProt residue number of each residue in the MSA.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"\"PROT_SPECI/3-15 .....insertALIGNED\"\n# 3456789111111\n# 012345","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS read_file and parse_file functions delete the insert columns, but they do the mapping between each residue and its residue number before deleting insert columns when generatemapping is true. 
If you don't set useidcoordinates to true, the residue first i residue will be 1 instead of 3 in the previous example.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = parse_file(\n \"PROT_SPECI/3-15 .....insertALIGNED\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS also keeps the column number of the input MSA and its total number of columns. All this data is stored in the MSA annotations using the SeqMap, ColMap and NCol feature names.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"annotations(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To have an easy access to mapping data, MIToS provides the getsequencemapping and getcolumnmapping functions.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getsequencemapping(msa, \"PROT_SPECI/3-15\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getcolumnmapping(msa)","category":"page"},{"location":"MSA/#Example:-Exporting-a-MSA-for-freecontact-(part-II)","page":"MSA","title":"Example: Exporting a MSA for freecontact (part II)","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If we want to use the --mincontsep argument of freecontact to calculate scores between distant residues, we will need to add a header to the MSA. This header should contains the residue number of the first residue of the sequence and the full fragment of that sequence (with the inserts). This data is used by FreeContact to calculate the residue number of each residue in the reference sequence. 
We are going to use MIToS mapping data to create this header, so we read the MSA with generatemapping and useidcoordinates set to true.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Here, we are going to choose the sequence with more coverage of the MSA as our reference sequence.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa_coverage = coverage(msa)\nmaxcoverage, maxindex = findmax(msa_coverage)\nsetreference!(msa, maxindex[1])\nadjustreference!(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS deletes the residues in insert columns, so we are going to use the sequence mapping to generate the whole fragment of the reference sequence (filling the missing regions with 'x').","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"seqmap = getsequencemapping(msa, 1) # seqmap will be a vector with the residue numbers of the first sequence (reference)\n\nseq = collect(stringsequence(msa, 1)) # seq will be a Vector of Chars with the reference sequence\n\nsequence = map(seqmap[1]:seqmap[end]) do seqpos # for each position in the whole fragment\n if seqpos in seqmap # if that position is in the MSA\n popfirst!(seq) # the residue is taken from seq\n else # otherwise\n 'x' # 'x' is included\n end\nend\n\nsequence = join(sequence) # join the Chars on the Vector to create a string","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Once we have the whole fragment of the sequence, we create the file and write the header in the required format (as in the man page of 
freecontact).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"open(\"tofreecontact.msa\", \"w\") do fh\n println(fh, \"# querystart=\", seqmap[1])\n println(fh, \"# query=\", sequence)\nend","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As last (optional) argument, write_file takes the mode in which is opened the file. We use \"a\" here to append the MSA to the header.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"write_file(\"tofreecontact.msa\", msa, Raw, \"a\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"print(join(first(readlines(\"tofreecontact.msa\"), 5), '\\n')) # It displays the first five lines","category":"page"},{"location":"MSA/#Get-sequences-from-a-MSA","page":"MSA","title":"Get sequences from a MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"It's possible to index the MSA as any other matrix to get an aligned sequence. This will be return a Array of Residues without annotations but keeping names/identifiers.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/test/data/PF09645_full.stockholm\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa[2, :] # second sequence of the MSA, it keeps column names","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa[2:2, :] # Using the range 2:2 to select the second sequence, keeping also the sequence name","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If you want to obtain the aligned sequence with its name and annotations (and therefore sequence and column mappings), you should use the function getsequence. 
This function returns an AlignedSequence with the sequence name from a MultipleSequenceAlignment or an AnnotatedAlignedSequence, that also contains annotations, from an AnnotatedMultipleSequenceAlignment.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"secondsequence = getsequence(msa, 2)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"annotations(secondsequence)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Use stringsequence if you want to get the sequence as a string.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"stringsequence(msa, 2)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Because matrices are stored columnwise in Julia, you will find useful the getresiduesequences function when you need to heavily operate over sequences.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getresiduesequences(msa)","category":"page"},{"location":"MSA/#Describing-your-MSA","page":"MSA","title":"Describing your MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The MSA module has a number of functions to gain insight about your MSA. 
Using MIToS.MSA, one can easily ask for...","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The number of columns and sequences with the ncolumns and nsequences functions.\nThe fraction of columns with residues (coverage) for each sequence making use of the coverage method.\nThe fraction or percentage of gaps/residues using with the functions gapfraction, residuefraction and columngapfraction.\nThe percentage of identity (PID) between each sequence of the MSA or its mean value with percentidentity and meanpercentidentity.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The percentage identity between two aligned sequences is a common measure of sequence similarity and is used by the hobohmI method to estimate and reduce MSA redundancy. MIToS functions to calculate percent identity don't align the sequences, they need already aligned sequences. Full gaps columns don't count to the alignment length.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = permutedims(hcat(\n res\"--GGG-\", # res\"...\" uses the @res_str macro to create a (column) Vector{Residue}\n res\"---GGG\",\n), (2, 1))\n# identities 000110 sum 2\n# aligned residues 001111 sum 4","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"percentidentity(msa[1, :], msa[2, :]) # 2 / 4","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To quickly calculate if the percentage of identity is greater than a determined value, use that threshold as third argument. 
percentidentity(seqa, seqb, pid) is a lot more faster than percentidentity(seqa, seqb) >= pid.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"percentidentity(msa[1, :], msa[2, :], 62) # 50% >= 62%","category":"page"},{"location":"MSA/#Example:-Plotting-gap-percentage-per-column-and-coverage-per-sequence","page":"MSA","title":"Example: Plotting gap percentage per column and coverage per sequence","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The gapfraction and coverage functions return a vector of numbers between 0.0 and 1.0 (fraction of...). Sometime it's useful to plot this data to quickly understand the MSA structure. In this example, we are going to use the Plots(Image: ) package for plotting, with the GR(Image: ) backend, but you are free to use any of the Julia plotting libraries.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: Plots\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nusing Plots\n\ngr(size = (600, 300))\n\nplot(\n # x is a range from 1 to the number of columns\n 1:ncolumns(msa),\n # y is a Vector{Float64} with the percentage of gaps of each column\n vec(columngapfraction(msa)) .* 100.0,\n linetype = :line,\n ylabel = \"gaps [%]\",\n xlabel = \"columns\",\n legend = false,\n)\n\npng(\"msa_gaps.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(\n # x is a range from 1 to the number of sequences\n 1:nsequences(msa),\n # y is a Vector{Float64} with the coverage of each sequence\n vec(coverage(msa)) .* 100,\n linetype = :line,\n ylabel = \"coverage [%]\",\n xlabel = \"sequences\",\n 
legend = false,\n)\n\npng(\"msa_coverage.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(msa)\npng(\"msa_msa.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Example:-Filter-sequences-per-coverage-and-columns-per-gap-fraction","page":"MSA","title":"Example: Filter sequences per coverage and columns per gap fraction","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Taking advantage of the filter...! functions and the coverage and columngapfraction functions, it's possible to delete short sequences or columns with a lot of gaps.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"println(\"\\tsequences\\tcolumns\")\nprintln(\"Before:\\t\", nsequences(msa), \"\\t\\t\", ncolumns(msa))\n# delete sequences with less than 90% coverage of the MSA length:\nfiltersequences!(msa, coverage(msa) .>= 0.9)\n# delete columns with more than 10% of gaps:\nfiltercolumns!(msa, columngapfraction(msa) .<= 0.1)\nprintln(\"After:\\t\", nsequences(msa), \"\\t\\t\", ncolumns(msa))","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(\n vec(columngapfraction(msa)),\n # Using vec() to get a Vector{Float64} with the fraction of gaps of each column\n xlabel = \"gap fraction in [0,1]\",\n bins = 20,\n legend = false,\n)\npng(\"msa_hist_gaps.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(\n vec(coverage(msa) .* 100.0), # Column with the coverage of each sequence\n xlabel = \"coverage [%]\",\n legend = false,\n)\npng(\"msa_hist_coverage.png\") # hide\nnothing # 
hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Example:-Plotting-the-percentage-of-identity-between-sequences","page":"MSA","title":"Example: Plotting the percentage of identity between sequences","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The distribution of the percentage of identity between every pair of sequences in an MSA, gives an idea of the MSA diversity. In this example, we are using percentidentity over an MSA to get those identity values.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\npid = percentidentity(msa)\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS stores the matrix of percentage of identity between the aligned sequences as a PairwiseListMatrix from the PairwiseListMatrices(Image: ) package. This matrix type saves RAM, allowing the storage of big matrices. 
In this example, we use the to_table function of PairwiseListMatrices to convert the matrix into a table with indices.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using PairwiseListMatrices\n\npidtable = to_table(pid, diagonal = false)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The function quantile gives a quick idea of the percentage identity distribution of the MSA.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Statistics\n\nquantile(convert(Vector{Float64}, pidtable[:, 3]), [0.00, 0.25, 0.50, 0.75, 1.00])","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The function meanpercentidentity gives the mean value of the percent identity distribution for MSA with less than 300 sequences, or a quick estimate (mean PID in a random sample of sequence pairs) otherwise unless you set exact to true.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"meanpercentidentity(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"One can easily plot that matrix and its distribution using the heatmap and histogram functions of the Plots(Image: ) package.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: PID\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Plots\ngr()\nheatmap(convert(Matrix, pid), yflip = true, ratio = :equal)\npng(\"msa_heatmap_pid.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(pidtable[:, 3], xlabel = \"Percentage of identity\", legend = false)\npng(\"msa_hist_pid.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: 
)","category":"page"},{"location":"MSA/#Sequence-clustering","page":"MSA","title":"Sequence clustering","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The MSA module allows to clusterize sequences in an MSA. The hobohmI function takes as input an MSA followed by an identity threshold value, and returns a Clusters type with the result of a Hobohm I sequence clustering [2]. The Hobohm I algorithm will add a sequence to an existing cluster, if the percentage of identity is equal or greater than the threshold. The Clusters is sub-type of ClusteringResult from the Clustering.jl(Image: ) package. One advantage of use a sub-type of ClusteringResultis that you are able to use any method defined on Clustering.jl like varinfo (Variation of Information) for example. Also, you can use any clustering algorithm included in Clustering.jl, and convert its result to an Clusters object to use it with MIToS. MSA defines the functions nclusters to get the resulting number of clusters, counts to get the number of sequences on each cluster and assignments to get the cluster number of each sequence. The most important method is getweight, which returns the weight of each sequence. This method is used in the Information module of MIToS to reduce redundancy.","category":"page"},{"location":"MSA/#Example:-Reducing-redundancy-of-a-MSA","page":"MSA","title":"Example: Reducing redundancy of a MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MSAs can suffer from an unnatural sequence redundancy and a high number of protein fragments. In this example, we are using a sequence clustering to make a non-redundant set of representative sequences. 
We are going to use the function hobohmI to perform the clustering with the Hobohm I algorithm at 62% identity.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: Clusters\"\nusing Plots\nusing StatsPlots\nusing DataFrames\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nusing Clustering # to use the nclusters and assignments functions\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nprintln(\"This MSA has \", nsequences(msa), \" sequences...\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"clusters = hobohmI(msa, 62)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"println(\n \"...but has only \",\n nclusters(clusters),\n \" sequence clusters after a clustering at 62% identity.\",\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Plots\ngr()\n\nplot(msa)\npng(\"msa_clusters_i.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We are going to use the DataFrames(Image: ) package to easily select the sequence with the highest coverage of each cluster.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using DataFrames\n\ndf = DataFrame(\n seqnum = 1:nsequences(msa),\n seqname = sequencenames(msa),\n cluster = assignments(clusters), # the cluster number/index of each sequence\n coverage = vec(coverage(msa)),\n)\n\nfirst(df, 5)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"It is possible to use this DataFrame and Plots to plot the sequence coverage of the MSA and also an histogram of the number of sequences in each cluster:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using 
StatsPlots # Plotting DataFrames\nh = @df df histogram(:cluster, ylabel = \"nseq\")\np = @df df plot(:cluster, :coverage, linetype = :scatter)\nplot(p, h, nc = 1, xlim = (0, nclusters(clusters) + 1), legend = false)\npng(\"msa_clusters_ii.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We use the Split-Apply-Combine strategy, though the groupby and combine function of the DataFrames package, to select the sequence of highest coverage for each cluster.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"grouped_df = groupby(df, :cluster)\n\nmaxcoverage = combine(grouped_df) do cl\n row_index = findmax(cl.coverage)[2]\n cl[row_index, [:seqnum, :seqname, :coverage]]\nend\n\nfirst(maxcoverage, 5)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"p = @df maxcoverage plot(:cluster, :coverage, linetype = :scatter)\nh = @df maxcoverage histogram(:cluster, ylabel = \"nseq\")\nplot(p, h, nc = 1, xlim = (0, nclusters(clusters) + 1), legend = false)\npng(\"msa_clusters_iii.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We can easily generate a mask using list comprehension, to select only the representative sequences of the MSA (deleting the rest of the sequences with filtersequences!).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"cluster_references = Bool[seqnum in maxcoverage.seqnum for seqnum = 1:nsequences(msa)]","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"filtersequences!(msa, cluster_references)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(msa)\npng(\"msa_clusters_iv.png\") # hide\nnothing # 
hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Concatenating-MSAs","page":"MSA","title":"Concatenating MSAs","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Concatenating multiple sequence alignments can be helpful in various bioinformatics applications. It allows researchers to combine the alignments of different sequences or regions into a single MSA for further analysis. Examples of this maneuver are concatenating two protein sequences from the same organism to estimate coevolution among those proteins or to model the protein-protein interaction using tools such as AlphaFold.","category":"page"},{"location":"MSA/#Horizontal-and-Vertical-Concatenation","page":"MSA","title":"Horizontal and Vertical Concatenation","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We can concatenate two MSAs as matrices using Julia's hcat and vcat functions. However, MIToS defines special methods for these functions on MSA objects to deal with sequence and column names and annotations. To use hcat, we only need the MSA having the same number of sequences. The hcat function will concatenate the first sequence of the first MSA with the first sequence of the second MSA, and so on. 
For example, let's define two small MSAs msa_a and msa_b, and concatenate them horizontally:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R' 'N'\n 'D' 'C' 'Q'\n]);\nrename_sequences!(msa_a, [\"SEQ1_A\", \"SEQ2_A\"])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'Q'\n 'E' 'G'\n]);\nrename_sequences!(msa_b, [\"SEQ1_B\", \"SEQ2_B\"])\nconcatenated_msa = hcat(msa_a, msa_b)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As you might have noticed, the hcat function preserves the sequence names by concatenating them using _&_ as a separator. So, the first sequence of the concatenated MSA is SEQ1_A_&_SEQ1_B. Also, the column names have changed in the concatenated MSA. For example, the first column of msa_a is now the first column of concatenated_msa, but its name changed from 1 to 1_1. The hcat function renames the columns so that the first number, the one before the underscore, indicates the index of the sub-MSA. The first sub-MSA in the concatenated MSA is 1, the second sub-MSA is 2, and so on. This allows you to track the origin of each column in the concatenated MSA. You can access a vector of those indices using the gethcatmapping function:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"gethcatmapping(concatenated_msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If we perform multiple concatenations—i.e., if we call hcat on an MSA output of another call to hcat—the hcat function will remember the sub-MSA boundaries to continue the numeration accordingly. 
For example, let's create and add a third MSA:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa_c = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'H'\n 'A' 'H'\n]);\nrename_sequences!(msa_c, [\"SEQ1_C\", \"SEQ2_C\"])\nhcat(concatenated_msa, msa_c)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As you can see, the hcat function detects the previous concatenation and continues the indexing from the last MSA. So that column 1 of msa_c is now 3_1 in the concatenated MSA. The hcat function can take more than two MSAs as arguments. For example, you can get the same result as above by calling hcat(msa_a, msa_b, msa_c).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To concatenate MSAs vertically, you can use the vcat function. The only requirement is that the MSAs have the same number of columns. For example, let's define two small MSAs. The first column of msa_a will be concatenated with the first column of msa_b, and so on:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R'\n 'D' 'C'\n 'E' 'G'\n])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'Q'\n 'D' 'R'\n])\nconcatenated_msa = vcat(msa_a, msa_b)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"In this case, vcat adds the MSA index prefix to the sequence names. So, the sequence 1 of msa_a is now 1_1 in the concatenated MSA. The vcat function, similar to hcat, can take more than two MSAs as arguments in case you need to concatenate multiple alignments vertically.","category":"page"},{"location":"MSA/#Joining-MSAs","page":"MSA","title":"Joining MSAs","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Sometimes, you may need to join or merge two MSAs, having different number of sequences or columns. For such cases, MIToS provides the join_msas function. 
This function allows you to join two MSAs based on specified matching positions or names. It supports different types of joins: inner, outer, left, and right. You can indicate the positions or names to match using an iterable of pairs or separate lists of positions or names. For example, using a vector of Pair objects, you can identify which positions on the first MSA (the first element of the pair) should match with which positions on the second MSA (the second element of the pair). Let's see that in one fictional example:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R' 'D'\n 'G' 'K' 'E'\n 'G' 'R' 'D'\n]);\nrename_sequences!(msa_a, [\"aa_HUMAN\", \"bb_MOUSE\", \"cc_YEAST\"])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'A'\n 'E' 'G'\n 'E' 'A'\n]);\nrename_sequences!(msa_b, [\"AA_HUMAN\", \"BB_MOUSE\", \"CC_SHEEP\"])\npairing = [\"aa_HUMAN\" => \"AA_HUMAN\", \"bb_MOUSE\" => \"BB_MOUSE\"]\njoin_msas(msa_a, msa_b, pairing)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As we can see, the join_msas function has matched the sequences on both MSAs based on the specified pairing—in this example, we create a dictionary to pair the sequences from the same species. The join_msas have two important keyword arguments: kind and axis. By default, the function performs an outer join (kind = :outer) and matches the sequences (axis = 1). You can change these arguments to perform other kinds of joins or to match the columns. 
Since we performed an outer join, the resulting MSA contains all sequences from both input MSAs, and join_msas have added gaps where the sequences do not match.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"EditURL = \"cookbook/03_RMSF.jl\"","category":"page"},{"location":"03_RMSF/#Root-Mean-Squared-Fluctuation-(RMSF)","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"md # (Image: ) md # (Image: )","category":"page"},{"location":"03_RMSF/#Problem-description","page":"Root Mean Squared Fluctuation (RMSF)","title":"Problem description","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The Root Mean Squared Fluctuation (RMSF) is a common way to measure residue flexibility in a structural ensemble. It is a measure of how far is the residue moving from its average position in the group of structures. Usually, we represent a residue position with the spatial coordinates of its alpha carbon.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The protein structures should be previously superimposed to calculate the RMSF, for example, by using the superimpose function of the PDB module of MIToS. In this example, we are going to measure the RMSF of each residue from an NMR ensemble using the rmsf function.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The structure superimposition could be the most complicated step of the process, depending on the input data. 
In particular, if the structures come from different PDB structures or homologous proteins, it can require the use of external programs, such as MAMMOTH-mult or MUSTANG among others, tailored for this task.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"In this case, we are going to use an NMR ensemble. Therefore, we are not going to need to superimpose the structures as NMR models have the same protein sequence and are, usually, well-aligned.","category":"page"},{"location":"03_RMSF/#MIToS-solution","page":"Root Mean Squared Fluctuation (RMSF)","title":"MIToS solution","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"import MIToS\nusing MIToS.PDB\nusing Plots","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"Let's read the NMR ensemble:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"pdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"1AS5.pdb\")\npdb_res = read_file(pdb_file, PDBFile, occupancyfilter = true)\nnothing # hide","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"We set occupancyfilter to true to ensure that we have one single set of coordinates for each atom. That filter isn't essential for NMR structures, but it can avoid multiple alpha carbons in crystallographic structures with disordered atoms. 
We can get an idea of the alpha carbon positions by plotting these residues:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"scatter(pdb_res, legend = false)","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"As we saw in the previous plot, the structure doesn't need to be superimposed. Now, we are going to separate each model into different vectors, storing each vector into a Dict:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"models = Dict{String,Vector{PDBResidue}}()\nfor res in pdb_res\n push!(get!(models, res.id.model, []), res)\nend","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"Then, we simply need to collect all the PDB models in the values of the Dict, to get the vector of PDBResidues vectors required to calculate the RMSF.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"pdb_models = collect(values(models))\nnothing # hide","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"And, finally, call the rmsf function on the list of structures. It is important that all the vectors have the same number of PDBResidues. 
This function assumes that the nth element of each vector corresponds to the same residue:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"RMSF = rmsf(pdb_models)","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"This returns the vector of RMSF values for each residue, calculated using the coordinates of the alpha carbons. You can plot this vector to get an idea of which are the most flexible positions in your structure:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"plot(RMSF, legend = false, xlab = \"Residue\", ylab = \"RMSF [Å]\")","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"@info \"Utils API docs\"","category":"page"},{"location":"Utils_API/#API-Utils","page":"Utils","title":"Utils","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"MIToS.Utils","category":"page"},{"location":"Utils_API/#MIToS.Utils","page":"Utils","title":"MIToS.Utils","text":"The Utils has common utils functions and types used in other modules.\n\nusing MIToS.Utils\n\n\n\n\n\n","category":"module"},{"location":"Utils_API/#Contents","page":"Utils","title":"Contents","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Pages = [\"Utils_API.md\"]\nDepth = 
2","category":"page"},{"location":"Utils_API/#Types","page":"Utils","title":"Types","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Utils_API/#MIToS.Utils.All","page":"Utils","title":"MIToS.Utils.All","text":"All is used instead of MIToS 1.0 \"all\" or \"*\", because it's possible to dispatch on it.\n\n\n\n\n\n","category":"type"},{"location":"Utils_API/#MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.FileFormat","text":"FileFormat is used to define special parse_file (called by read_file) and print_file (called by write_file) methods for different file formats.\n\n\n\n\n\n","category":"type"},{"location":"Utils_API/#Constants","page":"Utils","title":"Constants","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Utils_API/#MIToS.Utils.THREE2ONE","page":"Utils","title":"MIToS.Utils.THREE2ONE","text":"THREE2ONE is a dictionary that maps three-letter amino acid residue codes (String) to their corresponding one-letter codes (Char). 
The dictionary is generated by parsing components.cif file from the Protein Data Bank.\n\njulia> using MIToS.Utils\n\njulia> one_letter_code = THREE2ONE[\"ALA\"]\n'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)\n\n\n\n\n\n","category":"constant"},{"location":"Utils_API/#Macros","page":"Utils","title":"Macros","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Utils_API/#Methods-and-functions","page":"Utils","title":"Methods and functions","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"Utils_API/#MIToS.Utils.check_file-Tuple{Any}","page":"Utils","title":"MIToS.Utils.check_file","text":"Returns the filename. Throws an ErrorException if the file doesn't exist, or a warning if the file is empty.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.check_pdbcode-Tuple{String}","page":"Utils","title":"MIToS.Utils.check_pdbcode","text":"It checks if a PDB code has the correct format.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.download_file-Tuple{AbstractString, AbstractString}","page":"Utils","title":"MIToS.Utils.download_file","text":"download_file uses Downloads.jl to download files from the web. It takes the file url as first argument and, optionally, a path to save it. 
Keyword arguments are directly passed to Downloads.download.\n\njulia> using MIToS.Utils\n\njulia> download_file(\"https://www.uniprot.org/uniprot/P69905.fasta\", \"seq.fasta\")\n\"seq.fasta\"\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.get_n_words-Tuple{String, Int64}","page":"Utils","title":"MIToS.Utils.get_n_words","text":"get_n_words{T <: Union{ASCIIString, UTF8String}}(line::T, n::Int) It returns a Vector{T} with the first n (possible) words/fields (delimited by space or tab). If there are more than n words, the last word returned contains the final words and the delimiters. The length of the returned vector is n or less (if the number of words is less than n). This is used for parsing the Stockholm format.\n\njulia> using MIToS.Utils\n\njulia> get_n_words(\"#=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH\", 3)\n3-element Vector{String}:\n \"#=GR\"\n \"O31698/18-71\"\n \"SS CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH\"\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.getarray-Tuple{NamedArrays.NamedArray}","page":"Utils","title":"MIToS.Utils.getarray","text":"Getter for the array field of NamedArrays\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.hascoordinates-Tuple{Any}","page":"Utils","title":"MIToS.Utils.hascoordinates","text":"hascoordinates(id) It returns true if id/sequence name has the format: UniProt/start-end (i.e. 
O83071/192-246)\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.isnotemptyfile-Tuple{Any}","page":"Utils","title":"MIToS.Utils.isnotemptyfile","text":"Returns true if the file exists and isn't empty.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.lineiterator-Tuple{String}","page":"Utils","title":"MIToS.Utils.lineiterator","text":"Create an iterable object that will yield each line from a stream or string.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.list2matrix-Union{Tuple{T}, Tuple{AbstractVector{T}, Int64}} where T","page":"Utils","title":"MIToS.Utils.list2matrix","text":"Returns a square symmetric matrix from the vector vec. side is the number of rows/columns. The diagonal is not included by default, set to true if there are diagonal elements in the list.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.matrix2list-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Utils","title":"MIToS.Utils.matrix2list","text":"Returns a vector with the part (\"upper\" or \"lower\") of the square matrix mat. The diagonal is not included by default.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.read_file-Union{Tuple{T}, Tuple{AbstractString, Type{T}, Vararg{Any}}} where T<:MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.read_file","text":"read_file(pathname, FileFormat [, Type [, … ] ] ) -> Type\n\nThis function opens a file in the pathname and calls parse_file(io, ...) for the given FileFormat and Type on it. If the pathname is an HTTP or FTP URL, the file is downloaded with download in a temporal file. Gzipped files should end on .gz.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.select_element-Union{Tuple{Vector{T}}, Tuple{T}, Tuple{Vector{T}, String}} where T","page":"Utils","title":"MIToS.Utils.select_element","text":"Selects the first element of the vector. This is useful for unpacking one element vectors. 
Throws a warning if there are more elements. element_name is element by default, but the name can be changed using the second argument.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.write_file-Union{Tuple{T}, Tuple{AbstractString, Any, Type{T}}, Tuple{AbstractString, Any, Type{T}, String}} where T<:MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.write_file","text":"write_file{T<:FileFormat}(filename::AbstractString, object, format::Type{T}, mode::ASCIIString=\"w\")\n\nThis function opens a file with filename and mode (default: \"w\") and writes (print_file) the object with the given format. Gzipped files should end on .gz.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"@info \"SIFTS docs\"","category":"page"},{"location":"SIFTS/#Module-SIFTS","page":"SIFTS","title":"SIFTS","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The SIFTS module of MIToS allows to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes easy to assign PDB residues to UniProt/Pfam positions. 
Given the fact that pairwise alignments can lead to misleading association between residues in both sequences, SIFTS offers more reliable association between sequence and structure residue numbers.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS # to load the SIFTS module","category":"page"},{"location":"SIFTS/#Features","page":"SIFTS","title":"Features","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Download and parse SIFTS XML files\nStore residue-level mapping in Julia\nEasy generation of Dicts between residue numbers","category":"page"},{"location":"SIFTS/#Contents","page":"SIFTS","title":"Contents","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Pages = [\"SIFTS.md\"]\nDepth = 4","category":"page"},{"location":"SIFTS/#Simplest-residue-level-mapping","page":"SIFTS","title":"Simplest residue-level mapping","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"This module exports the function siftsmapping to generate a Dict between residue numbers. This function takes 5 positional arguments.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The name of the SIFTS XML file to parse,\nthe source database,\nthe source protein/structure identifier,\nthe destination database and,\nthe destination protein/structure identifier. Optionally it’s possible to indicate a particular PDB chain and if missings will be used.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Databases should be indicated using an available sub-type of DataBase. Key and value types will depend on the residue number type in that database.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Type db... 
Database Residue number type\ndbPDBe PDBe (Protein Data Bank in Europe) Int\ndbInterPro InterPro String\ndbUniProt UniProt Int\ndbPfam Pfam (Protein families database) Int\ndbNCBI NCBI (National Center for Biotechnology Information) Int\ndbPDB PDB (Protein Data Bank) String\ndbCATH CATH String\ndbSCOP SCOP (Structural Classification of Proteins) String\ndbEnsembl Ensembl String","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"To download the XML SIFTS file of a determined PDB use the downloadsifts function.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\n\nimport MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsfile = downloadsifts(\"1IVO\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The following example, shows the residue number mapping between Pfam and PDB. Pfam uses UniProt coordinates and PDB uses their own residue numbers with insertion codes. 
Note that the siftsmapping function is case sensitive, and that SIFTS stores PDB identifiers using lowercase characters.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsmap = siftsmapping(\n siftsfile,\n dbPfam,\n \"PF00757\",\n dbPDB,\n \"1ivo\", # SIFTS stores PDB identifiers in lowercase\n chain = \"A\", # In this example we are only using the chain A of the PDB\n missings = false,\n) # Residues without coordinates aren't used in the mapping","category":"page"},{"location":"SIFTS/#Storing-residue-level-mapping","page":"SIFTS","title":"Storing residue-level mapping","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"If you need more than the residue number mapping between two databases, you could access all the residue-level cross references using the function read_file in the SIFTSXMLFile.Format file. The parse_file function (and therefore the read_file function) for the SIFTSXML format, also takes the keyword arguments chain and missings. The read_file/parse_file function returns a Vector of SIFTSResidues objects that stores the cross references between residues in each database.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsresidues = read_file(siftsfile, SIFTSXML, chain=\"A\", missings=false) # Array{SIFTSResidue,1}\nresidue_data = siftsresidues[301];","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"You are free to access the SIFTSResidue fields in order to get the desired information. SIFTSResidue objects contain db... objects (sub-types of DataBase), with the cross referenced information. You should note that, except for the PDBe and InterPro fields, the field values can be missing. The ismissing function is helpful to know if there is a db... object. 
For example, getting the UniProt residue name (one letter code of the amino acid) would be:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"ismissing(residue_data.UniProt) ? \"\" : residue_data.UniProt.name","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"That line of code returns an empty string if the UniProt field is missing. Otherwise, it returns a string with the name of the residue in UniProt. Because that way of accessing values in a SIFTS residue is too verbose, MIToS defines a more complex signature for get. Using MIToS get, the previous line of code will be:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# SIFTSResidue database field default\nget(residue_data, dbUniProt, :name, \"\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"There is no need to use the full signature. Other signatures are possible depending on the value you want to access. In particular, a missing object is returned if a default value is not given at the end of the signature and the value to access is missing:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"import MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\")\n\nusing MIToS.SIFTS\nresidue_data = read_file(siftsfile, SIFTSXML)[301]; # hide","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"get(residue_data, dbUniProt) # get takes the database type (`db...`)\nget(residue_data, dbUniProt, :name) # and can also take a field name (Symbol)","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"But you don't need the get function to access the three letter code of the residue in PDBe because the PDBe field can not be 
missing.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"residue_data.PDBe.name","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"SIFTSResidue also store information about if that residue is missing (i.e. not resolved) in the PDB structure and the information about the secondary structure (sscode and ssname):","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"residue_data.missing\nresidue_data.sscode\nresidue_data.ssname","category":"page"},{"location":"SIFTS/#Accessing-residue-level-cross-references","page":"SIFTS","title":"Accessing residue-level cross references","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"You can ask for particular values in a single SIFTSResidue using the get function.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nresidue_data = read_file(siftsfile, SIFTSXML)[301]\n# Is the UniProt residue name in the list of basic amino acids [\"H\", \"K\", \"R\"]?\nget(residue_data, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Use higher order functions and lambda expressions (anonymous functions) or list comprehension to easily ask for information on the Vector{SIFTSResidue}. 
You can use get with the previous signature or simple direct field access and ismissing.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# Captures PDB residue numbers if the Pfam id is \"PF00757\"\nresnums = [\n res.PDB.number for res in siftsresidues if\n !ismissing(res.PDB) && get(res, dbPfam, :id, \"\") == \"PF00757\"\n]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Useful higher order functions are:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"findall","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# Which of the residues have UniProt residue names in the list [\"H\", \"K\", \"R\"]? (basic residues)\nindexes = findall(res -> get(res, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"], siftsresidues)","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"map","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"map(i -> siftsresidues[i].UniProt, indexes) # UniProt data of the basic residues","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"filter","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# SIFTSResidues with UniProt names in [\"H\", \"K\", \"R\"]\nbasicresidues =\n filter(res -> get(res, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"], siftsresidues)\n\nbasicresidues[1].UniProt # UniProt data of the first basic residue","category":"page"},{"location":"SIFTS/#Example:-Which-residues-are-missing-in-the-PDB-structure","page":"SIFTS","title":"Example: Which residues are missing in the PDB structure","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Given that SIFTSResidue objects store a missing residue flag, it’s easy to get a vector where there is a true value if the residue is missing in the 
structure.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"import MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nsifts_1ivo = read_file(siftsfile, SIFTSXML, chain = \"A\"); # SIFTSResidues of the 1IVO chain A\n[res.missing for res in sifts_1ivo]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"However, if you need to filter using other conditions, you’ll find useful the get function. In this example, we are going to ask for the UniProt id (to avoid problems with fragments, tags or chimeric/fusion proteins). We are also using get to select an specific PDB chain.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\n\nimport MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1jqz.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsfile = downloadsifts(\"1JQZ\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nsifts_1jqz = read_file(siftsfile, SIFTSXML); # It has an amino terminal his tag\nmissings = [\n (\n (get(res, dbUniProt, :id, \"\") == \"P05230\") &\n (get(res, dbPDB, :chain, \"\") == \"A\") &\n res.missing\n ) for res in sifts_1jqz\n];\nprintln(\n \"There are only \",\n sum(missings),\n \" missing residues in the chain A, associated to UniProt P05230\",\n)\nprintln(\n \"But there are \",\n sum([res.missing for res in sifts_1jqz]),\n \" missing residues in the PDB file.\",\n)","category":"page"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"@info \"Pfam API 
docs\"","category":"page"},{"location":"Pfam_API/#Pfam","page":"Pfam","title":"Pfam","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"MIToS.Pfam","category":"page"},{"location":"Pfam_API/#MIToS.Pfam","page":"Pfam","title":"MIToS.Pfam","text":"The Pfam module, defines functions to measure the protein contact prediction performance of information measure between column pairs from a Pfam MSA.\n\nFeatures\n\nRead and download Pfam MSAs\nObtain PDB information from alignment annotations\nMap between sequence/alignment residues/columns and PDB structures\nMeasure of AUC (ROC curve) for contact prediction of MI scores\n\nusing MIToS.Pfam\n\n\n\n\n\n","category":"module"},{"location":"Pfam_API/#Contents","page":"Pfam","title":"Contents","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Pages = [\"Pfam_API.md\"]\nDepth = 2","category":"page"},{"location":"Pfam_API/#Types","page":"Pfam","title":"Types","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Pfam_API/#Constants","page":"Pfam","title":"Constants","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Pfam_API/#Macros","page":"Pfam","title":"Macros","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Pfam_API/#Methods-and-functions","page":"Pfam","title":"Methods and functions","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = 
[:function]","category":"page"},{"location":"Pfam_API/#MIToS.Pfam.downloadpfam-Tuple{String}","page":"Pfam","title":"MIToS.Pfam.downloadpfam","text":"It downloads a gzipped Stockholm alignment from InterPro for the Pfam family with the given pfamcode.\n\nBy default, it downloads the full Pfam alignment. You can use the alignment keyword argument to download the seed or the uniprot alignment instead. For example, downloadpfam(\"PF00069\") will download the full alignment for the PF00069 Pfam family, while downloadpfam(\"PF00069\", alignment=\"seed\") will download the seed alignment of the family.\n\nThe extension of the downloaded file is .stockholm.gz by default; you can change it using the filename keyword argument, but the .gz at the end is mandatory.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.getcontactmasks-Union{Tuple{Vector{T}}, Tuple{T}} where T<:AbstractFloat","page":"Pfam","title":"MIToS.Pfam.getcontactmasks","text":"This function takes a msacontacts or its list of contacts contact_list with 1.0 for true contacts and 0.0 for non-contacts (NaN or other numbers for missing values). Returns two BitVectors, the first with trues where contact_list is 1.0 and the second with trues where contact_list is 0.0. These are useful for AUC calculations.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.getseq2pdb-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"Pfam","title":"MIToS.Pfam.getseq2pdb","text":"Generates from a Pfam msa a Dict{String, Vector{Tuple{String,String}}}. 
Keys are sequence IDs and each value is a list of tuples containing PDB code and chain.\n\njulia> getseq2pdb(msa)\nDict{String,Array{Tuple{String,String},1}} with 1 entry:\n \"F112_SSV1/3-112\" => [(\"2VQC\",\"A\")]\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.hasresidues-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, AbstractDict{Int64, String}}","page":"Pfam","title":"MIToS.Pfam.hasresidues","text":"Returns a BitVector where there is a true for each column with PDB residue.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.msacolumn2pdbresidue-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, Vararg{String, 5}}","page":"Pfam","title":"MIToS.Pfam.msacolumn2pdbresidue","text":"msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)\n\nThis function returns a OrderedDict{Int,String} with MSA column numbers on the input file as keys and PDB residue numbers (\"\" for missings) as values. The mapping is performed using SIFTS. This function needs correct ColMap and SeqMap annotations. This checks correspondence of the residues between the MSA sequence and SIFTS (It throws a warning if there are differences). Missing residues are included if the keyword argument missings is true (default: true). If the keyword argument strict is true (default: false), throws an Error, instead of a Warning, when residues don't match. If the keyword argument checkpdbname is true (default: false), throws an Error if the three letter name of the PDB residue isn't the MSA residue. If you are working with a downloaded Pfam MSA without modifications, you should read it using generatemapping=true and useidcoordinates=true. If you don't indicate the path to the siftsfile used in the mapping, this function downloads the SIFTS file in the current folder. 
If you don't indicate the Pfam accession number (pfamid), this function tries to read the AC file annotation.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.msacontacts","page":"Pfam","title":"MIToS.Pfam.msacontacts","text":"This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:\n\nThe first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.\nThe second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.\n\nmsacontacts returns a PairwiseListMatrix{Float64,false} of 0.0 and 1.0 where 1.0 indicates a residue contact. Contacts are defined with an inter residue distance less or equal to distance_limit (default to 6.05) angstroms between any heavy atom. NaN indicates a missing value.\n\n\n\n\n\n","category":"function"},{"location":"Pfam_API/#MIToS.Pfam.msaresidues-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, AbstractDict{String, MIToS.PDB.PDBResidue}, AbstractDict{Int64, String}}","page":"Pfam","title":"MIToS.Pfam.msaresidues","text":"This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:\n\nThe first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.\nThe second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.\n\nmsaresidues returns an OrderedDict{Int,PDBResidue} from input column number (ColMap) to PDBResidue. Residues on inserts are not included.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"@info \"PDB API docs\"","category":"page"},{"location":"PDB_API/#PDB","page":"PDB","title":"PDB","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"MIToS.PDB","category":"page"},{"location":"PDB_API/#MIToS.PDB","page":"PDB","title":"MIToS.PDB","text":"The module PDB defines types and methods to work with protein structures inside Julia. 
It is useful to link structural and sequential information, and needed to measure the predictive performance of mutual information scores at protein contact prediction.\n\nFeatures\n\nRead and parse PDB and PDBML files\nCalculate distance and contacts between atoms or residues\nDetermine interaction between residues\n\nusing MIToS.PDB\n\n\n\n\n\n","category":"module"},{"location":"PDB_API/#Contents","page":"PDB","title":"Contents","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Pages = [\"PDB_API.md\"]\nDepth = 2","category":"page"},{"location":"PDB_API/#Types","page":"PDB","title":"Types","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"PDB_API/#MIToS.PDB.Coordinates","page":"PDB","title":"MIToS.PDB.Coordinates","text":"A Coordinates object is a fixed size vector with the coordinates x,y,z.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBAtom","page":"PDB","title":"MIToS.PDB.PDBAtom","text":"A PDBAtom object contains the information from a PDB atom, without information of the residue. It has the following fields that you can access at any moment for query purposes:\n\n- `coordinates` : x,y,z coordinates, e.g. `Coordinates(109.641,73.162,42.7)`.\n- `atom` : Atom name, e.g. `\"CA\"`.\n- `element` : Element type of the atom, e.g. `\"C\"`.\n- `occupancy` : A float number with the occupancy, e.g. `1.0`.\n- `B` : B factor as a string, e.g. `\"23.60\"`.\n- `alt_id` : Alternative location ID, e.g. `\"A\"`.\n- `charge` : Charge of the atom, e.g. `\"0\"`.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBFile","page":"PDB","title":"MIToS.PDB.PDBFile","text":"PDBFile <: FileFormat\n\nProtein Data Bank (PDB) format. 
It provides a standard representation for macromolecular structure data derived from X-ray diffraction and NMR studies.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBML","page":"PDB","title":"MIToS.PDB.PDBML","text":"PDBML <: FileFormat\n\nProtein Data Bank Markup Language (PDBML), a representation of PDB data in XML format.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBResidue","page":"PDB","title":"MIToS.PDB.PDBResidue","text":"A PDBResidue object contains all the information about a PDB residue. It has the following fields that you can access at any moment for query purposes:\n\n- `id` : A `PDBResidueIdentifier` object.\n- `atoms` : A vector of `PDBAtom`s.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBResidueIdentifier","page":"PDB","title":"MIToS.PDB.PDBResidueIdentifier","text":"A PDBResidueIdentifier object contains the information needed to identify PDB residues. It has the following fields that you can access at any moment for query purposes:\n\n- `PDBe_number` : It's only used when a PDBML is read (PDBe number as a string).\n- `number` : PDB residue number, it includes insertion codes, e.g. `\"34A\"`.\n- `name` : Three letter residue name in PDB, e.g. `\"LYS\"`.\n- `group` : It can be `\"ATOM\"` or `\"HETATM\"`.\n- `model` : The model number as a string, e.g. `\"1\"`.\n- `chain` : The chain as a string, e.g. `\"A\"`.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#Constants","page":"PDB","title":"Constants","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"PDB_API/#MIToS.PDB.covalentradius","page":"PDB","title":"MIToS.PDB.covalentradius","text":"Covalent radius in Å of each element from the Additional file 1 of PICCOLO (Bickerton et al.). 
Hydrogen was updated using the value on Table 2 from (Cordero et al.).\n\nReferences\n\n- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. \"Comprehensive, \n atomic-level characterization of structurally characterized protein-protein \n interactions: the PICCOLO database.\" BMC bioinformatics \n 12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)\n- [Cordero, Beatriz, et al. \"Covalent radii revisited.\" Dalton Transactions \n 21 (2008): 2832-2838.](@cite 10.1039/B801115J)\n\n\n\n\n\n","category":"constant"},{"location":"PDB_API/#MIToS.PDB.vanderwaalsradius","page":"PDB","title":"MIToS.PDB.vanderwaalsradius","text":"van der Waals radius in Å from the Additional file 1 of Bickerton et al.\n\nReferences\n\n- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. \"Comprehensive, \n atomic-level characterization of structurally characterized protein-protein \n interactions: the PICCOLO database.\" BMC bioinformatics \n 12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)\n\n\n\n\n\n","category":"constant"},{"location":"PDB_API/#Macros","page":"PDB","title":"Macros","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"PDB_API/#MIToS.PDB.@atoms-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@atoms","text":"@atoms ... model ... chain ... group ... residue ... atom ...\n\nThese return a vector of PDBAtoms with the selected subset of atoms from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the select_atoms function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#MIToS.PDB.@residues-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@residues","text":"@residues ... model ... chain ... group ... 
residue ...\n\nThese return a new vector with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the select_residues function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#MIToS.PDB.@residuesdict-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@residuesdict","text":"@residuesdict ... model ... chain ... group ... residue ...\n\nThis macro returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the residuesdict function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#Methods-and-functions","page":"PDB","title":"Methods and functions","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"PDB_API/#Base.angle-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates, MIToS.PDB.Coordinates}","page":"PDB","title":"Base.angle","text":"angle(a::Coordinates, b::Coordinates, c::Coordinates)\n\nAngle (in degrees) at b between a-b and b-c\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#Base.any-Tuple{Function, MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue, Function}","page":"PDB","title":"Base.any","text":"any(f::Function, a::PDBResidue, b::PDBResidue, criteria::Function)\n\nTest if the function f is true for any pair of atoms between the residues a and b. 
This function only tests atoms that return true for the function criteria.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#Base.any-Tuple{Function, MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"Base.any","text":"any(f::Function, a::PDBResidue, b::PDBResidue)\n\nTest if the function f is true for any pair of atoms between the residues a and b\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.CAmatrix-Tuple{AbstractVector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.CAmatrix","text":"Returns a matrix with the x, y and z coordinates of the Cα with best occupancy for each PDBResidue of the ATOM group. If a residue doesn't have a Cα, its Cα coordinates are NaNs.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.aromatic-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.aromatic","text":"There's an aromatic interaction if centroids are at 6.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.aromaticsulphur-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.aromaticsulphur","text":"Returns true if a sulphur and an aromatic atom are at 5.3 Å or less\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.bestoccupancy-Tuple{Vector{MIToS.PDB.PDBAtom}}","page":"PDB","title":"MIToS.PDB.bestoccupancy","text":"Takes a Vector of PDBAtoms and returns a Vector of the PDBAtoms with best occupancy.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.center!-Tuple{AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.center!","text":"center!(A::AbstractMatrix{Float64})\n\nTakes a set of points A as an NxD matrix (N: number of points, D: dimension). 
Translates A in place so that its centroid is at the origin of coordinates\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.centeredcoordinates","page":"PDB","title":"MIToS.PDB.centeredcoordinates","text":"Returns a Matrix{Float64} with the centered coordinates of all the atoms in residues. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.centeredresidues","page":"PDB","title":"MIToS.PDB.centeredresidues","text":"Returns a new Vector{PDBResidue} with the PDBResidues having centered coordinates. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.change_coordinates","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(residue::PDBResidue, coordinates::AbstractMatrix{Float64}, offset::Int=1)\n\nReturns a new PDBResidue with (x,y,z) from a coordinates AbstractMatrix{Float64}. You can give an offset indicating in which matrix row the (x,y,z) coordinates of the residue start.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.change_coordinates-Tuple{AbstractVector{MIToS.PDB.PDBResidue}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(residues::AbstractVector{PDBResidue}, coordinates::AbstractMatrix{Float64})\n\nReturns a new Vector{PDBResidues} with (x,y,z) from a coordinates Matrix{Float64}\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.change_coordinates-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.Coordinates}","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(atom::PDBAtom, coordinates::Coordinates)\n\nReturns a new PDBAtom but with a new 
coordinates\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.check_atoms_for_interactions-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.check_atoms_for_interactions","text":"This function takes a PDBResidue and returns true only if all the atoms can be used for checking interactions.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(a::Coordinates, b::Coordinates, limit::AbstractFloat)\n\nIt returns true if the distance is less or equal to the limit. It doesn't call sqrt because it does squared_distance(a,b) <= limit^2.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(A::PDBResidue, B::PDBResidue, limit::AbstractFloat; criteria::String=\"All\")\n\nReturns true if the residues A and B are at contact distance (limit). 
The available distance criteria are: Heavy, All, CA, CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(residues::Vector{PDBResidue}, limit::AbstractFloat; criteria::String=\"All\")\n\nIf contact takes a Vector{PDBResidue}, It returns a matrix with all the pairwise comparisons (contact map).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.coordinatesmatrix-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.coordinatesmatrix","text":"Returns a matrix with the x, y, z coordinates of each atom in each PDBResidue\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.covalent-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.covalent","text":"Returns true if the distance between atoms is less than the sum of the covalentradius of each atom.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.distance-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates}","page":"PDB","title":"MIToS.PDB.distance","text":"It calculates the squared euclidean distance.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.distance-Tuple{Vector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.distance","text":"distance(residues::Vector{PDBResidue}; criteria::String=\"All\")\n\nIf distance takes a Vector{PDBResidue} returns a PairwiseListMatrix{Float64, false} with all the pairwise comparisons (distance matrix).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.disulphide-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.disulphide","text":"Returns true if two CYS's S are at 2.08 Å or less\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.download_alphafold_structure-Union{Tuple{String}, Tuple{T}} where 
T<:MIToS.Utils.FileFormat","page":"PDB","title":"MIToS.PDB.download_alphafold_structure","text":"download_alphafold_structure(uniprot_accession::String; format::Type{T}=MMCIFFile) where T<:FileFormat\n\nThis function downloads the structure file (PDB or mmCIF) for a given UniProt Accession from AlphaFoldDB. The uniprot_accession parameter specifies the UniProt Accession of the protein, e.g. \"P00520\". The format parameter specifies the file format to download, with the default being mmCIF, i.e. MMCIFFile. You can set format to PDBFile if you want to download a PDB file.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.downloadpdb-Union{Tuple{String}, Tuple{T}} where T<:MIToS.Utils.FileFormat","page":"PDB","title":"MIToS.PDB.downloadpdb","text":"downloadpdb(pdbcode::String; format::Type{T} = MMCIFFile, filename, baseurl, kargs...)\n\nIt downloads a gzipped PDB file from PDB database. It requires a four character pdbcode. Its default format is MMCIFFile (mmCIF) and It uses the baseurl \"http://www.rcsb.org/pdb/files/\". filename is the path/name of the output file. This function calls MIToS.Utils.download_file that calls Downloads.download. 
So, you can use keyword arguments, such as headers, from that function.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.downloadpdbheader-Tuple{String}","page":"PDB","title":"MIToS.PDB.downloadpdbheader","text":"It downloads a JSON file containing the PDB header information.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findCB-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.findCB","text":"Returns a vector of indices for CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findatoms-Tuple{Vector{MIToS.PDB.PDBAtom}, String}","page":"PDB","title":"MIToS.PDB.findatoms","text":"findatoms(res::PDBResidue, atom::String)\n\nReturns a index vector of the atoms with the given atom name.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findheavy-Tuple{Vector{MIToS.PDB.PDBAtom}}","page":"PDB","title":"MIToS.PDB.findheavy","text":"Returns a list with the index of the heavy atoms (all atoms except hydrogen) in the PDBResidue\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.getCA-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.getCA","text":"Returns the Cα with best occupancy in the PDBResidue. If the PDBResidue has no Cα, missing is returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.getpdbdescription-Tuple{String}","page":"PDB","title":"MIToS.PDB.getpdbdescription","text":"Access general information about a PDB entry (e.g., Header information) using the GraphQL interface of the PDB database. It parses the JSON answer into a JSON3.Object that can be used as a dictionary.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.hydrogenbond-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.hydrogenbond","text":"This function only works if there are hydrogens in the structure. 
The criteria for a hydrogen bond are:\n\nd(Ai, Aj) < 3.9Å\nd(Ah, Aacc) < 2.5Å\nθ(Adon, Ah, Aacc) > 90°\nθ(Adon, Aacc, Aacc-antecedent) > 90°\nθ(Ah, Aacc, Aacc-antecedent) > 90°\n\nWhere Ah is the donated hydrogen atom, Adon is the hydrogen bond donor atom, Aacc is the hydrogen bond acceptor atom and Aacc-antecedent is the atom antecedent to the hydrogen bond acceptor atom.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.hydrophobic-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.hydrophobic","text":"There's a hydrophobic interaction if two hydrophobic atoms are at 5.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ionic-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.ionic","text":"There's an ionic interaction if a cationic and an anionic atom are at 6.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.is_aminoacid-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.is_aminoacid","text":"is_aminoacid(residue::PDBResidue)\nis_aminoacid(residue_id::PDBResidueIdentifier)\n\nThis function returns true if the PDB residue is an amino acid residue. It checks if the residue's three-letter name exists in the MIToS.Utils.THREE2ONE dictionary, and returns false otherwise.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isanionic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.isanionic","text":"Returns true if the atom, e.g. (\"GLU\",\"CD\"), is an anionic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isaromatic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.isaromatic","text":"Returns true if the atom, e.g. 
(\"HIS\",\"CG\"), is an aromatic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isatom-Tuple{MIToS.PDB.PDBAtom, Any}","page":"PDB","title":"MIToS.PDB.isatom","text":"It tests if the atom has the indicated atom name.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.iscationic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.iscationic","text":"Returns true if the atom, e.g. (\"ARG\",\"NE\"), is a cationic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ishbondacceptor-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.ishbondacceptor","text":"Returns true if the atom, e.g. (\"ARG\",\"O\"), is an acceptor in H bonds.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ishbonddonor-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.ishbonddonor","text":"Returns true if the atom, e.g. (\"ARG\",\"N\"), is a donor in H bonds.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isresidue-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.isresidue","text":" isresidue(res; model=All, chain=All, group=All, residue=All)\n\nThis function tests if a PDBResidue has the indicated model, chain, group and residue names/numbers. You can use the type All (default value) to avoid filtering that level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.kabsch-Tuple{AbstractMatrix{Float64}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.kabsch","text":"kabsch(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})\n\nThis function takes two sets of points, A (reference) and B as NxD matrices, where D is the dimension and N is the number of points. Assumes that the centroids of A and B are at the origin of coordinates. You can call center! on each matrix before calling kabsch to center the matrices in the (0.0, 0.0, 0.0). Rotates B so that rmsd(A,B) is minimized. 
Returns the rotation matrix. You should do B * RotationMatrix to get the rotated B.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.mean_coordinates-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:AbstractMatrix{Float64}","page":"PDB","title":"MIToS.PDB.mean_coordinates","text":"Calculates the average/mean position of each atom in a set of structure. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return a weighted mean. When a AbstractVector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.modelled_sequences-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.modelled_sequences","text":"modelled_sequences(residue_list::AbstractArray{PDBResidue,N}; \n model::Union{String,Type{All}}=All, chain::Union{String,Type{All}}=All, \n group::Union{String,Regex,Type{All}}=All) where N\n\nThis function returns an OrderedDict where each key is a named tuple (containing the model and chain identifiers), and each value is the protein sequence corresponding to the modelled residues in those chains. Therefore, the obtained sequences do not contain missing residues. All modelled residues are included by default, but those that don't satisfy specified criteria based on the model, chain, or group keyword arguments are excluded. 
One-letter residue names are obtained from the MIToS.Utils.THREE2ONE dictionary for all residue names that return true for is_aminoacid.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.pication-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.pication","text":"There's a Π-Cation interaction if a cationic and an aromatic atoms are at 6.0 Å or less\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.proximitymean-Union{Tuple{T}, Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractVector{T}}, Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractVector{T}, T}} where T<:AbstractFloat","page":"PDB","title":"MIToS.PDB.proximitymean","text":"proximitymean calculates the proximity mean/average for each residue as the average score (from a scores list) of all the residues within a certain physical distance to a given amino acid. The score of that residue is not included in the mean unless you set include to true. The default values are 6.05 for the distance threshold/limit and \"Heavy\" for the criteria keyword argument. This function allows to calculate pMI (proximity mutual information) and pC (proximity conservation) as in Buslje et al..\n\nReferences\n\nMarino Buslje, Cristina, et al. \"Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification.\" PLoS computational biology 6.11 (2010): e1000978.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.query_alphafolddb-Tuple{String}","page":"PDB","title":"MIToS.PDB.query_alphafolddb","text":"query_alphafolddb(uniprot_accession::String)\n\nThis function queries the AlphaFoldDB API to retrieve structure information for a given uniprot_accession, e.g. \"P00520\". 
This function returns the structure information as a JSON3.Object.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residuepairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{Vector{MIToS.PDB.PDBResidue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"PDB","title":"MIToS.PDB.residuepairsmatrix","text":"It creates a NamedArray containing a PairwiseListMatrix where each element (column, row) is identified with a PDBResidue from the input vector. You can indicate the value type of the matrix (default to Float64), if the list should have the diagonal values (default to Val{false}) and the diagonal values (default to NaN).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residues-Union{Tuple{N}, Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}, Vararg{Any, 4}}} where N","page":"PDB","title":"MIToS.PDB.residues","text":"The residues function for AbstractArray{PDBResidue,N} is deprecated. Use the select_residues function instead. So, residues(residue_list, model, chain, group, residue) becomes select_residues(residue_list; model=model, chain=chain, group=group, residue=residue).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residuesdict-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.residuesdict","text":" residuesdict(residue_list; model=All, chain=All, group=All, residue=All)\n\nThis function returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues. The residues are selected using the keyword arguments model, chain, group and residue. 
You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsd-Tuple{AbstractMatrix{Float64}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.rmsd","text":"rmsd(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})\n\nReturn RMSD between two sets of points A and B, given as NxD matrices (N: number of points, D: dimension).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsd-Tuple{AbstractVector{MIToS.PDB.PDBResidue}, AbstractVector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.rmsd","text":"rmsd(A::AbstractVector{PDBResidue}, B::AbstractVector{PDBResidue}; superimposed::Bool=false)\n\nReturns the Cα RMSD value between two PDB structures: A and B. If the structures are already superimposed between them, use superimposed=true to avoid a new superimposition (superimposed is false by default).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsf-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:AbstractMatrix{Float64}","page":"PDB","title":"MIToS.PDB.rmsf","text":"Calculates the RMSF (Root Mean-Square-Fluctuation) between an atom and its average position in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return the root weighted mean-square-fluctuation around the weighted mean structure. When a Vector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. 
By default only alpha carbons are used (default: calpha=true).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.select_atoms-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.select_atoms","text":"select_atoms(residue_list; model=All, chain=All, group=All, residue=All, atom=All, alt_id=All, charge=All)\n\nThis function returns a vector of PDBAtoms with the selected subset of atoms from a list of residues. The atoms are selected using the keyword arguments model, chain, group, residue, atom, alt_id, and charge. You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.select_residues-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.select_residues","text":"select_residues(residue_list; model=All, chain=All, group=All, residue=All)\n\nThis function returns a new vector with the selected subset of residues from a list of residues. You can use the keyword arguments model, chain, group and residue to select the residues. You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.selectbestoccupancy-Tuple{Vector{MIToS.PDB.PDBAtom}, Vector{Int64}}","page":"PDB","title":"MIToS.PDB.selectbestoccupancy","text":"Takes a PDBResidue and a Vector of atom indices. Returns the index value of the Vector with maximum occupancy.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.squared_distance-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom}","page":"PDB","title":"MIToS.PDB.squared_distance","text":"It calculates the squared euclidean distance, i.e. 
it doesn't spend time in sqrt\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.squared_distance-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.squared_distance","text":"squared_distance(A::PDBResidue, B::PDBResidue; criteria::String=\"All\")\n\nReturns the squared distance between the residues A and B. The available criteria are: Heavy, All, CA, CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.superimpose","page":"PDB","title":"MIToS.PDB.superimpose","text":"Asuper, Bsuper, RMSD = superimpose(A, B, matches=nothing)\n\nThis function takes A::AbstractVector{PDBResidue} (reference) and B::AbstractVector{PDBResidue}. Translates A and B to the origin of coordinates, and rotates B so that rmsd(A,B) is minimized with the Kabsch algorithm (using only their α carbons). Returns the rotated and translated versions of A and B, and the RMSD value.\n\nOptionally provide matches which iterates over matched index pairs in A and B, e.g., matches = [(3, 5), (4, 6), ...]. The alignment will be constructed using just the matching residues.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.vanderwaals-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.vanderwaals","text":"Test if two atoms or residues are in van der Waals contact using: distance(a,b) <= 0.5 + vanderwaalsradius[a] + vanderwaalsradius[b]. It returns distance <= 0.5 if the atoms aren't in vanderwaalsradius.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.vanderwaalsclash-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.vanderwaalsclash","text":"Returns true if the distance between the atoms is less than the sum of the vanderwaalsradius of the atoms. If the atoms aren't on the list (i.e. OXT), the vanderwaalsradius of the element is used. 
If there is not data in the dict, distance 0.0 is used.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{LightXML.XMLDocument, Type{MIToS.PDB.PDBML}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(pdbml, ::Type{PDBML}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)\n\nReads a LightXML.XMLDocument representing a pdb file. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select of a subset of all residues. If not set, all residues are returned. If the keyword argument label (default: true) is false,the auth_ attributes will be use instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{Union{IO, String}, Type{MIToS.PDB.MMCIFFile}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(io, ::Type{MMCIFFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)\n\nParse an mmCIF file and returns a list of PDBResidues. Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of residues. Group can be \"ATOM\" or \"HETATM\". If those keyword arguments are not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom, and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. 
If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{Union{IO, String}, Type{MIToS.PDB.PDBFile}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(io, ::Type{PDBFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, occupancyfilter=false)\n\nReads a text file of a PDB entry. Returns a list of PDBResidue (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select of a subset of all residues. Group can be \"ATOM\" or \"HETATM\". If not set, all residues are returned. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.print_file","page":"PDB","title":"MIToS.Utils.print_file","text":"print_file(io, res, format::Type{PDBFile}) print_file(res, format::Type{PDBFile})\n\nPrint a PDBResidue or a vector of PDBResidues in PDB format.\n\n\n\n\n\n","category":"function"},{"location":"References/","page":"References","title":"References","text":"@info \"References\"","category":"page"},{"location":"References/#References","page":"References","title":"References","text":"","category":"section"},{"location":"References/","page":"References","title":"References","text":"D. J. Zea, D. Anfossi, M. Nielsen and C. Marino-Buslje. MIToS. jl: mutual information tools for protein sequence analysis in the Julia language. Bioinformatics 33, 564–565 (2017).\n\n\n\nU. Hobohm, M. Scharf, R. Schneider and C. Sander. Selection of representative protein data sets. Protein Science 1, 409–417 (1992).\n\n\n\nC. M. Buslje, J. Santos, J. M. Delfino and M. Nielsen. Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information. 
Bioinformatics 25, 1125–1131 (2009).\n\n\n\nS. F. Altschul, T. L. Madden, A. A. Schäffer, J. Zhang, Z. Zhang, W. Miller and D. J. Lipman. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic acids research 25, 3389–3402 (1997).\n\n\n\nS. D. Dunn, L. M. Wahl and G. B. Gloor. Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction. Bioinformatics 24, 333–340 (2008).\n\n\n\nS. Velankar, J. M. Dana, J. Jacobsen, G. van Ginkel, P. J. Gane, J. Luo, T. J. Oldfield, C. O’Donovan, M.-J. Martin and G. J. Kleywegt. SIFTS: Structure Integration with Function, Taxonomy and Sequences resource. Nucleic Acids Research 41, D483-D489 (2012).\n\n\n\nP. Stothard. The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28, 1102–1104 (2000).\n\n\n\nB. J. Grant, A. P. Rodrigues, K. M. ElSawy, J. A. McCammon and L. S. Caves. Bio3d: an R package for the comparative analysis of protein structures. Bioinformatics 22, 2695–2696 (2006).\n\n\n\nW. Perks. Some observations on inverse probability including a new indifference rule. Journal of the Institute of Actuaries 73, 285–334 (1947).\n\n\n\nS. Trybula. Some problems of simultaneous minimax estimation. The Annals of Mathematical Statistics 29, 245–253 (1958).\n\n\n\nH. Jeffreys. An invariant form for the prior probability in estimation problems. Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186, 453–461 (1946).\n\n\n\nC. Marino Buslje, E. Teppa, T. Di Doménico, J. M. Delfino and M. Nielsen. Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification. 
PLoS computational biology 6, e1000978 (2010).\n\n\n\n","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"EditURL = \"cookbook/01_Change_B_factors.jl\"","category":"page"},{"location":"01_Change_B_factors/#Change-B-factors","page":"Change B-factors","title":"Change B-factors","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"(Image: ) (Image: )","category":"page"},{"location":"01_Change_B_factors/#Problem-description","page":"Change B-factors","title":"Problem description","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"It is a common practice to change the B-factors of a PDB to store information about atoms or residues to be used by other programs. In particular, values in the B-factor column can be easily used to colour residues with PyMOL or Chimera.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"We cannot simply assign a new value to the B field of a PDBAtom because this type is immutable. However, we can make use of the @set macro of the Setfield package to create a new PDBAtom with a different B-factor value.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"In a PDB file, B-factors are stored from the column 61 to 66. Therefore, new B-factors should be a String with 6 or fewer characters, normally using two characters for decimal values. 
We can use pyfmt and FormatSpec from the Format package to create a proper B-factor string.","category":"page"},{"location":"01_Change_B_factors/#MIToS-solution","page":"Change B-factors","title":"MIToS solution","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"For this example we are going to use the small heat shock protein AgsA from Salmonella typhimurium (PDB code: 4ZJ9) available in MIToS docs data:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using MIToS\npdbfile = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"4zj9.pdb\")\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"First, we need to read the PDB file using the MIToS.PDB module:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using MIToS.PDB\npdb_residues = read_file(pdbfile, PDBFile)\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"For this example, we are going to replace the B-factor of the alpha-carbons by the residue hydrophobicity according to the hydrophobicity scale of Kyte and Doolittle used by Chimera:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"hydrophobicity = Dict(\n \"ILE\" => 4.5,\n \"VAL\" => 4.2,\n \"LEU\" => 3.8,\n \"PHE\" => 2.8,\n \"CYS\" => 2.5,\n \"MET\" => 1.9,\n \"ALA\" => 1.8,\n \"GLY\" => -0.4,\n \"THR\" => -0.7,\n \"SER\" => -0.8,\n \"TRP\" => -0.9,\n \"TYR\" => -1.3,\n \"PRO\" => -1.6,\n \"HIS\" => -3.2,\n \"GLU\" => -3.5,\n \"GLN\" => -3.5,\n \"ASP\" => -3.5,\n \"ASN\" => -3.5,\n \"LYS\" => -3.9,\n \"ARG\" => -4.5,\n)\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change 
B-factors","title":"Change B-factors","text":"First, we define a helper function using Format to create a proper B-factor string with the PDB format; 6 characters and 2 digits after the decimal point. The PDB format description describe this field as:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"COLUMNS DATA TYPE FIELD DEFINITION\n------------------------------------------------------\n61 - 66 Real(6.2) tempFactor Temperature factor.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using Format\n\n\"\"\"\nReturn value as a string with the B factor format described in PDB. # e.g. 1.5 -> \" 1.50\"\n\"\"\"\nformat_b_factor(value) = pyfmt(FormatSpec(\"6.2f\"), value) # e.g. 1.5 -> \" 1.50\"\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Then, where are using that helper function to define a function that returns a new PDBAtom by changing the B factor field using the Setfield package.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using Setfield\n\n\"\"\"\nReturn a new PDBAtom with the B-factor changed to value.\n\"\"\"\nfunction change_b_factor(atom::PDBAtom, value)\n b_factor_string = format_b_factor(value)\n b_factor_string = strip(b_factor_string) # e.g. 
\" 1.50\" -> \"1.50\"\n if length(b_factor_string) > 6\n throw(ErrorException(\"$b_factor_string has more than 6 characters.\"))\n end\n @set atom.B = b_factor_string\nend\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Now, we can use the change_b_factor function to change the B-factor of each \"CA\" atom:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"for res in pdb_residues\n for i in eachindex(res.atoms)\n atom = res.atoms[i]\n if atom.atom == \"CA\"\n res.atoms[i] = change_b_factor(atom, hydrophobicity[res.id.name])\n end\n end\nend","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Finally, we can save the changed residues in a new PDB file.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"write_file(\"4zj9_hydrophobicity.pdb\", pdb_residues, PDBFile)","category":"page"},{"location":"01_Change_B_factors/#Discussion","page":"Change B-factors","title":"Discussion","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"While we have focused on changing the B-factor field of a PDBAtom, you can use the same approach to change other fields. However, if you want to change atom coordinates, it is better to use the change_coordinates function from the PDB module of MIToS.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"MIToS atoms and residues generally stores the string present in the input file without surrounding spaces. You can use the Format module to create these strings and strip to get rid of the spaces. 
You can see the PDB format description to know what is the format of the expected string or see the MIToS PDB print_file source code to get a quick idea.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"@info \"Example\"","category":"page"},{"location":"Example/#Example","page":"Example","title":"Example","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"In this simple demonstration, you will see how to calculate ZBLMIp (Z score of the corrected MIp using BLOSUM62 pseudo frequencies) for a Pfam(Image: ) MSA from the Julia REPL or using a MIToS script in the system command line.","category":"page"},{"location":"Example/#juliarepl","page":"Example","title":"MIToS in the Julia REPL","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"If you load the Pfam module from MIToS, you will get access to a set of functions that work with Pfam MSAs. In this case, we are going to use it for download a Stockholm(Image: ) MSA from the Pfam website and read it into Julia.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using MIToS.Pfam\npfam_file = downloadpfam(\"PF10660\")\nmsa = read_file(pfam_file, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"note: Generation of sequence and column mappings\nThe keyword argument generatemapping of read_file allows to generate sequence and column mappings for the MSA. Column mapping is the map between of each column on the MSA object and the column number in the file. 
Sequence mappings will use the start and end coordinates in the sequence ids for enumerate each residue in the sequence if useidcoordinates is true.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"You can plot this MSA and other MIToS’ objects using the Plots(Image: ) package. The installation of Plots is described in the Installation section of this site:","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using Plots\nplot(msa)\npng(\"msa.png\") # hide\nnothing # hide","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"(Image: )","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"The Information module of MIToS has functions to calculate measures from the Information Theory(Image: ), such as Shannon Entropy and Mutual Information (MI), on a MSA. In this example, we will estimate covariation between columns of the MSA with a corrected MI that use the BLOSUM62 matrix for calculate pseudo frequencies (BLMI).","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using MIToS.Information\nZBLMIp, BLMIp = BLMI(msa)\nZBLMIp # shows ZBLMIp scores","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"Once the Plots package is installed and loaded, you can use its capabilities to visualize this results:","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"heatmap(ZBLMIp, yflip = true, c = :grays)\npng(\"blmi.png\") # hide\nnothing # hide","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"(Image: )","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"rm(pfam_file) # clean up","category":"page"},{"location":"Example/#commandline","page":"Example","title":"MIToS in system command 
line","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"Calculate ZBLMIp on the system shell is easy using the script called BLMI.jl in the MIToS_Scripts.jl(Image: ) package. This script reads a MSA file, and writes a file with the same base name of the input but with the .BLMI.csv extension.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"julia BLMI.jl PF14972.stockholm.gz","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB docs\"","category":"page"},{"location":"PDB/#Module-PDB","page":"PDB","title":"PDB","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The module PDB defines types and methods to work with protein structures inside Julia. It is useful to link structural and sequential information, and needed for measure the predictive performance at protein contact prediction of mutual information scores.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB # to load the PDB module","category":"page"},{"location":"PDB/#Features","page":"PDB","title":"Features","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Read and parse mmCIF, PDB, and PDBML files.\nDownload structures from the PDB and AlphaFold databases.\nCalculate distance and contacts between atoms or residues.\nDetermine interaction between residues.","category":"page"},{"location":"PDB/#Contents","page":"PDB","title":"Contents","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Pages = [\"PDB.md\"]\nDepth = 4","category":"page"},{"location":"PDB/#Retrieve-information-from-PDB-database","page":"PDB","title":"Retrieve information from PDB database","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This module exports the downloadpdb function, to retrieve a PDB file from PDB database(Image: ). 
By default, this function downloads a gzipped mmCIF file (format=MMCIFFile), which could be easily read by MIToS. You are able to determine the format as PDBFile if you want to download a PDB file instead.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"PDB module also exports a getpdbdescription to access the header information of a PDB entry.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"getpdbdescription(\"1IVO\")","category":"page"},{"location":"PDB/#Retrieve-information-from-AlphaFold-database","page":"PDB","title":"Retrieve information from AlphaFold database","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This module provides functions to download and query protein structures from AlphaFold DB.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The download_alphafold_structure function downloads the structure file, in mmCIF format by default, for a given UniProt Accession ID. You can set format to PDBFile to download a PDB file instead.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\n# Get the structure for the human insulin\nfile = download_alphafold_structure(\"P01308\")","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"If you need more information about that entry, you can use the query_alphafolddb function. The query_alphafolddb function returns an JSON3.Object that works like a dictionary.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"json_result = query_alphafolddb(\"P01308\")","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"You can access the information in the JSON3.Object using the keys. 
For example, to get the URL to the PAE matrix image:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"pae_image_url = json_result[\"paeImageUrl\"]","category":"page"},{"location":"PDB/#Read-and-parse-PDB-files","page":"PDB","title":"Read and parse PDB files","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This is easy using the read_file and parse_file functions, indicating the filename and the FileFormat: PDBML for PDB XML files or PDBFile for usual PDB files. These functions returns a Vector of PDBResidue objects with all the residues in the PDB. To return only a specific subset of residues/atoms you can use any of the following keyword arguments:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"keyword arguments default returns only ...\nchain All residues from a PDB chain, i.e. \"A\"\nmodel All residues from a determined model, i.e. \"1\"\ngroup All residues from a group: \"ATOM\", \"HETATM\" or All for both\natomname All atoms with a specific name, i.e. \"CA\"\nonlyheavy false heavy atoms (not hydrogens) if it's true\noccupancyfilter false only the atoms with the best occupancy are returned if it's true","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"note: Note\nFor PDBML files it is possible to use the keyword argument label to false (default to true) to get the auth_ attributes instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Read α carbon of each residue from the 1ivo pdb file, in the model 1, chain A and in the ATOM group.\nCA_1ivo =\n read_file(pdbfile, PDBFile, model = \"1\", chain = \"A\", group = \"ATOM\", atomname = \"CA\")\n\nCA_1ivo[1] # First residue. 
It has only the α carbon.","category":"page"},{"location":"PDB/#Looking-for-particular-residues","page":"PDB","title":"Looking for particular residues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"MIToS parse PDB files to vector of residues, instead of using a hierarchical structure like other packages. This approach makes the search and selection of residues or atoms a little different. To make it easy, this module exports the select_residues and select_atoms functions. Given the fact that residue numbers from different chains, models, etc. can collide, we can indicate the model, chain, group, residue number and atom name using the keyword arguments of those functions. If you want to select all the residues in one of the categories, you are able to use the type All (this is the default value of such arguments). You can also use regular expressions or functions to make the selections.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)\nresidues_1ivo = read_file(pdbfile, PDBFile)\n# Select residue number 9 from model 1 and chain B (it looks in both ATOM and HETATM groups)\nselect_residues(residues_1ivo, model = \"1\", chain = \"B\", residue = \"9\")","category":"page"},{"location":"PDB/#Getting-a-Dict-of-PDBResidues","page":"PDB","title":"Getting a Dict of PDBResidues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"If you prefer a Dict of PDBResidue, indexed by their residue numbers, you can use the residuedict function.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Dict of residues from the model 1, chain A and from the ATOM group\nchain_a = residuesdict(residues_1ivo, model = \"1\", chain = \"A\", group = \"ATOM\")\nchain_a[\"9\"]","category":"page"},{"location":"PDB/#Select-particular-residues","page":"PDB","title":"Select particular 
residues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Use the select_residues function to collect specific residues. It's possible to use a single residue number (i.e. \"2\") or even a function which should return true for the selected residue numbers. Also regular expressions can be used to select residues. Use All to select all the residues.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"residue_list = map(string, 2:5)\n\n# If the list is large, you can use a `Set` to gain performance\n# residue_set = Set(map(string, 2:5))","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"first_res = select_residues(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = resnum -> resnum in residue_list,\n)\n\nfor res in first_res\n println(res.id.name, \" \", res.id.number)\nend","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"A more complex example using an anonymous function:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Select all the residues of the model 1, chain A of the ATOM group with residue number less than 5\n\nfirst_res = select_residues(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = x -> parse(Int, match(r\"^(\\d+)\", x)[1]) <= 5,\n)\n# The anonymous function takes the residue number (string) and use a regular expression\n# to extract the number (without insertion code).\n# It converts the number to `Int` to test if the it is `<= 5`.\n\nfor res in first_res\n println(res.id.name, \" \", res.id.number)\nend","category":"page"},{"location":"PDB/#Select-particular-atoms","page":"PDB","title":"Select particular atoms","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The select_atoms function allow to select a particular set of atoms.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Select all the 
atoms with name starting with \"C\" using a regular expression\n# from all the residues of the model 1, chain A of the ATOM group\n\ncarbons = select_atoms(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = All,\n atom = r\"C.+\",\n)\n\ncarbons[1]","category":"page"},{"location":"PDB/#Protein-contact-map","page":"PDB","title":"Protein contact map","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The PDB module offers a number of functions to measure distances between atoms or residues, to detect possible interactions or contacts. In particular the contact function calls the distance function using a threshold or limit in an optimized way. The measure can be done between alpha carbons (\"CA\"), beta carbons (\"CB\") (alpha carbon for glycine), any heavy atom (\"Heavy\") or any (\"All\") atom of the residues.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"In the following example, whe are going to plot a contact map for the 1ivo chain A. 
Two residues will be considered in contact if their β carbons (α carbon for glycine) have a distance of 8Å or less.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)\n\nresidues_1ivo = read_file(pdbfile, PDBFile)\n\npdb = select_residues(residues_1ivo, model = \"1\", chain = \"A\", group = \"ATOM\")\n\ndmap = distance(pdb, criteria = \"All\") # Minimum distance between residues using all their atoms","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Use the contact function to get a contact map:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"cmap = contact(pdb, 8.0, criteria = \"CB\") # Contact map","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB: Cmap\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using Plots\ngr()\n\nheatmap(dmap, grid = false, yflip = true, ratio = :equal)\n\npng(\"pdb_dmap.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"heatmap(cmap, grid = false, yflip = true, ratio = :equal)\n\npng(\"pdb_cmap.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/#Structural-superposition","page":"PDB","title":"Structural superposition","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB: RMSD\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"2HHB\")\n\nres_2hhb = read_file(pdbfile, MMCIFFile)\n\nchain_A = select_residues(res_2hhb, model = \"1\", chain = \"A\", group = \"ATOM\", residue = All)\nchain_C = 
select_residues(res_2hhb, model = \"1\", chain = \"C\", group = \"ATOM\", residue = All)\n\nusing Plots\ngr()\n\nscatter3d(chain_A, label = \"A\", alpha = 0.5)\nscatter3d!(chain_C, label = \"C\", alpha = 0.5)\n\npng(\"pdb_unaligned.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"superimposed_A, superimposed_C, RMSD = superimpose(chain_A, chain_C)\n\nRMSD","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"scatter3d(superimposed_A, label = \"A\", alpha = 0.5)\nscatter3d!(superimposed_C, label = \"C\", alpha = 0.5)\npng(\"pdb_aligned.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"@info \"Pfam docs\"","category":"page"},{"location":"Pfam/#Module-Pfam","page":"Pfam","title":"Pfam","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"MIToS defines methods and types useful for any MSA. The Pfam module uses other MIToS modules in the context of Pfam MSAs, where it’s possible to us determine how structure and sequence information should be mapped. 
This module defines functions that go from a Pfam MSA to the protein contact prediction performance of pairwise scores estimated from that MSA.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.Pfam # to load the Pfam module","category":"page"},{"location":"Pfam/#Features","page":"Pfam","title":"Features","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Download and read Pfam MSAs.\nObtain PDB information from alignment annotations.\nMap between sequence/alignment residues/columns and PDB structures.\nMeasure of AUC (ROC curve) for protein contact prediction of MI scores.","category":"page"},{"location":"Pfam/#Contents","page":"Pfam","title":"Contents","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Pages = [\"Pfam.md\"]\nDepth = 4","category":"page"},{"location":"Pfam/#Getting-a-Pfam-MSA","page":"Pfam","title":"Getting a Pfam MSA","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The function downloadpfam takes a Pfam accession and downloads a Pfam MSA in Stockholm format. In that way, you can do","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"pfamfile = downloadpfam(\"PF18883\")","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"to get the MSA. But, we are going to use an already downloaded file in this case:","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS\npfamfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"PF18883.stockholm.gz\");","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Use read_file function and the Stockholm FileFormat to get a AnnotatedMultipleSequenceAlignment object with the MSA and its Pfam annotations. You must set generatemapping and useidcoordinates to true the first time you read the downloaded MSA. 
This is necessary to some of the methods in the Pfam module.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"msa = read_file(pfamfile, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"Pfam/#Getting-PDB-information-from-an-MSA","page":"Pfam","title":"Getting PDB information from an MSA","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The function getseq2pdb parses the MSA annotations to return a Dict from the sequence identifier in the MSA to PDB and chain codes.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"getseq2pdb(msa)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Once you know the association between PDB chains and sequences, you can use that information together with the msacolumn2pdbresidue function to get the PDB residue number that correspond to each MSA column for given a determined sequence and PDB chain. 
That function downloads information from SIFTS to generate the mapping.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"col2res = msacolumn2pdbresidue(msa, \"ICSA_SHIFL/611-720\", \"3ML3\", \"A\")","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The returned dictionary can be used to get the PDB residue associated to each column (using the msaresidues function)...","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.PDB\npdbfile = downloadpdb(\"3ML3\")\npdb = read_file(pdbfile, MMCIFFile)\nresdict = residuesdict(pdb, model = \"1\", chain = \"A\", group = \"ATOM\")\n\nmsaresidues(msa, resdict, col2res)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"...or to delete the columns without PDB residues (using the hasresidues function):","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.MSA\nfiltercolumns!(msa, hasresidues(msa, col2res))","category":"page"},{"location":"Pfam/#PDB-contacts-and-AUC","page":"Pfam","title":"PDB contacts and AUC","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The Dict between MSA columns and PDB residue number also can be used to generate a protein contact map associated to the MSA.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"cmap = msacontacts(msa, resdict, col2res)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"That protein contact map can be used to calculate the Area Under the ROC Curve for a given score with the AUC function.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.Information\nZMIp, MIp = buslje09(msa)\n\nusing ROCAnalysis # You need to load ROCAnalysis to use the AUC function\n\nAUC(ZMIp, cmap)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary 
information","title":"Linking structural and evolutionary information","text":"EditURL = \"cookbook/02_Linking_structural_and_evolutionary_information.jl\"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Linking-structural-and-evolutionary-information","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"(Image: ) (Image: )","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Problem-description","page":"Linking structural and evolutionary information","title":"Problem description","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"It is a very common task to map sequence to structure residue number. For example, to link structural information coming from PDB and evolutionary information calculated from multiple sequence alignments.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The naive way of mapping sequence and structure is to perform global pairwise alignment between the sequence and the PDB sequence (using the residues in ATOM). The problem with this approach is that the sequences can have missing regions and standard pairwise alignment algorithms often yield incorrect assignations around those regions (Velankar et.al. 2013 [6]). 
This is particularly important when aligning PDB sequences, that can have missing residues, and sequences coming from multiple sequence alignments, that can be incomplete or have unaligned regions (e.g. insert states).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The SIFTS (Structure Integration with Function, Taxonomy and Sequences) database solves this problem and provides residue level mapping between PDB and other databases (e.g. UniProt and Pfam).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The SIFTS module of MIToS has functions to access this residue level mapping between PDB and other databases. Also, MIToS keeps track of the residue number of each residue in a multiple sequence alignment (MSA) using its annotations. Both things together, allow the correct mapping of sequence and structure without performing error-prone pairwise alignments.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Particular solutions depend on problem details, here we show some common ways to use MIToS and SIFTS to map evolutionary information calculated in an MSA (e.g. Shannon entropy) with structural information (e.g. 
B-factors).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#PDB-and-Pfam-alignment-mapping","page":"Linking structural and evolutionary information","title":"PDB and Pfam alignment mapping","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This is the easiest problem to solve with the MIToS Pfam module because SIFTS already has a residue level mapping between PDB and Pfam.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"For this example, we are going to map the columns in the multiple sequence alignment of the PF09645 Pfam family and the residues in the chain A from the 2VQC PDB file. The needed files are available in the MIToS test suite:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS\npdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"2VQC.pdb\")\npfam_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"PF09645_full.stockholm\")\nsifts_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"2vqc.xml.gz\")\nnothing # hide","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"You can also use downloadpdb from MIToS.PDB, downloadpfam from MIToS.Pfam and downloadsifts from MIToS.SIFTS to get the corresponding files from those 
databases.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"It is important to read the Pfam MSA file using generatemapping=true and useidcoordinates=true because that allows keeping track of the residue number using the MSA annotations.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Pfam\nmsa = read_file(pfam_file, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"First, we need to know what is the sequence in the MSA that correspond to the PDB we want to link. Luckily, Pfam Stockholm files store the mapping between sequences and PDB chains. You can access that mapping using the getseq2pdb function from MIToS.Pfam","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seq2pdbs = getseq2pdb(msa)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The returned dictionary gives you all the PDB chains associated with a determined sequence in the MSA. But, in this case, we want to go in the other direction to find all the sequences associated with a determined PDB chain. We are going to use a list comprehension because it is possible for a single chain to be associated with more than one sequence in the Pfam MSA (e.g. 
domain repeats).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"pdb_code = \"2VQC\"\npdb_chain = \"A\"\nseq_ids = [seq for (seq, pdbs) in seq2pdbs if (pdb_code, pdb_chain) in pdbs]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"In this example, we are going to use the only sequence we found for the A of 2VQC.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seq_id = seq_ids[1]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Finally, we can use the msacolumn2pdbresidue function from the Pfam module to get a dictionary from the MSA column index to the PDB residue number:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"pfam_id = \"PF09645\"\nmsacol2pdbres = msacolumn2pdbresidue(msa, seq_id, pdb_code, pdb_chain, pfam_id, sifts_file)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This dictionary has the mapping between MSA column and PDB residue that allows the mapping between evolutionary and structural information. 
For example, to measure the correlation between entropy (related to residue variation in an MSA column) and the mean B factor of the residue:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Information\nHx = mapcolfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Int, Val{1}, UngappedAlphabet())),\n)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To get quick access to each PDB residue based on its residue number, we can read the PDB file into a dictionary using the read_file and residuesdict functions from the MIToS PDB module:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.PDB\nres_dict = residuesdict(\n read_file(pdb_file, PDBFile, occupancyfilter = true),\n model = \"1\",\n chain = \"A\",\n)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we can iterate the mapping dictionary to link the MSA and PDB based values:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using Statistics\n\nx = Float64[]\ny = Float64[]\n\nfor (col_index, res_number) in msacol2pdbres\n if res_number != \"\" # i.e. 
MSA column has an associated PDB residue\n push!(x, Hx[col_index])\n push!(y, mean(parse(Float64, atom.B) for atom in res_dict[res_number].atoms))\n end\nend\n\ncor(x, y)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Unknown-sequence-coordinates","page":"Linking structural and evolutionary information","title":"Unknown sequence coordinates","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"While Pfam alignments have the start and end of the aligned region indicated in the sequence name, other multiple sequence alignments don't give any hint about that. In those cases, we should use pairwise alignments. However, instead of aligning the sequence coming from the MSA and the PDB sequence, we can align the MSA sequence to the UniProt sequence to reduce the possibility of mapping errors. 
Once we have the mapping of the MSA sequence to the UniProt sequence, we can use SIFTS to map the PDB sequence to the MSA sequence using the UniProt numeration.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"For this example, we are going to use the following files included in MIToS documentation:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS\npdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"1dur.pdb\")\nmsa_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"blast_alignment.fa\")\nsifts_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"1dur.xml.gz\")\nuniprot_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"P00193.fasta\")\nnothing # hide","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"First, we are going to read the MSA file. In this case, we can not use useidcoordinates=true because the sequence names don't have the sequence coordinates in the Pfam format. 
However, we are going to use generatemapping=true to get the default mapping for each sequence in the alignment (from 1 to the length of the aligned region):","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.MSA\nmsa = read_file(msa_file, FASTA, generatemapping = true)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"After that, we get the first sequence of the MSA, the one we know that corresponds to the PDB of interest. We need the sequence as a String without gaps (unaligned), so we use the MIToS.MSA stringsequence function together with replace:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"msa_seq = replace(stringsequence(msa, 1), '-' => \"\")","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Also, we are going to read the UniProt sequence. 
You can easily download the sequence from UniProt by doing:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Utils\ndownload_file(\"https://www.uniprot.org/uniprot/P00193.fasta\", \"P00193.fasta\")","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To read the FASTA file we are going to use the FastaIO package:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using FastaIO\nuniprot_sequences = readfasta(uniprot_file)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"And get the unique sequence:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"uniprot_seq = uniprot_sequences[1][2]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"We can perform a pairwise sequence alignment between both sequences by using the BioAlignments package from the BioJulia suite. 
In this case, we use a semi-global alignment (no start/end gap penalty) because we know that the MSA sequence is a region of the UniProt sequence.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using BioAlignments\ncostmodel = AffineGapScoreModel(BLOSUM62, gap_open = -10, gap_extend = -1)\naln = pairalign(SemiGlobalAlignment(), msa_seq, uniprot_seq, costmodel)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we only need to iterate the alignment to designate the positions and store the equivalences in a dictionary:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"function seq2refnumber(aln)\n seq_pos = 0\n ref_pos = 0\n last_seq_pos = 0\n seq2ref = Dict{Int,Int}()\n for (seq_res, ref_res) in alignment(aln)\n if seq_res != '-'\n seq_pos += 1\n end\n if ref_res != '-'\n ref_pos += 1\n end\n if seq_pos != last_seq_pos\n seq2ref[seq_pos] = ref_pos\n last_seq_pos = seq_pos\n end\n end\n seq2ref\nend\n\nseqnum2uniprotnum = seq2refnumber(aln)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we can use getsequencemapping to go from MSA column number to UniProt residue, and siftsmapping to go from UniProt to PDB:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seqmap = 
getsequencemapping(msa, 1)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"colnum2uniprotnum = Dict{Int,Int}()\nfor (colnum, seqnum) in enumerate(seqmap)\n if seqnum != 0 # getsequencemapping returns 0 where there is a gap\n colnum2uniprotnum[colnum] = seqnum2uniprotnum[seqnum]\n end\nend\ncolnum2uniprotnum","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.SIFTS\n\nuniprotnum2pdbnum = siftsmapping(\n sifts_file,\n dbUniProt,\n \"P00193\",\n dbPDB,\n \"1dur\", # SIFTS stores PDB identifiers in lowercase\n chain = \"A\",\n missings = false,\n) # residues without coordinates aren't used in the mapping","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To finally get the dictionary from MSA column index to PDB residue number","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"colnum2pdbnum = Dict{Int,String}()\nfor (colnum, uniprotnum) in colnum2uniprotnum\n pdbresnum = get(uniprotnum2pdbnum, string(uniprotnum), \"\")\n if 
pdbresnum != \"\"\n colnum2pdbnum[colnum] = pdbresnum\n end\nend\n\ncolnum2pdbnum","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Information_API/","page":"Information","title":"Information","text":"@info \"Information API docs\"","category":"page"},{"location":"Information_API/#Information","page":"Information","title":"Information","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"MIToS.Information","category":"page"},{"location":"Information_API/#MIToS.Information","page":"Information","title":"MIToS.Information","text":"The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from this counts. Also, includes methods for applying corrections to that tables, e.g. pseudocounts and pseudo frequencies. 
Finally, Information allows to use this probabilities and counts to estimate information measures and other frequency based values.\n\nFeatures\n\nEstimate multi dimensional frequencies and probabilities tables from sequences, MSAs, etc...\nCorrection for small number of observations\nCorrection for data redundancy on a MSA\nEstimate information measures\nCalculate corrected mutual information between residues\n\nusing MIToS.Information\n\n\n\n\n\n","category":"module"},{"location":"Information_API/#Contents","page":"Information","title":"Contents","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Pages = [\"Information_API.md\"]\nDepth = 2","category":"page"},{"location":"Information_API/#Types","page":"Information","title":"Types","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Information_API/#MIToS.Information.AdditiveSmoothing","page":"Information","title":"MIToS.Information.AdditiveSmoothing","text":"Additive Smoothing or fixed pseudocount λ for ResidueCount (in order to estimate probabilities when the number of samples is low).\n\nCommon values of λ are:\n\n0 : No cell frequency prior, gives you the maximum likelihood estimator.\n0.05 is the optimum value for λ found in Buslje et al. 2009, similar results was obtained for λ in the range [0.025, 0.075].\n1 / p : Perks prior (Perks, 1947) where p the number of parameters (i.e. residues, pairs of residues) to estimate. If p is the number of residues (20 without counting gaps), this gives you 0.05.\nsqrt(n) / p : Minimax prior (Trybula, 1958) where n is the number of samples and p the number of parameters to estimate. If the number of samples n is 400 (minimum number of sequence clusters for achieve good performance in Buslje et al. 
2009) for estimating 400 parameters (pairs of residues without counting gaps) this gives you 0.05.\n0.5 : Jeffreys prior (Jeffreys, 1946).\n1 : Bayes-Laplace uniform prior, aka. Laplace smoothing.\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\nPerks, Wilfred. \"Some observations on inverse probability including a new indifference rule.\" Journal of the Institute of Actuaries 73.2 (1947): 285-334.\nTrybula, Stanislaw. \"Some problems of simultaneous minimax estimation.\" The Annals of Mathematical Statistics 29.1 (1958): 245-253.\nJeffreys, Harold. \"An invariant form for the prior probability in estimation problems.\" Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186.1007 (1946): 453-461.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.BLOSUM_Pseudofrequencies","page":"Information","title":"MIToS.Information.BLOSUM_Pseudofrequencies","text":"BLOSUM_Pseudofrequencies type. It takes to arguments/fields:\n\nα : Usually the number of sequences or sequence clusters in the MSA.\nβ : The weight of the pseudofrequencies, a value close to 8.512 when α is the number of sequence clusters.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.ContingencyTable","page":"Information","title":"MIToS.Information.ContingencyTable","text":"A ContingencyTable is a multidimensional array. It stores the contingency matrix, its marginal values and total. The type also has an internal and private temporal array and an alphabet object. 
It's a parametric type, taking three ordered parameters:\n\nT : The element type of the multidimensional array.\nN : It's the dimension of the array and should be an Int.\nA : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.\n\nA ContingencyTable can be created from an alphabet if all the parameters are given. Otherwise, you need to give a type, a number (Val) and an alphabet. You can also create a ContingencyTable using a matrix and a alphabet. For example:\n\nContingencyTable{Float64,2,UngappedAlphabet}(UngappedAlphabet())\nContingencyTable(Float64, Val{2}, UngappedAlphabet())\nContingencyTable(zeros(Float64, 20, 20), UngappedAlphabet())\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Frequencies","page":"Information","title":"MIToS.Information.Frequencies","text":"A Frequencies object wraps a ContingencyTable storing counts/frequencies.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.NoPseudocount","page":"Information","title":"MIToS.Information.NoPseudocount","text":"You can use NoPseudocount() to avoid pseudocount corrections where a Pseudocount type is needed.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.NoPseudofrequencies","page":"Information","title":"MIToS.Information.NoPseudofrequencies","text":"You can use NoPseudofrequencies() to avoid pseudocount corrections where a Pseudofrequencies type is needed.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Probabilities","page":"Information","title":"MIToS.Information.Probabilities","text":"A Probabilities object wraps a ContingencyTable storing probabilities. It doesn't perform any check. 
If the total isn't one, you must use normalize or normalize!on the ContingencyTable before wrapping it to make the sum of the probabilities equal to one.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Pseudocount","page":"Information","title":"MIToS.Information.Pseudocount","text":"Parametric abstract type to define pseudocount types\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Pseudofrequencies","page":"Information","title":"MIToS.Information.Pseudofrequencies","text":"Parametric abstract type to define pseudofrequencies types\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#Constants","page":"Information","title":"Constants","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Information_API/#MIToS.Information.BLOSUM62_Pi","page":"Information","title":"MIToS.Information.BLOSUM62_Pi","text":"BLOSUM62 probabilities P(aa) for each residue on the UngappedAlphabet. SUM: 0.9987\n\n\n\n\n\n","category":"constant"},{"location":"Information_API/#MIToS.Information.BLOSUM62_Pij","page":"Information","title":"MIToS.Information.BLOSUM62_Pij","text":"Table with conditional probabilities of residues based on BLOSUM62. The normalization is done row based. 
The first row contains the P(aa|A) and so on.\n\n\n\n\n\n","category":"constant"},{"location":"Information_API/#Macros","page":"Information","title":"Macros","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Information_API/#Methods-and-functions","page":"Information","title":"Methods and functions","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"Information_API/#Base.count!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Any, MIToS.Information.Pseudocount, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"Base.count!","text":"It populates a ContingencyTable (first argument) using the frequencies in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as second and third positional arguments respectively. You can use NoClustering() and NoPseudocount() to avoid the use of sequence weighting and pseudocounts, respectively.\n\nDEPRECATED: Use frequencies! instead. Note that frequencies! defines the weights and pseudocounts using keyword arguments instead of positional arguments.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#Base.count-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"Base.count","text":"It returns a ContingencyTable wrapped in a Frequencies type with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. 
You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()) and the pseudocounts (default to NoPseudocount()) to be used during the estimation of the frequencies.\n\nDEPRECATED: Use frequencies instead. Note that frequencies defines the alphabet, weights and pseudocounts using keyword arguments instead of positional arguments.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#LinearAlgebra.normalize!-Union{Tuple{MIToS.Information.ContingencyTable{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"LinearAlgebra.normalize!","text":"normalize! makes the sum of the frequencies to be one, in place.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#LinearAlgebra.normalize-Union{Tuple{MIToS.Information.ContingencyTable{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"LinearAlgebra.normalize","text":"normalize returns another table where the sum of the frequencies is one.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.APC!-Union{Tuple{Matrix{T}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.APC!","text":"APC\n\nReferences\n\nDunn, Stanley D., Lindi M. Wahl, and Gregory B. Gloor. \"Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction.\" Bioinformatics 24.3 (2008): 333-340.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.BLMI-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.BLMI","text":"BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described on Buslje et al. 
2009 but using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.\n\nKeyword argument, type, default value and descriptions:\n\n - beta Float64 8.512 β for BLOSUM62 pseudo frequencies\n - lambda Float64 0.0 Low count value\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 50 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n\nThis function returns:\n\n - Z score (ZBLMI)\n - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.apply_pseudocount!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, T}} where {T, N, A}","page":"Information","title":"MIToS.Information.apply_pseudocount!","text":"It adds the pseudocount value to the table cells.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.apply_pseudofrequencies!-Union{Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, 2, MIToS.MSA.UngappedAlphabet}, MIToS.Information.BLOSUM_Pseudofrequencies}} where T","page":"Information","title":"MIToS.Information.apply_pseudofrequencies!","text":"apply_pseudofrequencies!{T}(Pab::ContingencyTable{T,2,UngappedAlphabet}, pseudofrequencies::BLOSUM_Pseudofrequencies)\n\nWhen a BLOSUM_Pseudofrequencies(α,β) is used, this function applies pseudofrequencies Gab over Pab, as a weighted mean of both. It uses the conditional probability matrix BLOSUM62_Pij and the real frequencies/probabilities Pab to estimate the pseudofrequencies Gab. 
α is the weight of the real frequencies Pab and β the weight of the pseudofrequencies.\n\nGab = Σcd Pcd ⋅ BLOSUM62( a | c ) ⋅ BLOSUM62( b | d ) Pab = (α ⋅ Pab + β ⋅ Gab )/(α + β)\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.buslje09-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.buslje09","text":"buslje09 takes a MSA and calculates a Z score and a corrected MI/MIp as described on Busjle et al. 2009.\n\nkeyword argument, type, default value and descriptions:\n\n - lambda Float64 0.05 Low count value\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for clustering\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 100 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n - alphabet ResidueAlphabet UngappedAlphabet() Residue alphabet to be used\n\nThis function returns:\n\n - Z score\n - MI or MIp\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.cumulative-Union{Tuple{VT}, Tuple{D}, Tuple{T}, Tuple{PairwiseListMatrices.PairwiseListMatrix{T, D, VT}, T}} where {T, D, VT}","page":"Information","title":"MIToS.Information.cumulative","text":"cumulative allows to calculate cumulative scores (i.e. cMI) as defined in Marino Buslje et al. 2010:\n\n\"We calculated a cumulative mutual information score (cMI) for each residue as the sum of MI values above a certain threshold for every amino acid pair where the particular residue appears. This value defines to what degree a given amino acid takes part in a mutual information network.\"\n\nReferences\n\nMarino Buslje, Cristina, et al. 
\"Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification.\" PLoS computational biology 6.11 (2010): e1000978.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.delete_dimensions!-Union{Tuple{A}, Tuple{S}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, S, A}, MIToS.Information.ContingencyTable{T, N, A}, Vararg{Int64}}} where {T, N, S, A}","page":"Information","title":"MIToS.Information.delete_dimensions!","text":"delete_dimensions!(out::ContingencyTable, in::ContingencyTable, dimensions::Int...)\n\nThis function fills a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.delete_dimensions-Union{Tuple{I}, Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Vararg{Int64, I}}} where {T, N, A, I}","page":"Information","title":"MIToS.Information.delete_dimensions","text":"delete_dimensions(in::ContingencyTable, dimensions::Int...)\n\nThis function creates a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.frequencies!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"MIToS.Information.frequencies!","text":"frequencies!(table, seqs...; weights::WeightTypes, pseudocounts::Pseudocount)\n\nIt populates a ContingencyTable or Frequencies table (first argument) using the frequencies in the given sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. 
You must indicate the used weights and pseudocounts as keyword arguments. Those arguments default to NoClustering() and NoPseudocount() respectively, to avoid the use of sequence weighting and pseudocounts.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.frequencies-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"MIToS.Information.frequencies","text":"frequencies(seqs...; alphabet=UngappedAlphabet(), weights=NoClustering(), pseudocounts=NoPseudocount()\n\nThis function returns a Frequencies object wrapping a ContingencyTable with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table, a clustering result and the pseudocounts to be used during the estimation of the frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gap_intersection_percentage-Union{Tuple{MIToS.Information.Frequencies{T, 2, MIToS.MSA.GappedAlphabet}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.gap_intersection_percentage","text":"It calculates the gap intersection as percentage from a table of Frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gap_union_percentage-Union{Tuple{MIToS.Information.Frequencies{T, 2, MIToS.MSA.GappedAlphabet}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.gap_union_percentage","text":"It calculates the gap union as percentage from a table of Frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gaussdca-Tuple{Any}","page":"Information","title":"MIToS.Information.gaussdca","text":"Wrapper function to GaussDCA.gDCA. 
You need to install GaussDCA:\n\nusing Pkg\n\nPkg.add(PackageSpec(url = \"https://github.com/carlobaldassi/GaussDCA.jl\", rev = \"master\"))\n\nLook into GaussDCA.jl README for further information. If you use this wrapper, please cite the GaussDCA publication and the package's doi.\n\nIt's possible to indicate the path to the julia binary where GaussDCA is installed. However, it's recommended to use the same version where MIToS is installed. That is because this function use serialize/deserialize to transfer data between the processes.\n\nGaussDCA Publication: Baldassi, Carlo, Marco Zamparo, Christoph Feinauer, Andrea Procaccini, Riccardo Zecchina, Martin Weigt, and Andrea Pagnani. \"Fast and accurate multivariate Gaussian modeling of protein families: predicting residue contacts and protein-interaction partners.\" PloS one 9, no. 3 (2014): e92721.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getalphabet-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getalphabet","text":"getalphabet allows to access the stored alphabet object.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getcontingencytable-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.getcontingencytable","text":"getcontingencytable allows to access the wrapped ContingencyTable in a Probabilities or Frequencies object.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getmarginals-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getmarginals","text":"getmarginals allows to access the array with the marginal values 
(NamedArray).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getmarginalsarray-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getmarginalsarray","text":"getmarginalsarray allows to access the array with the marginal values (Array without names).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettable-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettable","text":"gettable allows to access the table (NamedArray).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettablearray-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettablearray","text":"gettablearray allows to access the table (Array without names).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettotal-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettotal","text":"gettotal allows to access the stored total value.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.kullback_leibler-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.kullback_leibler","text":"kullback_leibler(msa::AbstractArray{Residue}; background::Union{Array{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, base::Number=ℯ, kargs...)\n\nIt calculates the Kullback-Leibler (KL) divergence from a multiple sequence alignment (MSA). You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. 
You can use base = 2 to get the result in bits.\n\nThe other keyword arguments are passed to the mapfreq function.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.kullback_leibler-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T<:Number, N, A<:MIToS.MSA.ResidueAlphabet}","page":"Information","title":"MIToS.Information.kullback_leibler","text":"kullback_leibler(probabilities::Probabilities{T,N,A}, background::Union{\n AbstractArray{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, \n base::Number=ℯ)\n\nIt calculates the Kullback-Leibler (KL) divergence from a table of Probabilities. You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapcolfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 1, A}, MIToS.Information.Probabilities{T, 1, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapcolfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). 
The table is filled in place with the counts or probabilities of each column from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapcolpairfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 2, A}, MIToS.Information.Probabilities{T, 2, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapcolpairfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of columns from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true): If true, the function will be also applied to the diagonal elements.\ndiagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapfreq-Tuple{Function, AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.mapfreq","text":"mapfreq(f, msa; rank = 1, dims = 2, alphabet = UngappedAlphabet(), \n weights = NoClustering(), pseudocounts = NoPseudocount(), \n pseudofrequencies = NoPseudofrequencies(), probabilities = true, \n usediagonal = false, diagonalvalue = NaN, kargs...)\n\nIt efficiently map a function (first argument) that takes a table of Frequencies 
or Probabilities (depending on the probabilities keyword argument) calculated on sequences (dims = 1) or columns (dims = 2, the default) of an msa (second argument). If rank = 1, the default, the function is applied to each sequence or column. If rank = 2, the function is applied to each pair of sequences or columns. In that case, we can set the usediagonal keyword argument to true to apply the function to pairs of the same sequence or column. The diagonalvalue keyword argument is used to set the value of the diagonal elements if usediagonal is false. By default, the function is not applied to the diagonal elements (i.e. usediagonal = false) and the diagonalvalue is set to NaN. The alphabet keyword argument can be used to set the alphabet used to construct the contingency table. The function also accepts the following keyword arguments:\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\nNote that the pseudofrequencies argument is only valid if probabilities = true. 
All the other keyword arguments are passed to the function f.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(1), Residue, 3, 6) # random MSA as an example\n3×6 Matrix{Residue}:\n F A F D E V\n T R R G F I\n N V S W Q T\n\njulia> mapfreq(sum, msa) # default: rank=1, dims=2, probabilities=true\n1×6 Named Matrix{Float64}\nFunction ╲ Col │ 1 2 3 4 5 6\n───────────────┼─────────────────────────────\nsum │ 1.0 1.0 1.0 1.0 1.0 1.0\n\njulia> mapfreq(sum, msa, probabilities=false)\n1×6 Named Matrix{Float64}\nFunction ╲ Col │ 1 2 3 4 5 6\n───────────────┼─────────────────────────────\nsum │ 3.0 3.0 3.0 3.0 3.0 3.0\n\njulia> mapfreq(sum, msa, dims=1)\n3×1 Named Matrix{Float64}\nSeq ╲ Function │ sum\n───────────────┼────\n1 │ 1.0\n2 │ 1.0\n3 │ 1.0\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapseqfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 1, A}, MIToS.Information.Probabilities{T, 1, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapseqfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). 
The table is filled in place with the counts or probabilities of each sequence from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapseqpairfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 2, A}, MIToS.Information.Probabilities{T, 2, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapseqpairfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of sequences from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true): If true, the function will be also applied to the diagonal elements.\ndiagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.marginal_entropy-Union{Tuple{Union{MIToS.Information.Frequencies{T, N, A}, MIToS.Information.Probabilities{T, N, A}}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.marginal_entropy","text":"marginal_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; margin::Int=1, \n base::Number=ℯ)\n\nIt calculates marginal entropy (H) from a table of Frequencies or Probabilities. 
It takes two keyword arguments: margin and base. The first one is used to indicate the margin used to calculate the entropy, e.g. it estimates the entropy H(X) if margin is 1, H(Y) for 2, etc. The default value of margin is 1. The second keyword argument is used to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(msa::AbstractArray{Residue}; base::Number=ℯ, kargs...)\n\nIt calculates Mutual Information (MI) from a multiple sequence alignment (MSA). The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. The minimum value for rank is 2 (the default value). By default, it uses counts/frequencies to calculate the MI, as it's faster. You can use the keyword argument probabilities = true to calculate the MI from probabilities.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)\n3×4 Matrix{Residue}:\n T R F K\n S H C I\n G G R V\n\njulia> mi = mutual_information(msa);\n\njulia> mi[1, 2]\n1.0986122886681098\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Union{Tuple{MIToS.Information.Probabilities{T, 2, A}}, Tuple{A}, Tuple{T}} where {T, A}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(table::Union{Frequencies{T,2,A},Probabilities{T,2,A}}; base::Number=ℯ)\n\nIt calculates Mutual Information (MI) from a table of Frequencies or Probabilities. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. 
Note that calculating MI from Frequencies is faster than from Probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Union{Tuple{Union{MIToS.Information.Frequencies{T, 3, A}, MIToS.Information.Probabilities{T, 3, A}}}, Tuple{A}, Tuple{T}} where {T, A}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(table::Union{Frequencies{T,3,A},Probabilities{T,3,A}}; base::Number=ℯ)\n\nIt calculates Mutual Information (MI) from a table of Frequencies or Probabilities with three dimensions. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)\n3×4 Matrix{Residue}:\n T R F K\n S H C I\n G G R V\n\njulia> Nxyz = frequencies(msa[:, 1], msa[:, 2], msa[:, 3]);\n\njulia> mutual_information(Nxyz)\n1.0986122886681093\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.normalized_mutual_information-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.normalized_mutual_information","text":"normalized_mutual_information(msa::AbstractArray{Residue}; kargs...)\n\nThis function calculates the Normalized Mutual Information (nMI) from a multiple sequence alignment using the mapfreq function—all the keyword arguments are passed to mapfreq. 
The mutual information score is normalized by the joint entropy of the two variables: nMI(X, Y) = MI(X, Y) / H(X, Y). By default, it uses counts/frequencies to estimate the nMI, as it's faster than using probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.normalized_mutual_information-Union{Tuple{Union{MIToS.Information.Frequencies{T, N, A}, MIToS.Information.Probabilities{T, N, A}}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.normalized_mutual_information","text":"It calculates a Normalized Mutual Information (nMI) from a table of Frequencies or Probabilities. The mutual information score is normalized by the joint entropy of the two variables: nMI(X, Y) = MI(X, Y) / H(X, Y)\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.pairwisegapfraction-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.pairwisegapfraction","text":"It takes a MSA or a file and a FileFormat as first arguments. It calculates the percentage of gaps on columns pairs (union and intersection) using sequence clustering (Hobohm I).\n\nArgument, type, default value and descriptions:\n\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n\nThis function returns:\n\n - pairwise gap union as percentage\n - pairwise gap intersection as percentage\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.probabilities!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Any, MIToS.Information.Pseudocount, MIToS.Information.Pseudofrequencies, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"MIToS.Information.probabilities!","text":"It populates a ContingencyTable (first argument) using the probabilities in the sequences (last positional arguments). 
The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights, pseudocounts and pseudofrequencies as second, third and fourth positional arguments respectively. You can use NoClustering(), NoPseudocount() and NoPseudofrequencies() to avoid the use of sequence weighting, pseudocounts and pseudofrequencies, respectively.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.probabilities-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"MIToS.Information.probabilities","text":"It returns a ContingencyTable wrapped in a Probabilities type with the probabilities of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights, pseudocounts and pseudofrequencies to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()), the pseudocounts (default to NoPseudocount()) and the pseudofrequencies (default to NoPseudofrequencies()) to be used during the estimation of the probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.shannon_entropy-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.shannon_entropy","text":"shannon_entropy(msa::AbstractArray{Residue}; base::Number=ℯ, \n probabilities::Bool=false, usediagonal::Bool=true, kargs...)\n\nIt calculates the Shannon entropy (H) on a MSA. You can use the keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. It uses mapfreq under the hood, so it takes the same keyword arguments. By default, it measures the entropy of each column in the MSA. You can use dims = 1 to measure the entropy of each sequence. 
You can also set rank = 2 to measure the joint entropy of each pair of sequences or columns. This function sets by default the probabilities keyword argument to false because it's faster to calculate the entropy from counts/frequencies. It also sets usediagonal = true to also calculate the entropy of the individual variables (sequences or columns).\n\njulia> using MIToS.MSA, MIToS.Information\n\njulia> msa = Residue['C' 'G'; 'C' 'L'; 'C' 'I']\n3×2 Matrix{Residue}:\n C G\n C L\n C I\n\njulia> shannon_entropy(msa)\n1×2 Named Matrix{Float64}\nFunction ╲ Col │ 1 2\n────────────────┼─────────────────\nshannon_entropy │ 0.0 1.09861\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.shannon_entropy-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.shannon_entropy","text":"shannon_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; base::Number=ℯ)\n\nIt calculates the Shannon entropy (H) from a table of Frequencies or Probabilities. Use last and optional positional argument to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"@info \"SIFTS API docs\"","category":"page"},{"location":"SIFTS_API/#SIFTS","page":"SIFTS","title":"SIFTS","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"MIToS.SIFTS","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS","page":"SIFTS","title":"MIToS.SIFTS","text":"The SIFTS module of MIToS allows to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes easy to assign PDB residues to UniProt/Pfam positions. 
Given the fact that pairwise alignments can lead to misleading association between residues in both sequences, SIFTS offers more reliable association between sequence and structure residue numbers.\n\nFeatures\n\nDownload and parse SIFTS XML files\nStore residue-level mapping in Julia\nEasy generation of OrderedDicts between residues numbers\n\nusing MIToS.SIFTS\n\n\n\n\n\n","category":"module"},{"location":"SIFTS_API/#Contents","page":"SIFTS","title":"Contents","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Pages = [\"SIFTS_API.md\"]\nDepth = 2","category":"page"},{"location":"SIFTS_API/#Types","page":"SIFTS","title":"Types","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS.SIFTSResidue","page":"SIFTS","title":"MIToS.SIFTS.SIFTSResidue","text":"A SIFTSResidue object stores the SIFTS residue level mapping for a residue. It has the following fields that you can access at any moment for query purposes:\n\n- `PDBe` : A `dbPDBe` object, it's present in all the `SIFTSResidue`s.\n- `UniProt` : A `dbUniProt` object or `missing`.\n- `Pfam` : A `dbPfam` object or `missing`.\n- `NCBI` : A `dbNCBI` object or `missing`.\n- `InterPro` : An array of `dbInterPro` objects.\n- `PDB` : A `dbPDB` object or `missing`.\n- `SCOP` : A `dbSCOP` object or `missing`.\n- `SCOP2` : An array of `dbSCOP2` objects.\n- `SCOP2B` : A `dbSCOP2B` object or `missing`.\n- `CATH` : A `dbCATH` object or `missing`.\n- `Ensembl` : An array of `dbEnsembl` objects.\n- `missing` : It's `true` if the residue is missing, i.e. 
not observed, in the structure.\n- `sscode` : A string with the secondary structure code of the residue.\n- `ssname` : A string with the secondary structure name of the residue.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbCATH","page":"SIFTS","title":"MIToS.SIFTS.dbCATH","text":"dbCATH stores the residue id, number, name and chain in CATH as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbEnsembl","page":"SIFTS","title":"MIToS.SIFTS.dbEnsembl","text":"dbEnsembl stores the residue (gene) accession id, the transcript, translation and exon ids in Ensembl as strings, together with the residue number and name using the UniProt coordinates.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbInterPro","page":"SIFTS","title":"MIToS.SIFTS.dbInterPro","text":"dbInterPro stores the residue id, number, name and evidence in InterPro as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbNCBI","page":"SIFTS","title":"MIToS.SIFTS.dbNCBI","text":"dbNCBI stores the residue id, number and name in NCBI as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPDB","page":"SIFTS","title":"MIToS.SIFTS.dbPDB","text":"dbPDB stores the residue id, number, name and chain in PDB as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPDBe","page":"SIFTS","title":"MIToS.SIFTS.dbPDBe","text":"dbPDBe stores the residue number and name in PDBe as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPfam","page":"SIFTS","title":"MIToS.SIFTS.dbPfam","text":"dbPfam stores the residue id, number and name in Pfam as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP","text":"dbSCOP stores the residue id, number, name and chain in SCOP as 
strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP2","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP2","text":"dbSCOP2 stores the residue id, number, name and chain in SCOP2 as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP2B","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP2B","text":"dbSCOP2B stores the residue id, number, name and chain in SCOP2B as strings. SCOP2B is expansion of SCOP2 domain annotations at superfamily level to every PDB with same UniProt accession having at least 80% SCOP2 domain coverage.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbUniProt","page":"SIFTS","title":"MIToS.SIFTS.dbUniProt","text":"dbUniProt stores the residue id, number and name in UniProt as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#Constants","page":"SIFTS","title":"Constants","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"SIFTS_API/#Macros","page":"SIFTS","title":"Macros","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"SIFTS_API/#Methods-and-functions","page":"SIFTS","title":"Methods and functions","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS.downloadsifts-Tuple{String}","page":"SIFTS","title":"MIToS.SIFTS.downloadsifts","text":"downloadsifts(pdbcode::String; filename::String, source::String=\"https\")\n\nDownload the gzipped SIFTS XML file for the provided pdbcode. The downloaded file will have the default extension .xml.gz. While you can change the filename, it must include the .xml.gz ending. 
The source keyword argument is set to \"https\" by default. Alternatively, you can choose \"ftp\" as the source, which will retrieve the file from the EBI FTP server at ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/. However, please note that using \"https\" is highly recommended. This option will download the file from the EBI PDBe server at https://www.ebi.ac.uk/pdbe/files/sifts/.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/#MIToS.SIFTS.siftsmapping-Union{Tuple{T}, Tuple{F}, Tuple{String, Type{F}, String, Type{T}, String}} where {F, T}","page":"SIFTS","title":"MIToS.SIFTS.siftsmapping","text":"Parses a SIFTS XML file and returns a OrderedDict between residue numbers of two DataBases with the given identifiers. A chain could be specified (All by default). If missings is true (default) all the residues are used, even if they haven’t coordinates in the PDB file.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/#MIToS.Utils.parse_file-Tuple{LightXML.XMLDocument, Type{MIToS.SIFTS.SIFTSXML}}","page":"SIFTS","title":"MIToS.Utils.parse_file","text":"parse_file(document::LightXML.XMLDocument, ::Type{SIFTSXML}; chain=All, missings::Bool=true)\n\nReturns a Vector{SIFTSResidue} parsed from a SIFTSXML file. By default, parses all the chains and includes missing residues.\n\n\n\n\n\n","category":"method"},{"location":"","page":"Home","title":"Home","text":"\"MIToS\"/\n\"MIToS\"/","category":"page"},{"location":"","page":"Home","title":"Home","text":"A Julia Package to Analyze Protein Sequences, Structures, and Evolutionary Information","category":"page"},{"location":"#Modules","page":"Home","title":"Modules","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"MIToS tools are separated into different modules for different tasks.","category":"page"},{"location":"","page":"Home","title":"Home","text":"MSA: This module defines multiple functions and types for dealing with Multiple Sequence Alignments (MSAs) and their annotations. 
It also includes facilities for sequence clustering and shuffling, among others.\nPDB: This module defines types and methods to work with protein structures from different sources, such as the Protein Data Bank (PDB) or AlphaFold DB. It includes functions to superpose structures, measure the distance between residues, and much more.\nInformation: This module defines residue contingency tables and methods on them to estimate information measures. This allows measuring evolutionary information on MSA positions. It includes functions to estimate corrected mutual information (ZMIp, ZBLMIp) between MSA columns, as well as conservation estimations using Shannon entropy and the Kullback-Leibler divergence.\nSIFTS: This module allows access to SIFTS residue-level mapping of UniProt, Pfam, and other databases with PDB entries.\nPfam: This module uses the previous modules to work with Pfam MSAs. It also has useful parameter optimization functions to be used with Pfam alignments.\nUtils: MIToS also has a Utils module with common utility functions and types used in different modules of this package.","category":"page"},{"location":"#Citation","page":"Home","title":"Citation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If you use MIToS [1], please cite:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Diego J. Zea, Diego Anfossi, Morten Nielsen, Cristina Marino-Buslje; MIToS.jl: mutual information tools for protein sequence analysis in the Julia language, Bioinformatics, Volume 33, Issue 4, 15 February 2017, Pages 564–565, https://doi.org/10.1093/bioinformatics/btw646","category":"page"},{"location":"#Older-MIToS-versions","page":"Home","title":"Older MIToS versions","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"You can change the MIToS version of the documentation at the bottom left of this site—the oldest version available is MIToS 2.0. 
If you are using MIToS v1 in a version of Julia pre-1.0, please read this older documentation instead.","category":"page"},{"location":"#Acknowledgments","page":"Home","title":"Acknowledgments","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"MIToS was initially developed at the Structural Bioinformatics Unit of the Fundación Instituto Leloir (FIL) in Argentina. Its development now continues at the Molecular Assemblies and Genome Integrity group of the Institute for Integrative Biology of the Cell (I2BC) in France.","category":"page"},{"location":"","page":"Home","title":"Home","text":"We want to thank all contributors who have helped improve MIToS. We also thank the Julia community and all the MIToS users for their feedback and support.","category":"page"},{"location":"","page":"Home","title":"Home","text":"\"FIL\n\"FIL","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"@info \"Installation docs\"","category":"page"},{"location":"Installation/#Installation","page":"Installation","title":"Installation","text":"","category":"section"},{"location":"Installation/","page":"Installation","title":"Installation","text":"First you need to install Julia.(Image: ) MIToS' stable version can be installed by typing on the Julia REPL:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"MIToS\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"If everything goes well with the installation, MIToS will be loaded without errors by typing:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using MIToS","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"To update MIToS to the latest version, you can 
run:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.update(\"MIToS\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"tip: Ways to run Julia\nJulia REPL (Image: ): Built-in Julia command line. Start a Julia interactive session (REPL) by double-clicking the Julia executable or running julia from the system command line.\nIJulia (Image: ): Jupyter/IPython notebook for Julia.\nPluto (Image: ): A simple reactive notebook for Julia.\nVS Code Extension for Julia (Image: ): The Julia's Integrated Development Environment (IDE).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"info: Running the test suite\nOptionally, you can run the test suite to ensure everything works as expected. The test suite is extensive and can take several minutes to run. It is the same test suite used for MIToS' continuous integration (CI), so everything should pass. To run the test suite, execute using Pkg; Pkg.test(\"MIToS\") in the Julia REPL.","category":"page"},{"location":"Installation/#Plots-installation","page":"Installation","title":"Plots installation","text":"","category":"section"},{"location":"Installation/","page":"Installation","title":"Installation","text":"Julia plotting capabilities are available through external packages. MIToS makes use of RecipesBase to define plot recipes, which can be plotted using Plots(Image: ) and its different backends. You need to install Plots(Image: ) to plot MIToS objects:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"Plots\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"Once it is installed, you need to load Plots in order to use the plot function. 
There is more information about it in the Plots documentation(Image: ).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Plots","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"To generate graph (network), arc and chord (circo) plots, you also need to install and load GraphRecipes(Image: ).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"GraphRecipes\")\n\nusing GraphRecipes","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"You can look for examples in the GraphRecipes documentation(Image: ).","category":"page"}] +[{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"@info \"Scripts docs\"","category":"page"},{"location":"Scripts/#MIToS'-Scripts","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"The MIToS_Scripts.jl package offers a set of easy-to-use scripts for command-line execution without requiring Julia coding. It includes several scripts designed for various bioinformatics tasks, such as measuring estimating residue conservation and inter-residue coevolution, calculating distances between residues in a protein structure, and more.","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Pages = [\"Scripts.md\"]\nDepth = 4","category":"page"},{"location":"Scripts/#Installation","page":"MIToS' Scripts","title":"Installation","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"To install MIToS_Scripts.jl, you only need Julia 1.9 or later installed on your system. 
Execute julia in the terminal to open the Julia REPL, and then run the following command:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using Pkg\nPkg.add(url = \"https://github.com/MIToSOrg/MIToS_Scripts.jl\")","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Then, you can get the location of the installed scripts by running the following command:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using MIToS_Scripts\nscripts_folder = joinpath(pkgdir(MIToS_Scripts), \"scripts\")","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"You can run them from that location. Alternatively, you can add the location to your PATH environment variable, or copy the scripts to a folder already in your PATH to run them from anywhere.","category":"page"},{"location":"Scripts/#Usage","page":"MIToS' Scripts","title":"Usage","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"You can execute each provided script from your command line. 
For example, to run the Buslje09.jl script—if you are located in the folder where it is the scripts—use:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"julia Buslje09.jl input_msa_file","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"Refer to the documentation of each script for specific usage instructions; you can access it by running the script with the --help or -h flag:","category":"page"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"julia Buslje09.jl -h","category":"page"},{"location":"Scripts/#Scripts","page":"MIToS' Scripts","title":"Scripts","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"using Pkg\nproject_folder = \"MIToS_Scripts_Project\"\nisdir(project_folder) || mkdir(project_folder)\nPkg.activate(project_folder)\nPkg.add(url=\"https://github.com/MIToSOrg/MIToS_Scripts.jl\")\nusing MIToS_Scripts\nscripts_folder = joinpath(pkgdir(MIToS_Scripts), \"scripts\")","category":"page"},{"location":"Scripts/#Buslje09.jl","page":"MIToS' Scripts","title":"Buslje09.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Buslje09.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#BLMI.jl","page":"MIToS' Scripts","title":"BLMI.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"BLMI.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide\n","category":"page"},{"location":"Scripts/#Conservation.jl","page":"MIToS' Scripts","title":"Conservation.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' 
Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Conservation.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#DownloadPDB.jl","page":"MIToS' Scripts","title":"DownloadPDB.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"DownloadPDB.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#Distances.jl","page":"MIToS' Scripts","title":"Distances.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"Distances.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#MSADescription.jl","page":"MIToS' Scripts","title":"MSADescription.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"MSADescription.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#PercentIdentity.jl","page":"MIToS' Scripts","title":"PercentIdentity.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"PercentIdentity.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#AlignedColumns.jl","page":"MIToS' Scripts","title":"AlignedColumns.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, 
\"AlignedColumns.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"Scripts/#SplitStockholm.jl","page":"MIToS' Scripts","title":"SplitStockholm.jl","text":"","category":"section"},{"location":"Scripts/","page":"MIToS' Scripts","title":"MIToS' Scripts","text":"script_path = joinpath(scripts_folder, \"SplitStockholm.jl\") # hide\nprintln(read(`$(Base.julia_cmd()) --project=$project_folder $script_path -h`, String)) #hide","category":"page"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"@info \"MSA API docs\"","category":"page"},{"location":"MSA_API/#MSA","page":"MSA","title":"MSA","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"MIToS.MSA","category":"page"},{"location":"MSA_API/#MIToS.MSA","page":"MSA","title":"MIToS.MSA","text":"The MSA module of MIToS has utilities for working with Multiple Sequence Alignments of protein Sequences (MSA).\n\nFeatures\n\nRead and write MSAs in Stockholm, FASTA, A3M, PIR, or Raw format\nHandle MSA annotations\nEdit the MSA, e.g. delete columns or sequences, change sequence order, shuffling...\nKeep track of positions and annotations after modifications on the MSA\nDescribe a MSA, e.g. 
mean percent identity, sequence coverage, gap percentage...\n\nusing MIToS.MSA\n\n\n\n\n\n","category":"module"},{"location":"MSA_API/#Contents","page":"MSA","title":"Contents","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Pages = [\"MSA_API.md\"]\nDepth = 2","category":"page"},{"location":"MSA_API/#Types","page":"MSA","title":"Types","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"MSA_API/#MIToS.MSA.AbstractAlignedObject","page":"MSA","title":"MIToS.MSA.AbstractAlignedObject","text":"MIToS MSA and aligned sequences (aligned objects) are subtypes of AbstractMatrix{Residue}, because MSAs and sequences are stored as Matrix of Residues.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractAlignedSequence","page":"MSA","title":"MIToS.MSA.AbstractAlignedSequence","text":"A MIToS aligned sequence is an AbstractMatrix{Residue} with only 1 row/sequence.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractMultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.AbstractMultipleSequenceAlignment","text":"MSAs are stored as Matrix{Residue}. It's possible to use a NamedResidueMatrix{Array{Residue,2}} as the most simple MSA with sequence identifiers and column names.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AbstractSequence","page":"MSA","title":"MIToS.MSA.AbstractSequence","text":"A MIToS (unaligned) sequence is an AbstractMatrix{Residue} with only 1 row/sequence.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AlignedSequence","page":"MSA","title":"MIToS.MSA.AlignedSequence","text":"An AlignedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence. 
The NamedArray stores the sequence name and original column numbers as Strings.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedAlignedSequence","page":"MSA","title":"MIToS.MSA.AnnotatedAlignedSequence","text":"This type represents an aligned sequence, similar to AlignedSequence, but it also stores its Annotations.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedMultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.AnnotatedMultipleSequenceAlignment","text":"This type represents an MSA, similar to MultipleSequenceAlignment, but it also stores Annotations. These annotations are used to store residue coordinates (i.e. mapping to UniProt residue numbers).\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.AnnotatedSequence","page":"MSA","title":"MIToS.MSA.AnnotatedSequence","text":"An AnnotatedSequence wraps a NamedResidueMatrix{Array{Residue,2}} with only 1 row/sequence and its Annotations. The NamedArray stores the sequence name and original position numbers as Strings.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Annotations","page":"MSA","title":"MIToS.MSA.Annotations","text":"The Annotations type is basically a container for Dicts with the annotations of a multiple sequence alignment. Annotations was designed for storage of annotations of the Stockholm format.\n\nMIToS also uses MSA annotations to keep track of:\n\nModifications of the MSA (MIToS_...) such as deletion of sequences or columns.\nPosition numbers in the original MSA file (column mapping: ColMap)\nPosition of the residues in the sequence (sequence mapping: SeqMap)\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Clusters","page":"MSA","title":"MIToS.MSA.Clusters","text":"Data structure to represent sequence clusters. 
The sequence data itself is not included.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.GappedAlphabet","page":"MSA","title":"MIToS.MSA.GappedAlphabet","text":"This type defines the usual alphabet of the 20 natural residues and a gap character.\n\njulia> using MIToS.MSA\n\njulia> GappedAlphabet()\nGappedAlphabet of length 21. Residues : res\"ARNDCQEGHILKMFPSTWYV-\"\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.MultipleSequenceAlignment","page":"MSA","title":"MIToS.MSA.MultipleSequenceAlignment","text":"This MSA type include a NamedArray wrapping a Matrix of Residues. The use of NamedArray allows to store sequence names and original column numbers as Strings, and fast indexing using them.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.NoClustering","page":"MSA","title":"MIToS.MSA.NoClustering","text":"Use NoClustering() to avoid the use of clustering where a Clusters type is needed.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.ReducedAlphabet","page":"MSA","title":"MIToS.MSA.ReducedAlphabet","text":"ReducedAlphabet allows the construction of reduced residue alphabets, where residues inside parenthesis belong to the same group.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> ab[Residue('K')]\n2\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.Residue","page":"MSA","title":"MIToS.MSA.Residue","text":"Most of the MIToS design is created around the Residue bitstype. It has representations for the 20 natural amino acids, a value representing insertions and deletions (GAP, '-') and one representing unknown, ambiguous and non standard residues (XAA, 'X'). Each Residue is encoded as an integer number, with the same bit representation and size than a Int. 
This allows fast indexing operations of probability or frequency matrices.\n\nResidue creation and conversion\n\nCreation and conversion of Residues should be treated carefully. Residue is encoded as a 32- or 64-bit type similar to Int, to get fast indexing using Int(x::Residue). Int simply calls reinterpret without checking if the residue is valid. Valid residues have integer values in the closed interval [1,22]. convert from Int and Char always returns valid residues, however it's possible to find invalid residues (they are shown using the character '�') after the creation of uninitialized Residue arrays (i.e. using Array). You can use zeros, ones or rand to get initialized Residue arrays with valid residues. Conversions to and from Chars change the bit representation and allow the use of the usual character representation of residues and amino acids. These conversions are used in IO operations and always return valid residues. In conversions from Char, lowercase letters, '*', '-' and '.' are translated to GAP, letters representing the 20 natural amino acids (ARNDCQEGHILKMFPSTWYV) are translated to their corresponding Residue and any other character is translated to XAA. 
Since lowercase letters and dots are translated to gaps, Pfam MSA insert columns are converted to columns full of gaps.\n\njulia> using MIToS.MSA\n\njulia> alanine = Residue('A')\nA\n\njulia> Char(alanine)\n'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)\n\njulia> for residue in res\"ARNDCQEGHILKMFPSTWYV-X\"\n println(residue, \" \", Int(residue))\n end\nA 1\nR 2\nN 3\nD 4\nC 5\nQ 6\nE 7\nG 8\nH 9\nI 10\nL 11\nK 12\nM 13\nF 14\nP 15\nS 16\nT 17\nW 18\nY 19\nV 20\n- 21\nX 22\n\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.ResidueAlphabet","page":"MSA","title":"MIToS.MSA.ResidueAlphabet","text":"Abstract type to define residue alphabet types.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.UngappedAlphabet","page":"MSA","title":"MIToS.MSA.UngappedAlphabet","text":"This type defines the usual alphabet of the 20 natural residues, without the gap character.\n\njulia> using MIToS.MSA\n\njulia> UngappedAlphabet()\nUngappedAlphabet of length 20. Residues : res\"ARNDCQEGHILKMFPSTWYV\"\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#Constants","page":"MSA","title":"Constants","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"MSA_API/#MIToS.MSA.GAP","page":"MSA","title":"MIToS.MSA.GAP","text":"GAP is the Residue representation on MIToS for gaps ('-', insertions and deletions). Lowercase residue characters, dots and '*' are encoded as GAP in conversion from Strings and Chars. This Residue constant is encoded as Residue(21).\n\n\n\n\n\n","category":"constant"},{"location":"MSA_API/#MIToS.MSA.WeightTypes","page":"MSA","title":"MIToS.MSA.WeightTypes","text":"The WeightTypes type is the same as Union{Weights,NoClustering,Clusters}. This type is used to represent weights. 
Most of the functions taking the weights keyword argument in the Information module accept instances of WeightTypes.\n\n\n\n\n\n","category":"type"},{"location":"MSA_API/#MIToS.MSA.XAA","page":"MSA","title":"MIToS.MSA.XAA","text":"XAA is the Residue representation for unknown, ambiguous and non standard residues. This Residue constant is encoded as Residue(22).\n\n\n\n\n\n","category":"constant"},{"location":"MSA_API/#Macros","page":"MSA","title":"Macros","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"MSA_API/#MIToS.MSA.@res_str-Tuple{Any}","page":"MSA","title":"MIToS.MSA.@res_str","text":"The MIToS macro @res_str takes a string and returns a Vector of Residues (sequence).\n\njulia> using MIToS.MSA\n\njulia> res\"MIToS\"\n5-element Vector{Residue}:\n M\n I\n T\n -\n S\n\n\n\n\n\n","category":"macro"},{"location":"MSA_API/#Methods-and-functions","page":"MSA","title":"Methods and functions","text":"","category":"section"},{"location":"MSA_API/","page":"MSA","title":"MSA","text":"Modules = [MIToS.MSA]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"MSA_API/#Base.isvalid-Tuple{Type{MIToS.MSA.Residue}, MIToS.MSA.Residue}","page":"MSA","title":"Base.isvalid","text":"isvalid(res::Residue)\n\nIt returns true if the encoded integer is in the closed interval [1,22].\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#Base.names-Tuple{MIToS.MSA.ReducedAlphabet}","page":"MSA","title":"Base.names","text":"It returns the name of each group. 
The name is a string with the one letter code of each residue that belong to the group.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> names(ab)\n8-element Vector{String}:\n \"AILMV\"\n \"RHK\"\n \"NQST\"\n \"DE\"\n \"FWY\"\n \"C\"\n \"G\"\n \"P\"\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#Base.rand-Tuple{Random.AbstractRNG, Random.SamplerType{MIToS.MSA.Residue}}","page":"MSA","title":"Base.rand","text":"It chooses from the 20 natural residues (it doesn't generate gaps).\n\njulia> using MIToS.MSA\n\njulia> using Random\n\njulia> Random.seed!(1); # Reseed the random number generator.\n\njulia> rand(Residue)\nR\n\njulia> rand(Residue, 4, 4)\n4×4 Matrix{Residue}:\n E D D A\n F S K K\n M S I M\n Y F E D\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.adjustreference","page":"MSA","title":"MIToS.MSA.adjustreference","text":"Creates a new matrix of residues. This function deletes positions/columns of the MSA with gaps in the reference (first) sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.adjustreference!","page":"MSA","title":"MIToS.MSA.adjustreference!","text":"It removes positions/columns of the MSA with gaps in the reference (first) sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.annotate_modification!-Tuple{MIToS.MSA.Annotations, String}","page":"MSA","title":"MIToS.MSA.annotate_modification!","text":"Annotates on file annotations the modifications realized by MIToS on the MSA. It always returns true, so It can be used in a boolean context.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.annotations-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.annotations","text":"The annotations function returns the Annotations of an annotated MSA or aligned sequence. 
If the object is not annotated, it returns an empty Annotations object.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.column_index-Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}} where AT, AbstractString}","page":"MSA","title":"MIToS.MSA.column_index","text":"column_index(msa, col_name)\n\nReturn the index (integer position) of the column with name col_name in the MSA msa. A KeyError is thrown if the column name does not exist. If col_name is an integer, the same integer is returned without checking if it is a valid index.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columngapfraction-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.columngapfraction","text":"Fraction of gaps per column/position on the MSA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnname_iterator-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.columnname_iterator","text":"columnname_iterator(msa)\n\nIt returns an iterator that returns the column names of the msa. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnnames-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.columnnames","text":"columnnames(msa)\n\nIt returns a Vector{String} with the sequence names/identifiers. If the msa is a Matrix{Residue} this function returns the actual column numbers as strings. 
Otherwise it returns the column number of the original MSA through the wrapped NamedArray column names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.columnpairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"MSA","title":"MIToS.MSA.columnpairsmatrix","text":"Initialize an empty PairwiseListMatrix for a pairwise measure in column pairs. It uses the column mapping (column number in the input MSA file) if it is available, otherwise it uses the actual column numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.coverage-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.coverage","text":"Coverage of the sequences with respect to the number of positions on the MSA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.delete_annotated_modifications!-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.delete_annotated_modifications!","text":"Deletes all the MIToS annotated modifications\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.deletefullgapcolumns!","page":"MSA","title":"MIToS.MSA.deletefullgapcolumns!","text":"Deletes columns with 100% gaps; these columns are generated by inserts.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtercolumns!","page":"MSA","title":"MIToS.MSA.filtercolumns!","text":"filtercolumns!(msa, mask[, annotate::Bool=true])\n\nIt allows to filter MSA or aligned sequence columns/positions using an AbstractVector{Bool} mask. 
Annotations are updated if annotate is true (default).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtercolumns!-Tuple{MIToS.MSA.Annotations, Any}","page":"MSA","title":"MIToS.MSA.filtercolumns!","text":"filtercolumns!(data::Annotations, mask)\n\nIt is useful for deleting column annotations (creating a subset in place).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtercolumns-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.filtercolumns","text":"It's similar to filtercolumns! but for an AbstractMatrix{Residue}\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtersequences!","page":"MSA","title":"MIToS.MSA.filtersequences!","text":"filtersequences!(msa, mask[, annotate::Bool=true])\n\nIt allows to filter msa sequences using a AbstractVector{Bool} mask (It removes sequences with false values). AnnotatedMultipleSequenceAlignment annotations are updated if annotate is true (default).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.filtersequences!-Tuple{MIToS.MSA.Annotations, Vector{String}, AbstractVector{Bool}}","page":"MSA","title":"MIToS.MSA.filtersequences!","text":"filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})\n\nIt is useful for deleting sequence annotations. ids should be a list of the sequence names and mask should be a logical vector.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.filtersequences-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.filtersequences","text":"It's similar to filtersequences! but for an AbstractMatrix{Residue}\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gapfraction-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.gapfraction","text":"It calculates the fraction of gaps on the Array (alignment, sequence, column, etc.). 
This function can take an extra dimension argument for calculation of the gap fraction over the given dimension.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gapstrip!","page":"MSA","title":"MIToS.MSA.gapstrip!","text":"This functions deletes/filters sequences and columns/positions on the MSA on the following order:\n\nRemoves all the columns/position on the MSA with gaps on the reference (first) sequence.\nRemoves all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps).\nRemoves all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.gapstrip-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.gapstrip","text":"Creates a new matrix of Residues (MSA) with deleted sequences and columns/positions. The MSA is edited in the following way:\n\nRemoves all the columns/position on the MSA with gaps on the reference (first) sequence\nRemoves all the sequences with a coverage with respect to the number of columns/positions on the MSA less than a coveragelimit (default to 0.75: sequences with 25% of gaps)\nRemoves all the columns/position on the MSA with more than a gaplimit (default to 0.5: 50% of gaps)\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getannotcolumn","page":"MSA","title":"MIToS.MSA.getannotcolumn","text":"getannotcolumn(ann[, feature[,default]])\n\nIt returns per column annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotfile","page":"MSA","title":"MIToS.MSA.getannotfile","text":"getannotfile(ann[, feature[,default]])\n\nIt returns per file annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotresidue","page":"MSA","title":"MIToS.MSA.getannotresidue","text":"getannotresidue(ann[, seqname, 
feature[,default]])\n\nIt returns per residue annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getannotsequence","page":"MSA","title":"MIToS.MSA.getannotsequence","text":"getannotsequence(ann[, seqname, feature[,default]])\n\nIt returns per sequence annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getcolumnmapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.getcolumnmapping","text":"It returns a Vector{Int} with the original column number of each column on the actual MSA. The mapping is annotated in the ColMap file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.\n\nNOTE: When the MSA results from vertically concatenating MSAs using vcat, the column map annotations from the constituent MSAs (such as 1_ColMap, 2_ColMap, etc.) are not returned. Instead, the column numbers referenced in the column names are provided. To access the original annotations, utilize the getannotfile function.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.gethcatmapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"MSA","title":"MIToS.MSA.gethcatmapping","text":"It returns a vector of numbers from 1 to N for each column that indicates the source MSA. The mapping is annotated in the \"HCat\" file annotation of an AnnotatedMultipleSequenceAlignment or in the column names of an NamedArray or MultipleSequenceAlignment.\n\nNOTE: When the MSA results from vertically concatenating MSAs using vcat, the \"HCat\" annotations from the constituent MSAs are renamed as \"1_HCat\", \"2_HCat\", etc. In that case, the MSA numbers referenced in the column names are provided. 
To access the original annotations, utilize the getannotfile function.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getnamedict-Tuple{MIToS.MSA.ReducedAlphabet}","page":"MSA","title":"MIToS.MSA.getnamedict","text":"It takes a ResidueAlphabet and returns a dictionary from group name to group position.\n\njulia> using MIToS.MSA\n\njulia> ab = ReducedAlphabet(\"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\")\nReducedAlphabet of length 8 : \"(AILMV)(RHK)(NQST)(DE)(FWY)CGP\"\n\njulia> getnamedict(ab)\nOrderedCollections.OrderedDict{String, Int64} with 8 entries:\n \"AILMV\" => 1\n \"RHK\" => 2\n \"NQST\" => 3\n \"DE\" => 4\n \"FWY\" => 5\n \"C\" => 6\n \"G\" => 7\n \"P\" => 8\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getresidues-Tuple{Matrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.getresidues","text":"getresidues allows you to access the residues stored inside an MSA or aligned sequence as a Matrix{Residue} without annotations nor column/row names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getresiduesequences-Tuple{Matrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.getresiduesequences","text":"getresiduesequences returns a Vector{Vector{Residue}} with all the MSA sequences without annotations nor column/sequence names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getsequence","page":"MSA","title":"MIToS.MSA.getsequence","text":"getsequence takes an MSA and a sequence number or identifier and returns an aligned sequence object. If the MSA is an AnnotatedMultipleSequenceAlignment, it returns an AnnotatedAlignedSequence with the sequence annotations. From a MultipleSequenceAlignment, It returns an AlignedSequence object. 
If an Annotations object and a sequence identifier are used, this function returns the annotations related to the sequence.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.getsequencemapping-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, String}","page":"MSA","title":"MIToS.MSA.getsequencemapping","text":"It returns the sequence coordinates as a Vector{Int} for an MSA sequence. That vector has one element for each MSA column. If the number is 0 in the mapping, there is a gap in that column for that sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.getweight-Tuple{MIToS.MSA.NoClustering, Int64}","page":"MSA","title":"MIToS.MSA.getweight","text":"getweight(c[, i::Int])\n\nThis function returns the weight of the sequence number i. getweight should be defined for any type used for frequencies!/frequencies in order to use its weights. If i isn't used, this function returns a vector with the weight of each sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.hobohmI-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.hobohmI","text":"Sequence clustering using the Hobohm I method from Hobohm et al.\n\nReferences\n\nHobohm, Uwe, et al. 
\"Selection of representative protein data sets.\" Protein Science 1.3 (1992): 409-417.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.join_msas-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, MIToS.MSA.AnnotatedMultipleSequenceAlignment, Any}","page":"MSA","title":"MIToS.MSA.join_msas","text":"join_msas(msa_a::AnnotatedMultipleSequenceAlignment, \n msa_b::AnnotatedMultipleSequenceAlignment, \n pairing; \n kind::Symbol=:outer, \n axis::Int=1)::AnnotatedMultipleSequenceAlignment\n\njoin_msas(msa_a::AnnotatedMultipleSequenceAlignment, \n msa_b::AnnotatedMultipleSequenceAlignment, \n positions_a, \n positions_b; \n kind::Symbol=:outer, \n axis::Int=1)::AnnotatedMultipleSequenceAlignment\n\nJoin two Multiple Sequence Alignments (MSAs), msa_a and msa_b, based on specified matching positions or names. The function supports two formats: one takes a pairing argument as a list of correspondences, and the other takes positions_a and positions_b as separate lists indicating matching positions or names in each MSA. This function allows for various types of join operations (:inner, :outer, :left, :right) and can merge MSAs by sequences (axis 1) or by columns (axis 2).\n\nParameters:\n\nmsa_a::AnnotatedMultipleSequenceAlignment: The first MSA.\nmsa_b::AnnotatedMultipleSequenceAlignment: The second MSA.\npairing: An iterable where each element is a pair of sequence or column positions (Ints) or names (Strings) to match between msa_a and msa_b. For example, it can be a list of two-element tuples or pairs, or and OrderedDict.\npositions_a, positions_b: Separate lists of positions or names in msa_a and msa_b, respectively.\nkind::Symbol: Type of join operation. 
Default is :outer.\naxis::Int: The axis along which to join (1 to match sequences, 2 to match columns).\n\nReturns:\n\nAnnotatedMultipleSequenceAlignment: A new MSA resulting from the join operation.\n\nBehavior and Sequence Ordering:\n\nThe order of sequences or columns in the resulting MSA depends on the kind of join operation and the order of elements in the pairing or positions_a and positions_b lists.\n\nFor :inner joins, the function returns an MSA containing only those sequences/columns that are paired in both msa_a and msa_b. The order of elements in the output MSA follows the order in the pairing or position lists.\nFor :outer joins, the output MSA includes all sequences/columns from both msa_a and msa_b. Unpaired sequences/columns are filled with gaps as needed. The sequences/columns from msa_a are placed first. If the pairing or position lists are sorted, the output MSA columns and sequences will keep the same order as in the inputs. That's nice for situations such as profile alignments where the order of columns is important. If the pairing or position lists are not sorted, then the order of sequences/columns in the output MSA is not guaranteed to be the same as in the inputs. In particular, the matched sequences or columns will be placed first, followed by the unmatched ones.\nFor :left joins, all sequences/columns from msa_a are included in the output MSA keeping the same order as in msa_a. Sequences/columns from msa_b are added where matches are found, with gaps filling the unmatched positions.\nFor :right joins, the output MSA behaves like :left joins but with roles of msa_a and msa_b reversed.\n\nWarning: When using Dict for pairing, the order of elements might not be preserved as expected. Dict in Julia does not maintain the order of its elements, which might lead to unpredictable order of sequences/columns in the output MSA. 
To preserve order, it is recommended to use an OrderedDict or a list of Pairs objects.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.meanpercentidentity","page":"MSA","title":"MIToS.MSA.meanpercentidentity","text":"Returns the mean of the percent identity between the sequences of a MSA. If the MSA has 300 sequences or less, the mean is exact. If the MSA has more sequences and the exact keyword is false (default), 44850 random pairs of sequences are used for the estimation. The number of samples can be changed using the second argument. Use exact=true to perform all the pairwise comparisons (the calculation could be slow).\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.namedmatrix-Tuple{MIToS.MSA.AbstractResidueMatrix}","page":"MSA","title":"MIToS.MSA.namedmatrix","text":"The namedmatrix function returns the NamedResidueMatrix{Array{Residue,2}} stored in an MSA or aligned sequence.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.ncolumns-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.ncolumns","text":"ncolumns returns the number of MSA columns or positions.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.ncolumns-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.ncolumns","text":"ncolumns(ann::Annotations) returns the number of columns/residues with annotations. 
This function returns -1 if there are no annotations per column/residue.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.nsequences-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.nsequences","text":"nsequences returns the number of sequences on the MSA.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Tuple{Any, Any, Any}","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(seq1, seq2, threshold)\n\nComputes quickly if two aligned sequences have an identity value greater than a given threshold value. Returns a boolean value. Positions with gaps in both sequences don't count toward the length of the sequences. Positions with a XAA in at least one sequence aren't counted.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Tuple{Any, Any}","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(seq1, seq2)\n\nCalculates the fraction of identities between two aligned sequences. The identity value is calculated as the number of identical characters in the i-th position of both sequences divided by the length of both sequences. Positions with gaps in both sequences don't count toward the length of the sequences. Positions with a XAA in at least one sequence aren't counted.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentidentity-Union{Tuple{AbstractMatrix{MIToS.MSA.Residue}}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}}} where T","page":"MSA","title":"MIToS.MSA.percentidentity","text":"percentidentity(msa[, out::Type=Float64])\n\nCalculates the identity between all the sequences on a MSA. You can indicate the output element type with the last optional parameter (Float64 by default). 
For a MSA with a lot of sequences, you can use Float32 or Float16 in order to avoid the OutOfMemoryError().\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.percentsimilarity","page":"MSA","title":"MIToS.MSA.percentsimilarity","text":"Calculates the similarity percent between two aligned sequences. The 100% is the length of the aligned sequences minus the number of columns with gaps in both sequences and the number of columns with at least one residue outside the alphabet. So, columns with residues outside the alphabet (other than the specially treated GAP) aren't counted to the protein length. Two residues are considered similar if they belong to the same group in a ReducedAlphabet. The alphabet (third positional argument) by default is:\n\nReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\nThe first group is composed of the non polar residues (AILMV), the second group is composed of polar residues, the third group are positive residues, the fourth group are negative residues, the fifth group is composed of the aromatic residues (FWY). C, G and P are considered unique residues.\n\nOther residue groups/alphabets:\n\nSMS (Sequence Manipulation Suite) Ident and Sim (Stothard Paul. 2000):\n\nReducedAlphabet(\"(GAVLI)(FYW)(ST)(KRH)(DENQ)P(CM)\")\n\nStothard P (2000) The Sequence Manipulation Suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28:1102-1104.\n\nBio3D 2.2 seqidentity (Grant, Barry J., et al. 2006):\n\nReducedAlphabet(\"(GA)(MVLI)(FYW)(ST)(KRH)(DE)(NQ)PC\")\n\nReferences\n\nStothard, Paul. \"The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences.\" Biotechniques 28.6 (2000): 1102-1104.\nGrant, Barry J., et al. 
\"Bio3d: an R package for the comparative analysis of protein structures.\" Bioinformatics 22.21 (2006): 2695-2696.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.percentsimilarity-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.percentsimilarity","text":"Calculates the similarity percent between all the sequences on a MSA. You can indicate the output element type with the out keyword argument (Float64 by default). For an MSA with a lot of sequences, you can use out=Float32 or out=Float16 in order to avoid the OutOfMemoryError().\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.printmodifications-Tuple{MIToS.MSA.Annotations}","page":"MSA","title":"MIToS.MSA.printmodifications","text":"Prints MIToS annotated modifications\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.rename_sequences!-Union{Tuple{T}, Tuple{AT}, Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}, Vector{T}}} where {AT, T<:AbstractString}","page":"MSA","title":"MIToS.MSA.rename_sequences!","text":"rename_sequences!(msa, newnames::Vector{T}) where {T<:AbstractString}\nrename_sequences!(msa, old2new::AbstractDict)\nrename_sequences!(msa, old2new::Pair...)\n\nRename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. 
The function modifies the msa in place and returns it.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.rename_sequences-Tuple{Any, Any}","page":"MSA","title":"MIToS.MSA.rename_sequences","text":"rename_sequences(msa, newnames::Vector{T}) where {T<:AbstractString}\nrename_sequences(msa, old2new::AbstractDict)\nrename_sequences(msa, old2new::Pair...)\n\nRename the sequences of an MSA given a vector of new names, a dictionary mapping old names to new names, or one or more pairs going from old to new names. If the msa is an AnnotatedMultipleSequenceAlignment, the annotations are also updated. The function returns a new MSA with the sequences renamed without modifying the original MSA.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.residue2three-Tuple{MIToS.MSA.Residue}","page":"MSA","title":"MIToS.MSA.residue2three","text":"This function returns the three letter name of the Residue.\n\njulia> using MIToS.MSA\n\njulia> residue2three(Residue('G'))\n\"GLY\"\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.residuefraction-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"MSA","title":"MIToS.MSA.residuefraction","text":"It calculates the fraction of residues (no gaps) on the Array (alignment, sequence, column, etc.). 
This function can take an extra dimension argument for calculation of the residue fraction over the given dimension\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequence_id-Tuple{Union{MIToS.MSA.AbstractAlignedSequence, MIToS.MSA.AbstractSequence}}","page":"MSA","title":"MIToS.MSA.sequence_id","text":"sequence_id(seq::Union{AbstractSequence,AbstractAlignedSequence})\n\nIt returns the sequence identifier of a sequence object.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequence_index-Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}} where AT, AbstractString}","page":"MSA","title":"MIToS.MSA.sequence_index","text":"sequence_index(msa, seq_name)\n\nReturn the index (integer position) of the sequence with name seq_name in the MSA msa. A KeyError is thrown if the sequence name does not exist. If seq_name is an integer, the same integer is returned without checking if it is a valid index.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencename_iterator-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT","page":"MSA","title":"MIToS.MSA.sequencename_iterator","text":"sequencename_iterator(msa)\n\nIt returns an iterator that returns the sequence names/identifiers of the msa.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencenames-Union{Tuple{NamedArrays.NamedMatrix{MIToS.MSA.Residue, AT, Tuple{OrderedCollections.OrderedDict{String, Int64}, OrderedCollections.OrderedDict{String, Int64}}}}, Tuple{AT}} where AT<:AbstractArray","page":"MSA","title":"MIToS.MSA.sequencenames","text":"sequencenames(msa)\n\nIt returns a Vector{String} with the sequence 
names/identifiers.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.sequencepairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{AbstractMatrix{MIToS.MSA.Residue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"MSA","title":"MIToS.MSA.sequencepairsmatrix","text":"Initialize an empty PairwiseListMatrix for a pairwise measure in sequence pairs. It uses the sequence names if they are available, otherwise it uses the actual sequence numbers. You can use the positional argument to indicate the number Type (default: Float64), if the PairwiseListMatrix should store the diagonal values on the list (default: false) and a default value for the diagonal (default: NaN).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.setannotcolumn!","page":"MSA","title":"MIToS.MSA.setannotcolumn!","text":"setannotcolumn!(ann, feature, annotation)\n\nIt stores per column annotation (1 char per column) for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotfile!","page":"MSA","title":"MIToS.MSA.setannotfile!","text":"setannotfile!(ann, feature, annotation)\n\nIt stores per file annotation for feature\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotresidue!","page":"MSA","title":"MIToS.MSA.setannotresidue!","text":"setannotresidue!(ann, seqname, feature, annotation)\n\nIt stores per residue annotation (1 char per residue) for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setannotsequence!","page":"MSA","title":"MIToS.MSA.setannotsequence!","text":"setannotsequence!(ann, seqname, feature, annotation)\n\nIt stores per sequence annotation for (seqname, feature)\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.setreference!","page":"MSA","title":"MIToS.MSA.setreference!","text":"It puts the sequence i (name or position) as reference (first sequence) of the MSA. 
This function swaps the sequences 1 and i.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#MIToS.MSA.shuffle_msa!-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.shuffle_msa!","text":"shuffle_msa!([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)\n\nIn-place version of shuffle_msa. It randomly permute residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows to shuffle only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default).\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.shuffle_msa-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Vararg{Any}}","page":"MSA","title":"MIToS.MSA.shuffle_msa","text":"shuffle_msa([rng=default_rng(),] msa::AbstractMatrix{Residue}, subset=Colon(); dims=2, fixedgaps=true, fixed_reference=false)\n\nIt randomly permute residues in the MSA msa along sequences (dims=1) or columns (dims=2, the default). The optional positional argument subset allows to shuffle only a subset of the sequences or columns. The optional keyword argument fixedgaps indicates if the gaps should remain their positions (true by default). The optional keyword argument fixed_reference indicates if the residues in the first sequence should remain in their positions (false by default). 
To shuffle in-place, see shuffle_msa!.\n\njulia> using MIToS.MSA\n\njulia> using Random\n\njulia> msa = hcat(res\"RRE\",res\"DDK\", res\"G--\")\n3×3 Matrix{Residue}:\n R D G\n R D -\n E K -\n\njulia> Random.seed!(42);\n\njulia> shuffle_msa(msa, dims=1, fixedgaps=true)\n3×3 Matrix{Residue}:\n G D R\n R D -\n E K -\n\njulia> Random.seed!(42);\n\njulia> shuffle_msa(msa, dims=1, fixedgaps=false)\n3×3 Matrix{Residue}:\n G D R\n R - D\n E K -\n\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.stringsequence-Tuple{AbstractMatrix{MIToS.MSA.Residue}, Any}","page":"MSA","title":"MIToS.MSA.stringsequence","text":"stringsequence(seq)\nstringsequence(msa, i::Int)\nstringsequence(msa, id::String)\n\nIt returns the selected sequence as a String.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.swapsequences!-Tuple{Matrix{MIToS.MSA.Residue}, Int64, Int64}","page":"MSA","title":"MIToS.MSA.swapsequences!","text":"It swaps the sequences on the positions i and j of an MSA. Also it's possible to swap sequences using their sequence names/identifiers when the MSA object as names.\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.MSA.three2residue-Tuple{String}","page":"MSA","title":"MIToS.MSA.three2residue","text":"It takes a three letter residue name and returns the corresponding Residue. If the name isn't in the MIToS dictionary, a XAA is returned.\n\njulia> using MIToS.MSA\n\njulia> three2residue(\"ALA\")\nA\n\n\n\n\n\n","category":"method"},{"location":"MSA_API/#MIToS.Utils.parse_file","page":"MSA","title":"MIToS.Utils.parse_file","text":"parse_file(io, format[, output; generatemapping, useidcoordinates, deletefullgaps])\n\nThe keyword argument generatemapping (false by default) indicates if the mapping of the sequences (\"SeqMap\") and columns (\"ColMap\") and the number of columns in the original MSA (\"NCol\") should be generated and saved in the annotations. 
If useidcoordinates is true (default: false) the sequence IDs of the form \"ID/start-end\" are parsed and used for determining the start and end positions when the mappings are generated. deletefullgaps (true by default) indicates if columns 100% gaps (generally inserts from a HMM) must be removed from the MSA.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#Random.shuffle","page":"MSA","title":"Random.shuffle","text":"It's like shuffle but in-place. When a Matrix{Residue} or a AbstractAlignedObject (sequence or MSA) is used, you can indicate if the gaps should remain their positions using the last boolean argument.\n\nDEPRECATED: This method is deprecated. Use shuffle_msa instead.\n\n\n\n\n\n","category":"function"},{"location":"MSA_API/#Random.shuffle!","page":"MSA","title":"Random.shuffle!","text":"It's like Random.shuffle. When a Matrix{Residue} is used, you can indicate if the gaps should remain their positions using the last boolean argument. The previous argument should be the dimension to shuffle, 1 for shuffling residues in a sequence (row) or 2 for shuffling residues in a column.\n\nDEPRECATED: This method is deprecated. Use shuffle_msa! instead.\n\n\n\n\n\n","category":"function"},{"location":"Information/","page":"Information","title":"Information","text":"CurrentModule = MIToS.Information","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information docs\"","category":"page"},{"location":"Information/#Module-Information","page":"Information","title":"Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Extracting evolutionary signals, such as conservation and coevolution, from Multiple Sequence Alignments (MSAs) is a common task in bioinformatics. 
There are several methods to estimate these signals, including information measures like Shannon Entropy—to assess the conservation of a position—and Mutual Information—to assess the coevolution between two positions. The Information module of MIToS defines types and functions useful for calculating those information measures over an MSA. This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from these counts. It also includes methods for applying corrections to those tables, e.g., pseudo counts and pseudo frequencies. Finally, Information allows using probabilities and counts to estimate information measures and other frequency-based values.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information # to load the Information module","category":"page"},{"location":"Information/#Features","page":"Information","title":"Features","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Estimate multi-dimensional frequencies (counts) and probability tables from sequences, MSA columns, etc...\nCorrections for a small number of observations\nCorrections for data redundancy on an MSA\nEstimate information measures such as Shannon entropy, mutual information, etc...\nCalculate corrected mutual information between residues","category":"page"},{"location":"Information/#Contents","page":"Information","title":"Contents","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Pages = [\"Information.md\"]\nDepth = 4","category":"page"},{"location":"Information/#Counting-residues","page":"Information","title":"Counting residues","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"MIToS Information module defines a multidimensional ContingencyTable type and two 
types wrapping it, Frequencies and Probabilities, to store occurrences or probabilities. The ContingencyTable type stores the contingency matrix, its marginal values and total. These types are parametric, taking three ordered parameters:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"T : The type used for storing the counts or probabilities, e.g. Float64. It's possible to use BigFloat if more precision is needed.\nN : It's the dimension of the table and should be an Int.\nA : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"note: Note\nContingencyTable can be used for storing probabilities or counts. The wrapper types Probabilities and Frequencies are mainly intended to dispatch in methods that need to know if the matrix has probabilities or counts, e.g. shannon_entropy. In general, the use of ContingencyTable is recommended over the use of Probabilities and Frequencies.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In this way, a matrix for storing pairwise probabilities of residues (without gaps) can be initialized using:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nPij = ContingencyTable(Float64, Val{2}, UngappedAlphabet())","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"[High level interface] It is possible to use the functions frequencies and probabilities to easily calculate the frequencies of sequences or columns of a MSA, where the number of sequences/columns determines the dimension of the resulting table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA # to use res\"...\" to
create Vector{Residue}\n\ncolumn_i = res\"AARANHDDRDC-\"\ncolumn_j = res\"-ARRNHADRAVY\"\n# Nij[R,R] = 1 1 = 2\n\nNij = frequencies(column_i, column_j)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can use sum to get the stored total:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"sum(Nij) # There are 12 Residues, but 2 are gaps","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Contingency tables can be indexed using Int or Residues:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Nij[2, 2] # Use Int to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Nij[Residue('R'), Residue('R')] # Use Residue to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"warning: Warning\nThe number makes reference to the specific index in the table e.g [2,2] references the second row and the second column. The use of the number used to encode the residue to index the table is dangerous. The equivalent index number of a residue depends on the used alphabet and Int(Residue('X')) will be always out of bounds.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Indexing with Residues works as expected. 
It uses the alphabet of the contingency table to find the index of the Residue.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA\n\nalphabet = ReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\ncolumn_i = res\"AARANHDDRDC-\"\ncolumn_j = res\"-ARRNHADRAVY\"\n# Fij[R,R] = 1 1 1 = 3 # RHK\n\nFij = frequencies(column_i, column_j, alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Fij[Residue('R'), Residue('R')] # Use Residue to index the table","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"The function getcontingencytable allows to access the wrapped ContingencyTable in a Frequencies object. You can use it, in combination with normalize to get a contingency table of probabilities. The result can be wrapped inside a Probabilities object:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Probabilities(normalize(getcontingencytable(Fij)))","category":"page"},{"location":"Information/#Example:-Plotting-the-probabilities-of-each-residue-in-a-sequence","page":"Information","title":"Example: Plotting the probabilities of each residue in a sequence","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Similar to the frequencies function, the probabilities function can take at least one sequence (vector of residues) and returns the probabilities of each residue. 
Optionally, the keyword argument alphabet could be used to count some residues in the same cell of the table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"probabilities(res\"AARANHDDRDC\", alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Here, we are going to use the probabilities function to get the residue probabilities of a particular sequence from UniProt.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"use the getsequence function, from the MSA module, to get the sequence from a FASTA downloaded from UniProt.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information # to use the probabilities function\nusing MIToS.MSA # to use getsequence on the one sequence FASTA (canonical) from UniProt\nseq = read_file(\"http://www.uniprot.org/uniprot/P29374.fasta\", FASTA) # Small hack: read the single sequence as a MSA\nprobabilities(seq[1, :]) # Select the single sequence and calculate the probabilities","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: Plots\"\nusing Plots\ngr(size=(600,300))\nusing MIToS.Information # to use the probabilities function\nusing MIToS.MSA # to use getsequence on the one sequence FASTA (canonical) from UniProt\nseq = read_file(\"http://www.uniprot.org/uniprot/P29374.fasta\", FASTA) # Small hack: read the single sequence as a MSA\nPa = probabilities(seq[1,:]) # Select the single sequence and calculate the probabilities","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using Plots # We choose Plots because it's intuitive, concise and backend independent\ngr(size = (600, 300))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can plot together 
with the probabilities of each residue in a given sequence, the probabilities of each residue estimated with the BLOSUM62 substitution matrix. That matrix is exported as a constant by the Information module as BLOSUM62_Pi.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"bar(1:20, [Pa BLOSUM62_Pi], lab = [\"Sequence\" \"BLOSUM62\"], alpha = 0.5)\npng(\"inf_plotfreq.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/#Low-count-corrections","page":"Information","title":"Low count corrections","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"Low number of observations can lead to sparse contingency tables, that lead to wrong probability estimations. It is shown in Buslje et al. [3] that low-count corrections, can lead to improvements in the contact prediction capabilities of the Mutual Information. The Information module has available two low-count corrections:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Additive Smoothing(Image: ); the constant value pseudocount described in Buslje et al. [3].\nBLOSUM62 based pseudo frequencies of residues pairs, similar to Altschul et al. [4].","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nfiltercolumns!(msa, columngapfraction(msa) .< 0.5) # delete columns with 50% gaps or more\n\ncolumn_i = msa[:, 1]\ncolumn_j = msa[:, 2]","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"If you have a preallocated ContingencyTable you can use frequencies! 
to fill it, which avoids creating a new table as frequencies does. However, you should note that frequencies! adds the new counts to the pre-existing values, so in this case, we want to start with a table initialized with zeros.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nconst alphabet = ReducedAlphabet(\"(AILMV)(NQST)(RHK)(DE)(FWY)CGP\")\n\nNij = ContingencyTable(Float64, Val{2}, alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies!(Nij, column_i, column_j)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In cases like the above, where there are few observations, it is possible to apply a constant pseudocount to the counting table. This module defines the type AdditiveSmoothing and the corresponding fill! and apply_pseudocount! methods to efficiently add or fill with a constant value each element of the table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"apply_pseudocount!(Nij, AdditiveSmoothing(1.0))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"[High level interface.] The frequencies and frequencies! functions have a pseudocounts keyword argument that can take an AdditiveSmoothing value to easily calculate occurrences with pseudocounts.
Also, their alphabet keyword argument can be used to change the default alphabet.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies(column_i, column_j, pseudocounts = AdditiveSmoothing(1.0), alphabet = alphabet)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"To use the conditional probability matrix BLOSUM62_Pij in the calculation of pseudo frequencies G for the pair of residues a, b, the real frequencies/probabilities p_ab should be calculated first. The observed probabilities are then used to estimate the pseudo frequencies.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"G_{ab} = \\sum_{cd} p_{cd} \\cdot BLOSUM62( a | c ) \\cdot BLOSUM62( b | d )","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Finally, the probability P of each pair of residues a, b between the columns i, j is the weighted mean between the observed frequency p and BLOSUM62-based pseudo frequency G, where α is generally the number of clusters or the number of sequences of the MSA and β is an empiric weight value. β was determined to be close to 8.512.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"P_{ab} = \\frac{\\alpha \\cdot p_{ab} + \\beta \\cdot G_{ab}}{\\alpha + \\beta}","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"This could be easily achieved using the pseudofrequencies keyword argument of the probabilities function.
That argument can take a BLOSUM_Pseudofrequencies object that is created with α and β as first and second argument, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Pij = probabilities(\n column_i,\n column_j,\n pseudofrequencies = BLOSUM_Pseudofrequencies(nsequences(msa), 8.512),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"You can also use apply_pseudofrequencies! on a previously filled probability contingency table, i.e. apply_pseudofrequencies!(Pij, BLOSUM_Pseudofrequencies(α, β))","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"warning: Warning\nBLOSUM_Pseudofrequencies can only be applied to normalized/probability tables with UngappedAlphabet.","category":"page"},{"location":"Information/#Correction-for-data-redundancy-in-a-MSA","page":"Information","title":"Correction for data redundancy in a MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"A simple way to reduce redundancy in a MSA without losing sequences is clusterization and sequence weighting. The weight of each sequence should be 1/N, where N is the number of sequences in its cluster. The Clusters type of the MSA module stores the weights. This vector of weights can be extracted (with the getweight function) and used by the frequencies and probabilities functions with the keyword argument weights.
Also it's possible to use the Clusters as second argument of the function frequencies!.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"clusters = hobohmI(msa, 62) # from MIToS.MSA","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"frequencies(msa[:, 1], msa[:, 2], weights = clusters)","category":"page"},{"location":"Information/#Estimating-information-measures-on-an-MSA","page":"Information","title":"Estimating information measures on an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"The Information module has a number of functions defined to calculate information measures from Frequencies and Probabilities:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"shannon_entropy : Shannon entropy (H)\nmarginal_entropy : Shannon entropy (H) of the marginals\nkullback_leibler : Kullback-Leibler (KL) divergence\nmutual_information : Mutual Information (MI)\nnormalized_mutual_information : Normalized Mutual Information (nMI) by Entropy\ngap_intersection_percentage\ngap_union_percentage","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Information measure functions take optionally the base as a keyword argument (default: ℯ). 
You can set base=2 to measure information in bits.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\nusing MIToS.MSA\n\nNi = frequencies(res\"PPCDPPPPPKDKKKKDDGPP\") # Ni has the count table of residues in this low complexity sequence\n\nH = shannon_entropy(Ni) # returns the Shannon entropy in nats (base e)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"H = shannon_entropy(Ni, base = 2) # returns the Shannon entropy in bits (base 2)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Information module defines special iteration functions to easily and efficiently compute a measure over a MSA. In particular, mapcolfreq! and mapseqfreq! map a function that takes a table of Frequencies or Probabilities. The table is filled in place with the counts or probabilities of each column or sequence of a MSA, respectively. mapcolpairfreq! and mapseqpairfreq! 
are similar, but they fill the table using pairs of columns or sequences, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"These functions take three positional arguments: the function f to be calculated, the msa, and the table of Frequencies or Probabilities.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"After that, these functions take some keyword arguments:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"weights (default: NoClustering()) : Weights to be used for table counting.\npseudocounts (default: NoPseudocount()) : Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()) : Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true) : Indicates if the function should be applied to pairs containing the same sequence or column.\ndiagonalvalue (default to zero) : The value that fills the diagonal elements of the table if usediagonal is false.","category":"page"},{"location":"Information/#Example:-Estimating-*H(X)*-and-*H(X,-Y)*-over-an-MSA","page":"Information","title":"Example: Estimating H(X) and H(X, Y) over an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"In this example, we are going to use mapcolfreq! and mapcolpairfreq!
to estimate the Shannon entropy (shannon_entropy) of MSA columns H(X) and the joint entropy H(X, Y) of column pairs, respectively.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: Entropy\"\nusing Plots\ngr()","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"We are going to count residues to estimate the Shannon entropy. The shannon_entropy estimation is performed over a reused Frequencies object. The result will be a vector containing the values estimated over each column without counting gaps (UngappedAlphabet).","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.Information\n\nHx = mapcolfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{1}, UngappedAlphabet())),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"If we want the joint entropy between column pairs, we need to use a bidimensional table of Frequencies and mapcolpairfreq!.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Hxy = mapcolpairfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"In the above examples, we indicate the type of each occurrence in the counting and the probability table to use. Also, it's possible for some measures, such as entropy and mutual information, to estimate the values only with the count table (without calculating the probability table).
Estimating measures only with a ResidueCount table, when this is possible, should be faster than using a probability table.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Time_Pab = map(1:100) do x\n time = @elapsed mapcolpairfreq!(\n shannon_entropy,\n msa,\n Probabilities(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n )\nend\n\nTime_Nab = map(1:100) do x\n time = @elapsed mapcolpairfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Float64, Val{2}, UngappedAlphabet())),\n )\nend\n\nusing Plots\ngr()\n\nhistogram(\n [Time_Pab Time_Nab],\n labels = [\"Using ResidueProbability\" \"Using ResidueCount\"],\n xlabel = \"Execution time [seconds]\",\n)\n\npng(\"inf_entropy.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/#Corrected-Mutual-Information","page":"Information","title":"Corrected Mutual Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"MIToS ships with two methods to easily calculate corrected mutual information. The first is the algorithm described in Buslje et al. [3]. This algorithm can be accessed through the buslje09 function and includes:","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"Low count correction using AdditiveSmoothing\nSequence weighting after a hobohmI clustering [2]\nAverage Product Correction (APC) proposed by Dunn et al. [5], through the APC! function that takes a MI matrix.\nZ score correction using the functions shuffle_msa! 
from the MSA module and zscore from the PairwiseListMatrices package.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"buslje09","category":"page"},{"location":"Information/#MIToS.Information.buslje09","page":"Information","title":"MIToS.Information.buslje09","text":"buslje09 takes an MSA and calculates a Z score and a corrected MI/MIp as described in Buslje et al. 2009.\n\nKeyword argument, type, default value and descriptions:\n\n - lambda Float64 0.05 Low count value\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for clustering\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 100 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n - alphabet ResidueAlphabet UngappedAlphabet() Residue alphabet to be used\n\nThis function returns:\n\n - Z score\n - MI or MIp\n\n\n\n\n\n","category":"function"},{"location":"Information/","page":"Information","title":"Information","text":"The second, implemented in the BLMI function, applies the same corrections as the above algorithm, but uses BLOSUM62 pseudo frequencies. This function is slower than buslje09 (at the same number of samples), but gives better performance (for structural contact prediction) when the MSA has fewer than 400 clusters after a Hobohm I clustering at 62% identity.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"BLMI","category":"page"},{"location":"Information/#MIToS.Information.BLMI","page":"Information","title":"MIToS.Information.BLMI","text":"BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described in Buslje et al. 
2009 but using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.\n\nKeyword argument, type, default value and descriptions:\n\n - beta Float64 8.512 β for BLOSUM62 pseudo frequencies\n - lambda Float64 0.0 Low count value\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 50 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n\nThis function returns:\n\n - Z score (ZBLMI)\n - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\n\n\n\n\n\n","category":"function"},{"location":"Information/#Example:-Estimating-corrected-MI-from-an-MSA","page":"Information","title":"Example: Estimating corrected MI from an MSA","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"@info \"Information: MI\"\nusing Plots\ngr()","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using MIToS.MSA\nusing MIToS.Information\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\nZMIp, MIp = buslje09(msa)\nZMIp","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZBLMIp, BLMIp = BLMI(msa)\nZBLMIp","category":"page"},{"location":"Information/#Visualize-Mutual-Information","page":"Information","title":"Visualize Mutual Information","text":"","category":"section"},{"location":"Information/","page":"Information","title":"Information","text":"You can use the functions of the Plots package to 
visualize the Mutual Information (MI) network between residues. As an example, we are going to visualize the MI between residues of the Pfam domain PF18883. The heatmap is the simplest way to visualize the values of the Mutual Information matrix.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using Plots\ngr()\n\nheatmap(ZMIp, yflip = true)\npng(\"inf_heatmap.png\") # hide\nnothing # hide","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"(Image: )","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZMIp is a Z score of the corrected MIp against its distribution on a random MSA (shuffling the residues in each sequence), so pairs with highest values are more likely to coevolve. Here, we are going to use the top 1% pairs of MSA columns.","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"using PairwiseListMatrices # to use getlist\nusing Statistics # to use quantile\n\nthreshold = quantile(getlist(ZMIp), 0.99)","category":"page"},{"location":"Information/","page":"Information","title":"Information","text":"ZMIp[ZMIp. !occursin(r\"_SULIY\", x), sequencenames(msa)) # an element of mask is true if \"_SULIY\" is not in the name","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"filtersequences!(msa, mask) # deletes all the sequences where mask is false\n\nsequencenames(msa)","category":"page"},{"location":"MSA/#Example:-Exporting-a-MSA-for-freecontact-(part-I)","page":"MSA","title":"Example: Exporting a MSA for freecontact (part I)","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The most simple input for the command line tool freecontact(Image: ) (if you don't want to set --mincontsep) is a Raw MSA file with a reference sequence without insertions or gaps. 
This is easy to get with MIToS using read_file (deletes the insert columns), setreference! (to choose a reference), adjustreference! (to delete columns with gaps in the reference) and write_file (to save it in Raw format) functions.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nfile_name = \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/test/data/PF09645_full.stockholm\"\nmsa = read_file(file_name, Stockholm)\nmsa_coverage = coverage(msa)\nmaxcoverage, maxindex = findmax(msa_coverage)\nsetreference!(msa, maxindex[1]) # the sequence with the highest coverage\nadjustreference!(msa)\nwrite_file(\"tofreecontact.msa\", msa, Raw)\nprint(read_file(\"tofreecontact.msa\", String)) # display output file","category":"page"},{"location":"MSA/#Column-and-sequence-mappings","page":"MSA","title":"Column and sequence mappings","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Inserts in a Stockholm MSA allow to access the full fragment of the aligned sequences. Using this, combined with the sequence names that contain coordinates used in Pfam, you can know what is the UniProt residue number of each residue in the MSA.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"\"PROT_SPECI/3-15 .....insertALIGNED\"\n# 3456789111111\n# 012345","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS read_file and parse_file functions delete the insert columns, but they do the mapping between each residue and its residue number before deleting insert columns when generatemapping is true. 
If you don't set useidcoordinates to true, the first residue will be numbered 1 instead of 3 in the previous example.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = parse_file(\n \"PROT_SPECI/3-15 .....insertALIGNED\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS also keeps the column number of the input MSA and its total number of columns. All this data is stored in the MSA annotations using the SeqMap, ColMap and NCol feature names.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"annotations(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To provide easy access to the mapping data, MIToS provides the getsequencemapping and getcolumnmapping functions.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getsequencemapping(msa, \"PROT_SPECI/3-15\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getcolumnmapping(msa)","category":"page"},{"location":"MSA/#Example:-Exporting-a-MSA-for-freecontact-(part-II)","page":"MSA","title":"Example: Exporting a MSA for freecontact (part II)","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If we want to use the --mincontsep argument of freecontact to calculate scores between distant residues, we will need to add a header to the MSA. This header should contain the residue number of the first residue of the sequence and the full fragment of that sequence (with the inserts). This data is used by FreeContact to calculate the residue number of each residue in the reference sequence. 
We are going to use MIToS mapping data to create this header, so we read the MSA with generatemapping and useidcoordinates set to true.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Here, we are going to choose the sequence with more coverage of the MSA as our reference sequence.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa_coverage = coverage(msa)\nmaxcoverage, maxindex = findmax(msa_coverage)\nsetreference!(msa, maxindex[1])\nadjustreference!(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS deletes the residues in insert columns, so we are going to use the sequence mapping to generate the whole fragment of the reference sequence (filling the missing regions with 'x').","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"seqmap = getsequencemapping(msa, 1) # seqmap will be a vector with the residue numbers of the first sequence (reference)\n\nseq = collect(stringsequence(msa, 1)) # seq will be a Vector of Chars with the reference sequence\n\nsequence = map(seqmap[1]:seqmap[end]) do seqpos # for each position in the whole fragment\n if seqpos in seqmap # if that position is in the MSA\n popfirst!(seq) # the residue is taken from seq\n else # otherwise\n 'x' # 'x' is included\n end\nend\n\nsequence = join(sequence) # join the Chars on the Vector to create a string","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Once we have the whole fragment of the sequence, we create the file and write the header in the required format (as in the man page of 
freecontact).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"open(\"tofreecontact.msa\", \"w\") do fh\n println(fh, \"# querystart=\", seqmap[1])\n println(fh, \"# query=\", sequence)\nend","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As its last (optional) argument, write_file takes the mode in which the file is opened. We use \"a\" here to append the MSA to the header.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"write_file(\"tofreecontact.msa\", msa, Raw, \"a\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"print(join(first(readlines(\"tofreecontact.msa\"), 5), '\\n')) # It displays the first five lines","category":"page"},{"location":"MSA/#Get-sequences-from-a-MSA","page":"MSA","title":"Get sequences from a MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"It's possible to index the MSA like any other matrix to get an aligned sequence. This will return an Array of Residues without annotations but keeping names/identifiers.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/test/data/PF09645_full.stockholm\",\n Stockholm,\n generatemapping = true,\n useidcoordinates = true,\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa[2, :] # second sequence of the MSA, it keeps column names","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa[2:2, :] # Using the range 2:2 to select the second sequence, keeping also the sequence name","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If you want to obtain the aligned sequence with its name and annotations (and therefore sequence and column mappings), you should use the function getsequence. 
This function returns an AlignedSequence with the sequence name from a MultipleSequenceAlignment or an AnnotatedAlignedSequence, that also contains annotations, from an AnnotatedMultipleSequenceAlignment.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"secondsequence = getsequence(msa, 2)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"annotations(secondsequence)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Use stringsequence if you want to get the sequence as a string.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"stringsequence(msa, 2)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Because matrices are stored columnwise in Julia, you will find useful the getresiduesequences function when you need to heavily operate over sequences.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"getresiduesequences(msa)","category":"page"},{"location":"MSA/#Describing-your-MSA","page":"MSA","title":"Describing your MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The MSA module has a number of functions to gain insight about your MSA. 
Using MIToS.MSA, one can easily ask for...","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The number of columns and sequences with the ncolumns and nsequences functions.\nThe fraction of columns with residues (coverage) for each sequence making use of the coverage method.\nThe fraction or percentage of gaps/residues using with the functions gapfraction, residuefraction and columngapfraction.\nThe percentage of identity (PID) between each sequence of the MSA or its mean value with percentidentity and meanpercentidentity.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The percentage identity between two aligned sequences is a common measure of sequence similarity and is used by the hobohmI method to estimate and reduce MSA redundancy. MIToS functions to calculate percent identity don't align the sequences, they need already aligned sequences. Full gaps columns don't count to the alignment length.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = permutedims(hcat(\n res\"--GGG-\", # res\"...\" uses the @res_str macro to create a (column) Vector{Residue}\n res\"---GGG\",\n), (2, 1))\n# identities 000110 sum 2\n# aligned residues 001111 sum 4","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"percentidentity(msa[1, :], msa[2, :]) # 2 / 4","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To quickly calculate if the percentage of identity is greater than a determined value, use that threshold as third argument. 
percentidentity(seqa, seqb, pid) is much faster than percentidentity(seqa, seqb) >= pid.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"percentidentity(msa[1, :], msa[2, :], 62) # 50% >= 62%","category":"page"},{"location":"MSA/#Example:-Plotting-gap-percentage-per-column-and-coverage-per-sequence","page":"MSA","title":"Example: Plotting gap percentage per column and coverage per sequence","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The gapfraction and coverage functions return a vector of numbers between 0.0 and 1.0 (fraction of...). Sometimes it's useful to plot this data to quickly understand the MSA structure. In this example, we are going to use the Plots(Image: ) package for plotting, with the GR(Image: ) backend, but you are free to use any of the Julia plotting libraries.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: Plots\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nusing Plots\n\ngr(size = (600, 300))\n\nplot(\n # x is a range from 1 to the number of columns\n 1:ncolumns(msa),\n # y is a Vector{Float64} with the percentage of gaps of each column\n vec(columngapfraction(msa)) .* 100.0,\n linetype = :line,\n ylabel = \"gaps [%]\",\n xlabel = \"columns\",\n legend = false,\n)\n\npng(\"msa_gaps.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(\n # x is a range from 1 to the number of sequences\n 1:nsequences(msa),\n # y is a Vector{Float64} with the coverage of each sequence\n vec(coverage(msa)) .* 100,\n linetype = :line,\n ylabel = \"coverage [%]\",\n xlabel = \"sequences\",\n 
legend = false,\n)\n\npng(\"msa_coverage.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(msa)\npng(\"msa_msa.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Example:-Filter-sequences-per-coverage-and-columns-per-gap-fraction","page":"MSA","title":"Example: Filter sequences per coverage and columns per gap fraction","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Taking advantage of the filter...! functions and the coverage and columngapfraction functions, it's possible to delete short sequences or columns with a lot of gaps.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"println(\"\\tsequences\\tcolumns\")\nprintln(\"Before:\\t\", nsequences(msa), \"\\t\\t\", ncolumns(msa))\n# delete sequences with less than 90% coverage of the MSA length:\nfiltersequences!(msa, coverage(msa) .>= 0.9)\n# delete columns with more than 10% of gaps:\nfiltercolumns!(msa, columngapfraction(msa) .<= 0.1)\nprintln(\"After:\\t\", nsequences(msa), \"\\t\\t\", ncolumns(msa))","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(\n vec(columngapfraction(msa)),\n # Using vec() to get a Vector{Float64} with the fraction of gaps of each column\n xlabel = \"gap fraction in [0,1]\",\n bins = 20,\n legend = false,\n)\npng(\"msa_hist_gaps.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(\n vec(coverage(msa) .* 100.0), # Column with the coverage of each sequence\n xlabel = \"coverage [%]\",\n legend = false,\n)\npng(\"msa_hist_coverage.png\") # hide\nnothing # 
hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Example:-Plotting-the-percentage-of-identity-between-sequences","page":"MSA","title":"Example: Plotting the percentage of identity between sequences","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The distribution of the percentage of identity between every pair of sequences in an MSA, gives an idea of the MSA diversity. In this example, we are using percentidentity over an MSA to get those identity values.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\npid = percentidentity(msa)\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MIToS stores the matrix of percentage of identity between the aligned sequences as a PairwiseListMatrix from the PairwiseListMatrices(Image: ) package. This matrix type saves RAM, allowing the storage of big matrices. 
In this example, we use the to_table function of PairwiseListMatrices to convert the matrix into a table with indices.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using PairwiseListMatrices\n\npidtable = to_table(pid, diagonal = false)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The function quantile gives a quick idea of the percentage identity distribution of the MSA.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Statistics\n\nquantile(convert(Vector{Float64}, pidtable[:, 3]), [0.00, 0.25, 0.50, 0.75, 1.00])","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The function meanpercentidentity gives the mean value of the percent identity distribution for MSA with less than 300 sequences, or a quick estimate (mean PID in a random sample of sequence pairs) otherwise unless you set exact to true.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"meanpercentidentity(msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"One can easily plot that matrix and its distribution using the heatmap and histogram functions of the Plots(Image: ) package.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: PID\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Plots\ngr()\nheatmap(convert(Matrix, pid), yflip = true, ratio = :equal)\npng(\"msa_heatmap_pid.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"histogram(pidtable[:, 3], xlabel = \"Percentage of identity\", legend = false)\npng(\"msa_hist_pid.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: 
)","category":"page"},{"location":"MSA/#Sequence-clustering","page":"MSA","title":"Sequence clustering","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"The MSA module allows you to cluster sequences in an MSA. The hobohmI function takes as input an MSA followed by an identity threshold value, and returns a Clusters object with the result of a Hobohm I sequence clustering [2]. The Hobohm I algorithm will add a sequence to an existing cluster if the percentage of identity is equal to or greater than the threshold. Clusters is a sub-type of ClusteringResult from the Clustering.jl(Image: ) package. One advantage of using a sub-type of ClusteringResult is that you are able to use any method defined in Clustering.jl, such as varinfo (Variation of Information). Also, you can use any clustering algorithm included in Clustering.jl, and convert its result to a Clusters object to use it with MIToS. MSA defines the functions nclusters to get the resulting number of clusters, counts to get the number of sequences in each cluster and assignments to get the cluster number of each sequence. The most important method is getweight, which returns the weight of each sequence. This method is used in the Information module of MIToS to reduce redundancy.","category":"page"},{"location":"MSA/#Example:-Reducing-redundancy-of-a-MSA","page":"MSA","title":"Example: Reducing redundancy of a MSA","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"MSAs can suffer from an unnatural sequence redundancy and a high number of protein fragments. In this example, we are using a sequence clustering to make a non-redundant set of representative sequences. 
We are going to use the function hobohmI to perform the clustering with the Hobohm I algorithm at 62% identity.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"@info \"MSA: Clusters\"\nusing Plots\nusing StatsPlots\nusing DataFrames\ngr() # Hide possible warnings","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nusing Clustering # to use the nclusters and assignments functions\n\nmsa = read_file(\n \"https://raw.githubusercontent.com/diegozea/MIToS.jl/master/docs/data/PF18883.stockholm.gz\",\n Stockholm,\n)\n\nprintln(\"This MSA has \", nsequences(msa), \" sequences...\")","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"clusters = hobohmI(msa, 62)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"println(\n \"...but has only \",\n nclusters(clusters),\n \" sequence clusters after a clustering at 62% identity.\",\n)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using Plots\ngr()\n\nplot(msa)\npng(\"msa_clusters_i.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We are going to use the DataFrames(Image: ) package to easily select the sequence with the highest coverage of each cluster.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using DataFrames\n\ndf = DataFrame(\n seqnum = 1:nsequences(msa),\n seqname = sequencenames(msa),\n cluster = assignments(clusters), # the cluster number/index of each sequence\n coverage = vec(coverage(msa)),\n)\n\nfirst(df, 5)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"It is possible to use this DataFrame and Plots to plot the sequence coverage of the MSA and also an histogram of the number of sequences in each cluster:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using 
StatsPlots # Plotting DataFrames\nh = @df df histogram(:cluster, ylabel = \"nseq\")\np = @df df plot(:cluster, :coverage, linetype = :scatter)\nplot(p, h, nc = 1, xlim = (0, nclusters(clusters) + 1), legend = false)\npng(\"msa_clusters_ii.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We use the Split-Apply-Combine strategy, through the groupby and combine functions of the DataFrames package, to select the sequence with the highest coverage for each cluster.","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"grouped_df = groupby(df, :cluster)\n\nmaxcoverage = combine(grouped_df) do cl\n row_index = findmax(cl.coverage)[2]\n cl[row_index, [:seqnum, :seqname, :coverage]]\nend\n\nfirst(maxcoverage, 5)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"p = @df maxcoverage plot(:cluster, :coverage, linetype = :scatter)\nh = @df maxcoverage histogram(:cluster, ylabel = \"nseq\")\nplot(p, h, nc = 1, xlim = (0, nclusters(clusters) + 1), legend = false)\npng(\"msa_clusters_iii.png\") # hide\nnothing # hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We can easily generate a mask using a list comprehension, to select only the representative sequences of the MSA (deleting the rest of the sequences with filtersequences!).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"cluster_references = Bool[seqnum in maxcoverage.seqnum for seqnum = 1:nsequences(msa)]","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"filtersequences!(msa, cluster_references)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"plot(msa)\npng(\"msa_clusters_iv.png\") # hide\nnothing # 
hide","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"(Image: )","category":"page"},{"location":"MSA/#Concatenating-MSAs","page":"MSA","title":"Concatenating MSAs","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Concatenating multiple sequence alignments can be helpful in various bioinformatics applications. It allows researchers to combine the alignments of different sequences or regions into a single MSA for further analysis. Examples of this maneuver are concatenating two protein sequences from the same organism to estimate coevolution among those proteins or to model the protein-protein interaction using tools such as AlphaFold.","category":"page"},{"location":"MSA/#Horizontal-and-Vertical-Concatenation","page":"MSA","title":"Horizontal and Vertical Concatenation","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"We can concatenate two MSAs as matrices using Julia's hcat and vcat functions. However, MIToS defines special methods for these functions on MSA objects to deal with sequence and column names and annotations. To use hcat, we only need the MSA having the same number of sequences. The hcat function will concatenate the first sequence of the first MSA with the first sequence of the second MSA, and so on. 
For example, let's define two small MSAs msa_a and msa_b, and concatenate them horizontally:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R' 'N'\n 'D' 'C' 'Q'\n]);\nrename_sequences!(msa_a, [\"SEQ1_A\", \"SEQ2_A\"])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'Q'\n 'E' 'G'\n]);\nrename_sequences!(msa_b, [\"SEQ1_B\", \"SEQ2_B\"])\nconcatenated_msa = hcat(msa_a, msa_b)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As you might have noticed, the hcat function preserves the sequence names by concatenating them using _&_ as a separator. So, the first sequence of the concatenated MSA is SEQ1_A_&_SEQ1_B. Also, the column names have changed in the concatenated MSA. For example, the first column of msa_a is now the first column of concatenated_msa, but its name changed from 1 to 1_1. The hcat function renames the columns so that the first number, the one before the underscore, indicates the index of the sub-MSA. The first sub-MSA in the concatenated MSA is 1, the second sub-MSA is 2, and so on. This allows you to track the origin of each column in the concatenated MSA. You can access a vector of those indices using the gethcatmapping function:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"gethcatmapping(concatenated_msa)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"If we perform multiple concatenations—i.e., if we call hcat on an MSA output of another call to hcat—the hcat function will remember the sub-MSA boundaries to continue the numeration accordingly. 
For example, let's create and add a third MSA:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"msa_c = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'H'\n 'A' 'H'\n]);\nrename_sequences!(msa_c, [\"SEQ1_C\", \"SEQ2_C\"])\nhcat(concatenated_msa, msa_c)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As you can see, the hcat function detects the previous concatenation and continues the indexing from the last MSA. So that column 1 of msa_c is now 3_1 in the concatenated MSA. The hcat function can take more than two MSAs as arguments. For example, you can get the same result as above by calling hcat(msa_a, msa_b, msa_c).","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"To concatenate MSAs vertically, you can use the vcat function. The only requirement is that the MSAs have the same number of columns. For example, let's define two small MSAs. The first column of msa_a will be concatenated with the first column of msa_b, and so on:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R'\n 'D' 'C'\n 'E' 'G'\n])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'Q'\n 'D' 'R'\n])\nconcatenated_msa = vcat(msa_a, msa_b)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"In this case, vcat adds the MSA index prefix to the sequence names. So, the sequence 1 of msa_a is now 1_1 in the concatenated MSA. The vcat function, similar to hcat, can take more than two MSAs as arguments in case you need to concatenate multiple alignments vertically.","category":"page"},{"location":"MSA/#Joining-MSAs","page":"MSA","title":"Joining MSAs","text":"","category":"section"},{"location":"MSA/","page":"MSA","title":"MSA","text":"Sometimes, you may need to join or merge two MSAs, having different number of sequences or columns. For such cases, MIToS provides the join_msas function. 
This function allows you to join two MSAs based on specified matching positions or names. It supports different types of joins: inner, outer, left, and right. You can indicate the positions or names to match using an iterable of pairs or separate lists of positions or names. For example, using a vector of Pair objects, you can identify which positions on the first MSA (the first element of the pair) should match with which positions on the second MSA (the second element of the pair). Let's see that in one fictional example:","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"using MIToS.MSA\nmsa_a = AnnotatedMultipleSequenceAlignment(Residue[\n 'A' 'R' 'D'\n 'G' 'K' 'E'\n 'G' 'R' 'D'\n]);\nrename_sequences!(msa_a, [\"aa_HUMAN\", \"bb_MOUSE\", \"cc_YEAST\"])\nmsa_b = AnnotatedMultipleSequenceAlignment(Residue[\n 'N' 'A'\n 'E' 'G'\n 'E' 'A'\n]);\nrename_sequences!(msa_b, [\"AA_HUMAN\", \"BB_MOUSE\", \"CC_SHEEP\"])\npairing = [\"aa_HUMAN\" => \"AA_HUMAN\", \"bb_MOUSE\" => \"BB_MOUSE\"]\njoin_msas(msa_a, msa_b, pairing)","category":"page"},{"location":"MSA/","page":"MSA","title":"MSA","text":"As we can see, the join_msas function has matched the sequences on both MSAs based on the specified pairing—in this example, we create a dictionary to pair the sequences from the same species. The join_msas have two important keyword arguments: kind and axis. By default, the function performs an outer join (kind = :outer) and matches the sequences (axis = 1). You can change these arguments to perform other kinds of joins or to match the columns. 
Since we performed an outer join, the resulting MSA contains all sequences from both input MSAs, and join_msas have added gaps where the sequences do not match.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"EditURL = \"cookbook/03_RMSF.jl\"","category":"page"},{"location":"03_RMSF/#Root-Mean-Squared-Fluctuation-(RMSF)","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"md # (Image: ) md # (Image: )","category":"page"},{"location":"03_RMSF/#Problem-description","page":"Root Mean Squared Fluctuation (RMSF)","title":"Problem description","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The Root Mean Squared Fluctuation (RMSF) is a common way to measure residue flexibility in a structural ensemble. It is a measure of how far is the residue moving from its average position in the group of structures. Usually, we represent a residue position with the spatial coordinates of its alpha carbon.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The protein structures should be previously superimposed to calculate the RMSF, for example, by using the superimpose function of the PDB module of MIToS. In this example, we are going to measure the RMSF of each residue from an NMR ensemble using the rmsf function.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"The structure superimposition could be the most complicated step of the process, depending on the input data. 
In particular, structures that come from different PDB entries or from homologous proteins can require the use of external programs tailored for this task, such as MAMMOTH-mult or MUSTANG, among others.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"In this case, we are going to use an NMR ensemble. Therefore, we are not going to need to superimpose the structures as NMR models have the same protein sequence and are, usually, well-aligned.","category":"page"},{"location":"03_RMSF/#MIToS-solution","page":"Root Mean Squared Fluctuation (RMSF)","title":"MIToS solution","text":"","category":"section"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"import MIToS\nusing MIToS.PDB\nusing Plots","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"Let's read the NMR ensemble:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"pdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"1AS5.pdb\")\npdb_res = read_file(pdb_file, PDBFile, occupancyfilter = true)\nnothing # hide","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"We set occupancyfilter to true to ensure that we have one single set of coordinates for each atom. That filter isn't essential for NMR structures, but it can avoid multiple alpha carbons in crystallographic structures with disordered atoms. 
We can get an idea of the alpha carbon positions by plotting these residues:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"scatter(pdb_res, legend = false)","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"As we saw in the previous plot, the structure doesn't need to be superimposed. Now, we are going to separate each model into different vectors, storing each vector into a Dict:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"models = Dict{String,Vector{PDBResidue}}()\nfor res in pdb_res\n push!(get!(models, res.id.model, []), res)\nend","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"Then, we simply need to collect all the PDB models in the values of the Dict, to get the vector of PDBResidues vectors required to calculate the RMSF.","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"pdb_models = collect(values(models))\nnothing # hide","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"And, finally, call the rmsf function on the list of structures. It is important that all the vectors has the same number of PDBResidues. 
This function assumes that the nth element of each vector corresponds to the same residue:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"RMSF = rmsf(pdb_models)","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"This returns the vector of RMSF values for each residue, calculated using the coordinates of the alpha carbons. You can plot this vector to get an idea of which are the most flexible positions in your structure:","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"plot(RMSF, legend = false, xlab = \"Residue\", ylab = \"RMSF [Å]\")","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"","category":"page"},{"location":"03_RMSF/","page":"Root Mean Squared Fluctuation (RMSF)","title":"Root Mean Squared Fluctuation (RMSF)","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"@info \"Utils API docs\"","category":"page"},{"location":"Utils_API/#API-Utils","page":"Utils","title":"Utils","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"MIToS.Utils","category":"page"},{"location":"Utils_API/#MIToS.Utils","page":"Utils","title":"MIToS.Utils","text":"The Utils module has common utility functions and types used in other modules.\n\nusing MIToS.Utils\n\n\n\n\n\n","category":"module"},{"location":"Utils_API/#Contents","page":"Utils","title":"Contents","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Pages = [\"Utils_API.md\"]\nDepth = 
2","category":"page"},{"location":"Utils_API/#Types","page":"Utils","title":"Types","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Utils_API/#MIToS.Utils.All","page":"Utils","title":"MIToS.Utils.All","text":"All is used instead of MIToS 1.0 \"all\" or \"*\", because it's possible to dispatch on it.\n\n\n\n\n\n","category":"type"},{"location":"Utils_API/#MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.FileFormat","text":"FileFormat is used to define special parse_file (called by read_file) and print_file (called by write_file) methods for different file formats.\n\n\n\n\n\n","category":"type"},{"location":"Utils_API/#Constants","page":"Utils","title":"Constants","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Utils_API/#MIToS.Utils.THREE2ONE","page":"Utils","title":"MIToS.Utils.THREE2ONE","text":"THREE2ONE is a dictionary that maps three-letter amino acid residue codes (String) to their corresponding one-letter codes (Char). 
The dictionary is generated by parsing components.cif file from the Protein Data Bank.\n\njulia> using MIToS.Utils\n\njulia> one_letter_code = THREE2ONE[\"ALA\"]\n'A': ASCII/Unicode U+0041 (category Lu: Letter, uppercase)\n\n\n\n\n\n","category":"constant"},{"location":"Utils_API/#Macros","page":"Utils","title":"Macros","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Utils_API/#Methods-and-functions","page":"Utils","title":"Methods and functions","text":"","category":"section"},{"location":"Utils_API/","page":"Utils","title":"Utils","text":"Modules = [MIToS.Utils]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"Utils_API/#MIToS.Utils.check_file-Tuple{Any}","page":"Utils","title":"MIToS.Utils.check_file","text":"Returns the filename. Throws an ErrorException if the file doesn't exist, or a warning if the file is empty.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.check_pdbcode-Tuple{String}","page":"Utils","title":"MIToS.Utils.check_pdbcode","text":"It checks if a PDB code has the correct format.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.download_file-Tuple{AbstractString, AbstractString}","page":"Utils","title":"MIToS.Utils.download_file","text":"download_file uses Downloads.jl to download files from the web. It takes the file url as first argument and, optionally, a path to save it. 
Keyword arguments are directly passed to Downloads.download.\n\njulia> using MIToS.Utils\n\njulia> download_file(\"https://www.uniprot.org/uniprot/P69905.fasta\", \"seq.fasta\")\n\"seq.fasta\"\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.get_n_words-Tuple{String, Int64}","page":"Utils","title":"MIToS.Utils.get_n_words","text":"get_n_words{T <: Union{ASCIIString, UTF8String}}(line::T, n::Int) It returns a Vector{T} with the first n (possible) words/fields (delimited by space or tab). If there are more than n words, the last word returned contains the final words and the delimiters. The length of the returned vector is n or less (if the number of words is less than n). This is used for parsing the Stockholm format.\n\njulia> using MIToS.Utils\n\njulia> get_n_words(\"#=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH\", 3)\n3-element Vector{String}:\n \"#=GR\"\n \"O31698/18-71\"\n \"SS CCCHHHHHHHHHHHHHHHEEEEEEEEEEEEEEEEHHH\"\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.getarray-Tuple{NamedArrays.NamedArray}","page":"Utils","title":"MIToS.Utils.getarray","text":"Getter for the array field of NamedArrays\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.hascoordinates-Tuple{Any}","page":"Utils","title":"MIToS.Utils.hascoordinates","text":"hascoordinates(id) It returns true if id/sequence name has the format: UniProt/start-end (i.e. 
O83071/192-246)\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.isnotemptyfile-Tuple{Any}","page":"Utils","title":"MIToS.Utils.isnotemptyfile","text":"Returns true if the file exists and isn't empty.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.lineiterator-Tuple{String}","page":"Utils","title":"MIToS.Utils.lineiterator","text":"Create an iterable object that will yield each line from a stream or string.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.list2matrix-Union{Tuple{T}, Tuple{AbstractVector{T}, Int64}} where T","page":"Utils","title":"MIToS.Utils.list2matrix","text":"Returns a square symmetric matrix from the vector vec. side is the number of rows/columns. The diagonal is not included by default, set to true if there are diagonal elements in the list.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.matrix2list-Union{Tuple{AbstractMatrix{T}}, Tuple{T}} where T","page":"Utils","title":"MIToS.Utils.matrix2list","text":"Returns a vector with the part (\"upper\" or \"lower\") of the square matrix mat. The diagonal is not included by default.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.read_file-Union{Tuple{T}, Tuple{AbstractString, Type{T}, Vararg{Any}}} where T<:MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.read_file","text":"read_file(pathname, FileFormat [, Type [, … ] ] ) -> Type\n\nThis function opens a file in the pathname and calls parse_file(io, ...) for the given FileFormat and Type on it. If the pathname is an HTTP or FTP URL, the file is downloaded with download in a temporal file. Gzipped files should end on .gz.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.select_element-Union{Tuple{Vector{T}}, Tuple{T}, Tuple{Vector{T}, String}} where T","page":"Utils","title":"MIToS.Utils.select_element","text":"Selects the first element of the vector. This is useful for unpacking one element vectors. 
Throws a warning if there are more elements. element_name is element by default, but the name can be changed using the second argument.\n\n\n\n\n\n","category":"method"},{"location":"Utils_API/#MIToS.Utils.write_file-Union{Tuple{T}, Tuple{AbstractString, Any, Type{T}}, Tuple{AbstractString, Any, Type{T}, String}} where T<:MIToS.Utils.FileFormat","page":"Utils","title":"MIToS.Utils.write_file","text":"write_file{T<:FileFormat}(filename::AbstractString, object, format::Type{T}, mode::ASCIIString=\"w\")\n\nThis function opens a file with filename and mode (default: \"w\") and writes (print_file) the object with the given format. Gzipped files should end on .gz.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"@info \"SIFTS docs\"","category":"page"},{"location":"SIFTS/#Module-SIFTS","page":"SIFTS","title":"SIFTS","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The SIFTS module of MIToS allows to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes easy to assign PDB residues to UniProt/Pfam positions. 
Given the fact that pairwise alignments can lead to misleading association between residues in both sequences, SIFTS offers more reliable association between sequence and structure residue numbers.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS # to load the SIFTS module","category":"page"},{"location":"SIFTS/#Features","page":"SIFTS","title":"Features","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Download and parse SIFTS XML files\nStore residue-level mapping in Julia\nEasy generation of Dicts between residue numbers","category":"page"},{"location":"SIFTS/#Contents","page":"SIFTS","title":"Contents","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Pages = [\"SIFTS.md\"]\nDepth = 4","category":"page"},{"location":"SIFTS/#Simplest-residue-level-mapping","page":"SIFTS","title":"Simplest residue-level mapping","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"This module exports the function siftsmapping to generate a Dict between residue numbers. This function takes 5 positional arguments.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The name of the SIFTS XML file to parse,\nthe source database,\nthe source protein/structure identifier,\nthe destination database and,\nthe destination protein/structure identifier. Optionally it’s possible to indicate a particular PDB chain and whether missings will be used.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Databases should be indicated using an available sub-type of DataBase. The key and value types will depend on the residue number type in that database.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Type db... 
Database Residue number type\ndbPDBe PDBe (Protein Data Bank in Europe) Int\ndbInterPro InterPro String\ndbUniProt UniProt Int\ndbPfam Pfam (Protein families database) Int\ndbNCBI NCBI (National Center for Biotechnology Information) Int\ndbPDB PDB (Protein Data Bank) String\ndbCATH CATH String\ndbSCOP SCOP (Structural Classification of Proteins) String\ndbEnsembl Ensembl String","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"To download the XML SIFTS file of a determined PDB use the downloadsifts function.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\n\nimport MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsfile = downloadsifts(\"1IVO\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"The following example, shows the residue number mapping between Pfam and PDB. Pfam uses UniProt coordinates and PDB uses their own residue numbers with insertion codes. 
Note that the siftsmapping function is case sensitive, and that SIFTS stores PDB identifiers using lowercase characters.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsmap = siftsmapping(\n siftsfile,\n dbPfam,\n \"PF00757\",\n dbPDB,\n \"1ivo\", # SIFTS stores PDB identifiers in lowercase\n chain = \"A\", # In this example we are only using the chain A of the PDB\n missings = false,\n) # Residues without coordinates aren't used in the mapping","category":"page"},{"location":"SIFTS/#Storing-residue-level-mapping","page":"SIFTS","title":"Storing residue-level mapping","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"If you need more than the residue number mapping between two databases, you could access all the residue-level cross references using the function read_file in the SIFTSXMLFile.Format file. The parse_file function (and therefore the read_file function) for the SIFTSXML format, also takes the keyword arguments chain and missings. The read_file/parse_file function returns a Vector of SIFTSResidues objects that stores the cross references between residues in each database.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsresidues = read_file(siftsfile, SIFTSXML, chain=\"A\", missings=false) # Array{SIFTSResidue,1}\nresidue_data = siftsresidues[301];","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"You are free to access the SIFTSResidue fields in order to get the desired information. SIFTSResidue objects contain db... objects (sub-types of DataBase), with the cross referenced information. You should note that, except for the PDBe and InterPro fields, the field values can be missing. The ismissing function is helpful to know if there is a db... object. 
For example, getting the UniProt residue name (one letter code of the amino acid) would be:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"ismissing(residue_data.UniProt) ? \"\" : residue_data.UniProt.name","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"That line of code returns an empty string if the UniProt field is missing. Otherwise, it returns a string with the name of the residue in UniProt. Because that way of accessing values in a SIFTSResidue is too verbose, MIToS defines a more complex signature for get. Using the MIToS get method, the previous line of code becomes:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# SIFTSResidue database field default\nget(residue_data, dbUniProt, :name, \"\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"There is no need to use the full signature. Other signatures are possible depending on the value you want to access. In particular, a missing object is returned if a default value is not given at the end of the signature and the value to access is missing:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"import MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\")\n\nusing MIToS.SIFTS\nresidue_data = read_file(siftsfile, SIFTSXML)[301]; # hide","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"get(residue_data, dbUniProt) # get takes the database type (`db...`)\nget(residue_data, dbUniProt, :name) # and can also take a field name (Symbol)","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"But you don't need the get function to access the three letter code of the residue in PDBe because the PDBe field can not be 
missing.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"residue_data.PDBe.name","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"SIFTSResidue also store information about if that residue is missing (i.e. not resolved) in the PDB structure and the information about the secondary structure (sscode and ssname):","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"residue_data.missing\nresidue_data.sscode\nresidue_data.ssname","category":"page"},{"location":"SIFTS/#Accessing-residue-level-cross-references","page":"SIFTS","title":"Accessing residue-level cross references","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"You can ask for particular values in a single SIFTSResidue using the get function.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nresidue_data = read_file(siftsfile, SIFTSXML)[301]\n# Is the UniProt residue name in the list of basic amino acids [\"H\", \"K\", \"R\"]?\nget(residue_data, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Use higher order functions and lambda expressions (anonymous functions) or list comprehension to easily ask for information on the Vector{SIFTSResidue}. 
You can use get with the previous signature or simple direct field access and ismissing.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# Captures PDB residue numbers if the Pfam id is \"PF00757\"\nresnums = [\n res.PDB.number for res in siftsresidues if\n !ismissing(res.PDB) && get(res, dbPfam, :id, \"\") == \"PF00757\"\n]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Useful higher order functions are:","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"findall","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# Which of the residues have UniProt residue names in the list [\"H\", \"K\", \"R\"]? (basic residues)\nindexes = findall(res -> get(res, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"], siftsresidues)","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"map","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"map(i -> siftsresidues[i].UniProt, indexes) # UniProt data of the basic residues","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"filter","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"# SIFTSResidues with UniProt names in [\"H\", \"K\", \"R\"]\nbasicresidues =\n filter(res -> get(res, dbUniProt, :name, \"\") in [\"H\", \"K\", \"R\"], siftsresidues)\n\nbasicresidues[1].UniProt # UniProt data of the first basic residue","category":"page"},{"location":"SIFTS/#Example:-Which-residues-are-missing-in-the-PDB-structure","page":"SIFTS","title":"Example: Which residues are missing in the PDB structure","text":"","category":"section"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"Given that SIFTSResidue objects store a missing residue flag, it’s easy to get a vector where there is a true value if the residue is missing in the 
structure.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"import MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1ivo.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nsifts_1ivo = read_file(siftsfile, SIFTSXML, chain = \"A\"); # SIFTSResidues of the 1IVO chain A\n[res.missing for res in sifts_1ivo]","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"However, if you need to filter using other conditions, you’ll find useful the get function. In this example, we are going to ask for the UniProt id (to avoid problems with fragments, tags or chimeric/fusion proteins). We are also using get to select an specific PDB chain.","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\n\nimport MIToS # to use pathof(MIToS)\nsiftsfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"1jqz.xml.gz\");","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"siftsfile = downloadsifts(\"1JQZ\")","category":"page"},{"location":"SIFTS/","page":"SIFTS","title":"SIFTS","text":"using MIToS.SIFTS\nsifts_1jqz = read_file(siftsfile, SIFTSXML); # It has an amino terminal his tag\nmissings = [\n (\n (get(res, dbUniProt, :id, \"\") == \"P05230\") &\n (get(res, dbPDB, :chain, \"\") == \"A\") &\n res.missing\n ) for res in sifts_1jqz\n];\nprintln(\n \"There are only \",\n sum(missings),\n \" missing residues in the chain A, associated to UniProt P05230\",\n)\nprintln(\n \"But there are \",\n sum([res.missing for res in sifts_1jqz]),\n \" missing residues in the PDB file.\",\n)","category":"page"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"@info \"Pfam API 
docs\"","category":"page"},{"location":"Pfam_API/#Pfam","page":"Pfam","title":"Pfam","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"MIToS.Pfam","category":"page"},{"location":"Pfam_API/#MIToS.Pfam","page":"Pfam","title":"MIToS.Pfam","text":"The Pfam module, defines functions to measure the protein contact prediction performance of information measure between column pairs from a Pfam MSA.\n\nFeatures\n\nRead and download Pfam MSAs\nObtain PDB information from alignment annotations\nMap between sequence/alignment residues/columns and PDB structures\nMeasure of AUC (ROC curve) for contact prediction of MI scores\n\nusing MIToS.Pfam\n\n\n\n\n\n","category":"module"},{"location":"Pfam_API/#Contents","page":"Pfam","title":"Contents","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Pages = [\"Pfam_API.md\"]\nDepth = 2","category":"page"},{"location":"Pfam_API/#Types","page":"Pfam","title":"Types","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Pfam_API/#Constants","page":"Pfam","title":"Constants","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Pfam_API/#Macros","page":"Pfam","title":"Macros","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Pfam_API/#Methods-and-functions","page":"Pfam","title":"Methods and functions","text":"","category":"section"},{"location":"Pfam_API/","page":"Pfam","title":"Pfam","text":"Modules = [MIToS.Pfam]\nPrivate = false\nOrder = 
[:function]","category":"page"},{"location":"Pfam_API/#MIToS.Pfam.downloadpfam-Tuple{String}","page":"Pfam","title":"MIToS.Pfam.downloadpfam","text":"It downloads a gzipped Stockholm alignment from InterPro for the Pfam family with the given pfamcode.\n\nBy default, it downloads the full Pfam alignment. You can use the alignment keyword argument to download the seed or the uniprot alignment instead. For example, downloadpfam(\"PF00069\") will download the full alignment for the PF00069 Pfam family, while downloadpfam(\"PF00069\", alignment=\"seed\") will download the seed alignment of the family.\n\nThe extension of the downloaded file is .stockholm.gz by default; you can change it using the filename keyword argument, but the .gz at the end is mandatory.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.getcontactmasks-Union{Tuple{Vector{T}}, Tuple{T}} where T<:AbstractFloat","page":"Pfam","title":"MIToS.Pfam.getcontactmasks","text":"This function takes a msacontacts or its list of contacts contact_list with 1.0 for true contacts and 0.0 for not contacts (NaN or other numbers for missing values). Returns two BitVectors, the first with trues where contact_list is 1.0 and the second with trues where contact_list is 0.0. There are useful for AUC calculations.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.getseq2pdb-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment}","page":"Pfam","title":"MIToS.Pfam.getseq2pdb","text":"Generates from a Pfam msa a Dict{String, Vector{Tuple{String,String}}}. 
Keys are sequence IDs and each value is a list of tuples containing PDB code and chain.\n\njulia> getseq2pdb(msa)\nDict{String,Array{Tuple{String,String},1}} with 1 entry:\n \"F112_SSV1/3-112\" => [(\"2VQC\",\"A\")]\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.hasresidues-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, AbstractDict{Int64, String}}","page":"Pfam","title":"MIToS.Pfam.hasresidues","text":"Returns a BitVector where there is a true for each column with PDB residue.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.msacolumn2pdbresidue-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, Vararg{String, 5}}","page":"Pfam","title":"MIToS.Pfam.msacolumn2pdbresidue","text":"msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)\n\nThis function returns a OrderedDict{Int,String} with MSA column numbers on the input file as keys and PDB residue numbers (\"\" for missings) as values. The mapping is performed using SIFTS. This function needs correct ColMap and SeqMap annotations. This checks correspondence of the residues between the MSA sequence and SIFTS (It throws a warning if there are differences). Missing residues are included if the keyword argument missings is true (default: true). If the keyword argument strict is true (default: false), throws an Error, instead of a Warning, when residues don't match. If the keyword argument checkpdbname is true (default: false), throws an Error if the three letter name of the PDB residue isn't the MSA residue. If you are working with a downloaded Pfam MSA without modifications, you should read it using generatemapping=true and useidcoordinates=true. If you don't indicate the path to the siftsfile used in the mapping, this function downloads the SIFTS file in the current folder. 
If you don't indicate the Pfam accession number (pfamid), this function tries to read the AC file annotation.\n\n\n\n\n\n","category":"method"},{"location":"Pfam_API/#MIToS.Pfam.msacontacts","page":"Pfam","title":"MIToS.Pfam.msacontacts","text":"This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:\n\nThe first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.\nThe second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.\n\nmsacontacts returns a PairwiseListMatrix{Float64,false} of 0.0 and 1.0 where 1.0 indicates a residue contact. Contacts are defined with an inter residue distance less or equal to distance_limit (default to 6.05) angstroms between any heavy atom. NaN indicates a missing value.\n\n\n\n\n\n","category":"function"},{"location":"Pfam_API/#MIToS.Pfam.msaresidues-Tuple{MIToS.MSA.AnnotatedMultipleSequenceAlignment, AbstractDict{String, MIToS.PDB.PDBResidue}, AbstractDict{Int64, String}}","page":"Pfam","title":"MIToS.Pfam.msaresidues","text":"This function takes an AnnotatedMultipleSequenceAlignment with correct ColMap annotations and two dicts:\n\nThe first is an OrderedDict{String,PDBResidue} from PDB residue number to PDBResidue.\nThe second is a Dict{Int,String} from MSA column number on the input file to PDB residue number.\n\nmsaresidues returns an OrderedDict{Int,PDBResidue} from input column number (ColMap) to PDBResidue. Residues on inserts are not included.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"@info \"PDB API docs\"","category":"page"},{"location":"PDB_API/#PDB","page":"PDB","title":"PDB","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"MIToS.PDB","category":"page"},{"location":"PDB_API/#MIToS.PDB","page":"PDB","title":"MIToS.PDB","text":"The module PDB defines types and methods to work with protein structures inside Julia. 
It is useful to link structural and sequential information, and needed for measure the predictive performance at protein contact prediction of mutual information scores.\n\nFeatures\n\nRead and parse PDF and PDBML files\nCalculate distance and contacts between atoms or residues\nDetermine interaction between residues\n\nusing MIToS.PDB\n\n\n\n\n\n","category":"module"},{"location":"PDB_API/#Contents","page":"PDB","title":"Contents","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Pages = [\"PDB_API.md\"]\nDepth = 2","category":"page"},{"location":"PDB_API/#Types","page":"PDB","title":"Types","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"PDB_API/#MIToS.PDB.Coordinates","page":"PDB","title":"MIToS.PDB.Coordinates","text":"A Coordinates object is a fixed size vector with the coordinates x,y,z.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBAtom","page":"PDB","title":"MIToS.PDB.PDBAtom","text":"A PDBAtom object contains the information from a PDB atom, without information of the residue. It has the following fields that you can access at any moment for query purposes:\n\n- `coordinates` : x,y,z coordinates, e.g. `Coordinates(109.641,73.162,42.7)`.\n- `atom` : Atom name, e.g. `\"CA\"`.\n- `element` : Element type of the atom, e.g. `\"C\"`.\n- `occupancy` : A float number with the occupancy, e.g. `1.0`.\n- `B` : B factor as a string, e.g. `\"23.60\"`.\n- `alt_id` : Alternative location ID, e.g. `\"A\"`.\n- `charge` : Charge of the atom, e.g. `\"0\"`.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBFile","page":"PDB","title":"MIToS.PDB.PDBFile","text":"PDBFile <: FileFormat\n\nProtein Data Bank (PDB) format. 
It provides a standard representation for macromolecular structure data derived from X-ray diffraction and NMR studies.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBML","page":"PDB","title":"MIToS.PDB.PDBML","text":"PDBML <: FileFormat\n\nProtein Data Bank Markup Language (PDBML), a representation of PDB data in XML format.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBResidue","page":"PDB","title":"MIToS.PDB.PDBResidue","text":"A PDBResidue object contains all the information about a PDB residue. It has the following fields that you can access at any moment for query purposes:\n\n- `id` : A `PDBResidueIdentifier` object.\n- `atoms` : A vector of `PDBAtom`s.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#MIToS.PDB.PDBResidueIdentifier","page":"PDB","title":"MIToS.PDB.PDBResidueIdentifier","text":"A PDBResidueIdentifier object contains the information needed to identity PDB residues. It has the following fields that you can access at any moment for query purposes:\n\n- `PDBe_number` : It's only used when a PDBML is readed (PDBe number as a string).\n- `number` : PDB residue number, it includes insertion codes, e.g. `\"34A\"`.\n- `name` : Three letter residue name in PDB, e.g. `\"LYS\"`.\n- `group` : It can be `\"ATOM\"` or `\"HETATM\"`.\n- `model` : The model number as a string, e.g. `\"1\"`.\n- `chain` : The chain as a string, e.g. `\"A\"`.\n\n\n\n\n\n","category":"type"},{"location":"PDB_API/#Constants","page":"PDB","title":"Constants","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"PDB_API/#MIToS.PDB.covalentradius","page":"PDB","title":"MIToS.PDB.covalentradius","text":"Covalent radius in Å of each element from the Additional file 1 of PICCOLO (Bickerton et al.). 
Hydrogen was updated using the value on Table 2 from (Cordero et al.).\n\nReferences\n\n- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. \"Comprehensive, \n atomic-level characterization of structurally characterized protein-protein \n interactions: the PICCOLO database.\" BMC bioinformatics \n 12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)\n- [Cordero, Beatriz, et al. \"Covalent radii revisited.\" Dalton Transactions \n 21 (2008): 2832-2838.](@cite 10.1039/B801115J)\n\n\n\n\n\n","category":"constant"},{"location":"PDB_API/#MIToS.PDB.vanderwaalsradius","page":"PDB","title":"MIToS.PDB.vanderwaalsradius","text":"van der Waals radius in Å from the Additional file 1 of Bickerton et al.\n\nReferences\n\n- [Bickerton, George R., Alicia P. Higueruelo, and Tom L. Blundell. \"Comprehensive, \n atomic-level characterization of structurally characterized protein-protein \n interactions: the PICCOLO database.\" BMC bioinformatics \n 12 (2011): 1-15.](@cite 10.1186/1471-2105-12-313)\n\n\n\n\n\n","category":"constant"},{"location":"PDB_API/#Macros","page":"PDB","title":"Macros","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"PDB_API/#MIToS.PDB.@atoms-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@atoms","text":"@atoms ... model ... chain ... group ... residue ... atom ...\n\nThese return a vector of PDBAtoms with the selected subset of atoms from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the select_atoms function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#MIToS.PDB.@residues-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@residues","text":"@residues ... model ... chain ... group ... 
residue ...\n\nThese return a new vector with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the select_residues function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#MIToS.PDB.@residuesdict-Tuple{Any, Symbol, Any, Symbol, Any, Symbol, Any, Symbol, Any}","page":"PDB","title":"MIToS.PDB.@residuesdict","text":"@residuesdict ... model ... chain ... group ... residue ...\n\nThis macro returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues from a list of residues. You can use the type All to avoid filtering that option.\n\nDEPRECATED: This macro is deprecated. Use the residuesdict function instead.\n\n\n\n\n\n","category":"macro"},{"location":"PDB_API/#Methods-and-functions","page":"PDB","title":"Methods and functions","text":"","category":"section"},{"location":"PDB_API/","page":"PDB","title":"PDB","text":"Modules = [MIToS.PDB]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"PDB_API/#Base.angle-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates, MIToS.PDB.Coordinates}","page":"PDB","title":"Base.angle","text":"angle(a::Coordinates, b::Coordinates, c::Coordinates)\n\nAngle (in degrees) at b between a-b and b-c\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#Base.any-Tuple{Function, MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue, Function}","page":"PDB","title":"Base.any","text":"any(f::Function, a::PDBResidue, b::PDBResidue, criteria::Function)\n\nTest if the function f is true for any pair of atoms between the residues a and b. 
This function only test atoms that returns true for the fuction criteria.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#Base.any-Tuple{Function, MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"Base.any","text":"any(f::Function, a::PDBResidue, b::PDBResidue)\n\nTest if the function f is true for any pair of atoms between the residues a and b\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.CAmatrix-Tuple{AbstractVector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.CAmatrix","text":"Returns a matrix with the x, y and z coordinates of the Cα with best occupancy for each PDBResidue of the ATOM group. If a residue doesn't have a Cα, its Cα coordinates are NaNs.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.aromatic-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.aromatic","text":"There's an aromatic interaction if centriods are at 6.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.aromaticsulphur-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.aromaticsulphur","text":"Returns true if an sulphur and an aromatic atoms are 5.3 Å or less\"\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.bestoccupancy-Tuple{Vector{MIToS.PDB.PDBAtom}}","page":"PDB","title":"MIToS.PDB.bestoccupancy","text":"Takes a Vector of PDBAtoms and returns a Vector of the PDBAtoms with best occupancy.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.center!-Tuple{AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.center!","text":"center!(A::AbstractMatrix{Float64})\n\nTakes a set of points A as an NxD matrix (N: number of points, D: dimension). 
Translates A in place so that its centroid is at the origin of coordinates\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.centeredcoordinates","page":"PDB","title":"MIToS.PDB.centeredcoordinates","text":"Returns a Matrix{Float64} with the centered coordinates of all the atoms in residues. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.centeredresidues","page":"PDB","title":"MIToS.PDB.centeredresidues","text":"Returns a new Vector{PDBResidue} with the PDBResidues having centered coordinates. An optional positional argument CA (default: true) defines if only Cα carbons should be used to center the matrix.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.change_coordinates","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(residue::PDBResidue, coordinates::AbstractMatrix{Float64}, offset::Int=1)\n\nReturns a new PDBResidues with (x,y,z) from a coordinates AbstractMatrix{Float64} You can give an offset indicating in wich matrix row starts the (x,y,z) coordinates of the residue.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.change_coordinates-Tuple{AbstractVector{MIToS.PDB.PDBResidue}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(residues::AbstractVector{PDBResidue}, coordinates::AbstractMatrix{Float64})\n\nReturns a new Vector{PDBResidues} with (x,y,z) from a coordinates Matrix{Float64}\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.change_coordinates-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.Coordinates}","page":"PDB","title":"MIToS.PDB.change_coordinates","text":"change_coordinates(atom::PDBAtom, coordinates::Coordinates)\n\nReturns a new PDBAtom but with a new 
coordinates\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.check_atoms_for_interactions-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.check_atoms_for_interactions","text":"This function takes a PDBResidue and returns true only if all the atoms can be used for checking interactions.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(a::Coordinates, b::Coordinates, limit::AbstractFloat)\n\nIt returns true if the distance is less or equal to the limit. It doesn't call sqrt because it does squared_distance(a,b) <= limit^2.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(A::PDBResidue, B::PDBResidue, limit::AbstractFloat; criteria::String=\"All\")\n\nReturns true if the residues A and B are at contact distance (limit). 
The available distance criteria are: Heavy, All, CA, CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.contact-Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractFloat}","page":"PDB","title":"MIToS.PDB.contact","text":"contact(residues::Vector{PDBResidue}, limit::AbstractFloat; criteria::String=\"All\")\n\nIf contact takes a Vector{PDBResidue}, It returns a matrix with all the pairwise comparisons (contact map).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.coordinatesmatrix-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.coordinatesmatrix","text":"Returns a matrix with the x, y, z coordinates of each atom in each PDBResidue\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.covalent-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.covalent","text":"Returns true if the distance between atoms is less than the sum of the covalentradius of each atom.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.distance-Tuple{MIToS.PDB.Coordinates, MIToS.PDB.Coordinates}","page":"PDB","title":"MIToS.PDB.distance","text":"It calculates the squared euclidean distance.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.distance-Tuple{Vector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.distance","text":"distance(residues::Vector{PDBResidue}; criteria::String=\"All\")\n\nIf distance takes a Vector{PDBResidue} returns a PairwiseListMatrix{Float64, false} with all the pairwise comparisons (distance matrix).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.disulphide-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.disulphide","text":"Returns true if two CYS's S are at 2.08 Å or less\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.download_alphafold_structure-Union{Tuple{String}, Tuple{T}} where 
T<:MIToS.Utils.FileFormat","page":"PDB","title":"MIToS.PDB.download_alphafold_structure","text":"download_alphafold_structure(uniprot_accession::String; format::Type{T}=MMCIFFile) where T<:FileFormat\n\nThis function downloads the structure file (PDB or mmCIF) for a given UniProt Accession from AlphaFoldDB. The uniprot_accession parameter specifies the UniProt Accession of the protein, e.g. \"P00520\". The format parameter specifies the file format to download, with the default being mmCIF, i.e. MMCIFFile. You can set format to PDBFile if you want to download a PDB file.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.downloadpdb-Union{Tuple{String}, Tuple{T}} where T<:MIToS.Utils.FileFormat","page":"PDB","title":"MIToS.PDB.downloadpdb","text":"downloadpdb(pdbcode::String; format::Type{T} = MMCIFFile, filename, baseurl, kargs...)\n\nIt downloads a gzipped PDB file from PDB database. It requires a four character pdbcode. Its default format is MMCIFFile (mmCIF) and It uses the baseurl \"http://www.rcsb.org/pdb/files/\". filename is the path/name of the output file. This function calls MIToS.Utils.download_file that calls Downloads.download. 
So, you can use keyword arguments, such as headers, from that function.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.downloadpdbheader-Tuple{String}","page":"PDB","title":"MIToS.PDB.downloadpdbheader","text":"It downloads a JSON file containing the PDB header information.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findCB-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.findCB","text":"Returns a vector of indices for CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findatoms-Tuple{Vector{MIToS.PDB.PDBAtom}, String}","page":"PDB","title":"MIToS.PDB.findatoms","text":"findatoms(res::PDBResidue, atom::String)\n\nReturns a index vector of the atoms with the given atom name.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.findheavy-Tuple{Vector{MIToS.PDB.PDBAtom}}","page":"PDB","title":"MIToS.PDB.findheavy","text":"Returns a list with the index of the heavy atoms (all atoms except hydrogen) in the PDBResidue\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.getCA-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.getCA","text":"Returns the Cα with best occupancy in the PDBResidue. If the PDBResidue has no Cα, missing is returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.getpdbdescription-Tuple{String}","page":"PDB","title":"MIToS.PDB.getpdbdescription","text":"Access general information about a PDB entry (e.g., Header information) using the GraphQL interface of the PDB database. It parses the JSON answer into a JSON3.Object that can be used as a dictionary.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.hydrogenbond-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.hydrogenbond","text":"This function only works if there are hydrogens in the structure. 
The criteria for a hydrogen bond are:\n\nd(Ai, Aj) < 3.9Å\nd(Ah, Aacc) < 2.5Å\nθ(Adon, Ah, Aacc) > 90°\nθ(Adon, Aacc, Aacc-antecedent) > 90°\nθ(Ah, Aacc, Aacc-antecedent) > 90°\n\nWhere Ah is the donated hydrogen atom, Adon is the hydrogen bond donor atom, Aacc is the hydrogen bond acceptor atom and Aacc-antecednt is the atom antecedent to the hydrogen bond acceptor atom.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.hydrophobic-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.hydrophobic","text":"There's an hydrophobic interaction if two hydrophobic atoms are at 5.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ionic-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.ionic","text":"There's an ionic interaction if a cationic and an anionic atoms are at 6.0 Å or less.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.is_aminoacid-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.is_aminoacid","text":"is_aminoacid(residue::PDBResidue)\nis_aminoacid(residue_id::PDBResidueIdentifier)\n\nThis function returns true if the PDB residue is an amino acid residue. It checks if the residue's three-letter name exists in the MIToS.Utils.THREE2ONE dictionary, and returns false otherwise.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isanionic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.isanionic","text":"Returns true if the atom, e.g. (\"GLU\",\"CD\"), is an anionic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isaromatic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.isaromatic","text":"Returns true if the atom, e.g. 
(\"HIS\",\"CG\"), is an aromatic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isatom-Tuple{MIToS.PDB.PDBAtom, Any}","page":"PDB","title":"MIToS.PDB.isatom","text":"It tests if the atom has the indicated atom name.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.iscationic-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.iscationic","text":"Returns true if the atom, e.g. (\"ARG\",\"NE\"), is a cationic atom in the residue.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ishbondacceptor-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.ishbondacceptor","text":"Returns true if the atom, e.g. (\"ARG\",\"O\"), is an acceptor in H bonds.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.ishbonddonor-Tuple{MIToS.PDB.PDBAtom, String}","page":"PDB","title":"MIToS.PDB.ishbonddonor","text":"Returns true if the atom, e.g. (\"ARG\",\"N\"), is a donor in H bonds.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.isresidue-Tuple{MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.isresidue","text":" isresidue(res; model=All, chain=All, group=All, residue=All)\n\nThis function tests if a PDBResidue has the indicated model, chain, group and residue names/numbers. You can use the type All (default value) to avoid filtering that level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.kabsch-Tuple{AbstractMatrix{Float64}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.kabsch","text":"kabsch(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})\n\nThis function takes two sets of points, A (refrence) and B as NxD matrices, where D is the dimension and N is the number of points. Assumes that the centroids of A and B are at the origin of coordinates. You can call center! on each matrix before calling kabsch to center the matrices in the (0.0, 0.0, 0.0). Rotates B so that rmsd(A,B) is minimized. 
Returns the rotation matrix. You should do B * RotationMatrix to get the rotated B.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.mean_coordinates-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:AbstractMatrix{Float64}","page":"PDB","title":"MIToS.PDB.mean_coordinates","text":"Calculates the average/mean position of each atom in a set of structure. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return a weighted mean. When a AbstractVector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. By default only alpha carbons are used (default: calpha=true).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.modelled_sequences-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.modelled_sequences","text":"modelled_sequences(residue_list::AbstractArray{PDBResidue,N}; \n model::Union{String,Type{All}}=All, chain::Union{String,Type{All}}=All, \n group::Union{String,Regex,Type{All}}=All) where N\n\nThis function returns an OrderedDict where each key is a named tuple (containing the model and chain identifiers), and each value is the protein sequence corresponding to the modelled residues in those chains. Therefore, the obtained sequences do not contain missing residues. All modelled residues are included by default, but those that don't satisfy specified criteria based on the model, chain, or group keyword arguments are excluded. 
One-letter residue names are obtained from the MIToS.Utils.THREE2ONE dictionary for all residue names that return true for is_aminoacid.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.pication-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.pication","text":"There's a Π-Cation interaction if a cationic and an aromatic atoms are at 6.0 Å or less\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.proximitymean-Union{Tuple{T}, Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractVector{T}}, Tuple{Vector{MIToS.PDB.PDBResidue}, AbstractVector{T}, T}} where T<:AbstractFloat","page":"PDB","title":"MIToS.PDB.proximitymean","text":"proximitymean calculates the proximity mean/average for each residue as the average score (from a scores list) of all the residues within a certain physical distance to a given amino acid. The score of that residue is not included in the mean unless you set include to true. The default values are 6.05 for the distance threshold/limit and \"Heavy\" for the criteria keyword argument. This function allows to calculate pMI (proximity mutual information) and pC (proximity conservation) as in Buslje et al..\n\nReferences\n\nMarino Buslje, Cristina, et al. \"Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification.\" PLoS computational biology 6.11 (2010): e1000978.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.query_alphafolddb-Tuple{String}","page":"PDB","title":"MIToS.PDB.query_alphafolddb","text":"query_alphafolddb(uniprot_accession::String)\n\nThis function queries the AlphaFoldDB API to retrieve structure information for a given uniprot_accession, e.g. \"P00520\". 
This function returns the structure information as a JSON3.Object.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residuepairsmatrix-Union{Tuple{diagonal}, Tuple{T}, Tuple{Vector{MIToS.PDB.PDBResidue}, Type{T}, Type{Val{diagonal}}, T}} where {T, diagonal}","page":"PDB","title":"MIToS.PDB.residuepairsmatrix","text":"It creates a NamedArray containing a PairwiseListMatrix where each element (column, row) is identified with a PDBResidue from the input vector. You can indicate the value type of the matrix (default to Float64), if the list should have the diagonal values (default to Val{false}) and the diagonal values (default to NaN).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residues-Union{Tuple{N}, Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}, Vararg{Any, 4}}} where N","page":"PDB","title":"MIToS.PDB.residues","text":"The residues function for AbstractArray{PDBResidue,N} is deprecated. Use the select_residues function instead. So, residues(residue_list, model, chain, group, residue) becomes select_residues(residue_list; model=model, chain=chain, group=group, residue=residue).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.residuesdict-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.residuesdict","text":" residuesdict(residue_list; model=All, chain=All, group=All, residue=All)\n\nThis function returns a dictionary (using PDB residue numbers as keys) with the selected subset of residues. The residues are selected using the keyword arguments model, chain, group and residue. 
You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsd-Tuple{AbstractMatrix{Float64}, AbstractMatrix{Float64}}","page":"PDB","title":"MIToS.PDB.rmsd","text":"rmsd(A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})\n\nReturn RMSD between two sets of points A and B, given as NxD matrices (N: number of points, D: dimension).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsd-Tuple{AbstractVector{MIToS.PDB.PDBResidue}, AbstractVector{MIToS.PDB.PDBResidue}}","page":"PDB","title":"MIToS.PDB.rmsd","text":"rmsd(A::AbstractVector{PDBResidue}, B::AbstractVector{PDBResidue}; superimposed::Bool=false)\n\nReturns the Cα RMSD value between two PDB structures: A and B. If the structures are already superimposed between them, use superimposed=true to avoid a new superimposition (superimposed is false by default).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.rmsf-Union{Tuple{AbstractVector{T}}, Tuple{T}} where T<:AbstractMatrix{Float64}","page":"PDB","title":"MIToS.PDB.rmsf","text":"Calculates the RMSF (Root Mean-Square-Fluctuation) between an atom and its average position in a set of structures. The function takes a vector (AbstractVector) of vectors (AbstractVector{PDBResidue}) or matrices (AbstractMatrix{Float64}) as first argument. As second (optional) argument this function can take an AbstractVector{Float64} of matrix/structure weights to return the root weighted mean-square-fluctuation around the weighted mean structure. When a Vector{PDBResidue} is used, if the keyword argument calpha is false the RMSF is calculated for all the atoms. 
By default only alpha carbons are used (default: calpha=true).\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.select_atoms-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.select_atoms","text":"select_atoms(residue_list; model=All, chain=All, group=All, residue=All, atom=All, alt_id=All, charge=All)\n\nThis function returns a vector of PDBAtoms with the selected subset of atoms from a list of residues. The atoms are selected using the keyword arguments model, chain, group, residue, atom, alt_id, and charge. You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.select_residues-Union{Tuple{AbstractArray{MIToS.PDB.PDBResidue, N}}, Tuple{N}} where N","page":"PDB","title":"MIToS.PDB.select_residues","text":"select_residues(residue_list; model=All, chain=All, group=All, residue=All)\n\nThis function returns a new vector with the selected subset of residues from a list of residues. You can use the keyword arguments model, chain, group and residue to select the residues. You can use the type All (default value) to avoid filtering at a particular level.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.selectbestoccupancy-Tuple{Vector{MIToS.PDB.PDBAtom}, Vector{Int64}}","page":"PDB","title":"MIToS.PDB.selectbestoccupancy","text":"Takes a PDBResidue and a Vector of atom indices. Returns the index value of the Vector with maximum occupancy.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.squared_distance-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom}","page":"PDB","title":"MIToS.PDB.squared_distance","text":"It calculates the squared euclidean distance, i.e. 
it doesn't spend time in sqrt\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.squared_distance-Tuple{MIToS.PDB.PDBResidue, MIToS.PDB.PDBResidue}","page":"PDB","title":"MIToS.PDB.squared_distance","text":"squared_distance(A::PDBResidue, B::PDBResidue; criteria::String=\"All\")\n\nReturns the squared distance between the residues A and B. The available criteria are: Heavy, All, CA, CB (CA for GLY)\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.superimpose","page":"PDB","title":"MIToS.PDB.superimpose","text":"Asuper, Bsuper, RMSD = superimpose(A, B, matches=nothing)\n\nThis function takes A::AbstractVector{PDBResidue} (reference) and B::AbstractVector{PDBResidue}. Translates A and B to the origin of coordinates, and rotates B so that rmsd(A,B) is minimized with the Kabsch algorithm (using only their α carbons). Returns the rotated and translated versions of A and B, and the RMSD value.\n\nOptionally provide matches which iterates over matched index pairs in A and B, e.g., matches = [(3, 5), (4, 6), ...]. The alignment will be constructed using just the matching residues.\n\n\n\n\n\n","category":"function"},{"location":"PDB_API/#MIToS.PDB.vanderwaals-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.vanderwaals","text":"Test if two atoms or residues are in van der Waals contact using: distance(a,b) <= 0.5 + vanderwaalsradius[a] + vanderwaalsradius[b]. It returns distance <= 0.5 if the atoms aren't in vanderwaalsradius.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.PDB.vanderwaalsclash-Tuple{MIToS.PDB.PDBAtom, MIToS.PDB.PDBAtom, Any, Any}","page":"PDB","title":"MIToS.PDB.vanderwaalsclash","text":"Returns true if the distance between the atoms is less than the sum of the vanderwaalsradius of the atoms. If the atoms aren't on the list (i.e. OXT), the vanderwaalsradius of the element is used. 
If there is not data in the dict, distance 0.0 is used.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{LightXML.XMLDocument, Type{MIToS.PDB.PDBML}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(pdbml, ::Type{PDBML}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)\n\nReads a LightXML.XMLDocument representing a pdb file. Returns a list of PDBResidues (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select of a subset of all residues. If not set, all residues are returned. If the keyword argument label (default: true) is false,the auth_ attributes will be use instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{Union{IO, String}, Type{MIToS.PDB.MMCIFFile}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(io, ::Type{MMCIFFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, label=true, occupancyfilter=false)\n\nParse an mmCIF file and returns a list of PDBResidues. Setting chain, model, group, atomname and onlyheavy values can be used to select a subset of residues. Group can be \"ATOM\" or \"HETATM\". If those keyword arguments are not set, all residues are returned. If the keyword argument label (default: true) is false, the auth_ attributes will be used instead of the label_ attributes for chain, atom, and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure. 
If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.parse_file-Tuple{Union{IO, String}, Type{MIToS.PDB.PDBFile}}","page":"PDB","title":"MIToS.Utils.parse_file","text":"parse_file(io, ::Type{PDBFile}; chain=All, model=All, group=All, atomname=All, onlyheavy=false, occupancyfilter=false)\n\nReads a text file of a PDB entry. Returns a list of PDBResidue (view MIToS.PDB.PDBResidues). Setting chain, model, group, atomname and onlyheavy values can be used to select of a subset of all residues. Group can be \"ATOM\" or \"HETATM\". If not set, all residues are returned. If the keyword argument occupancyfilter (default: false) is true, only the atoms with the best occupancy are returned.\n\n\n\n\n\n","category":"method"},{"location":"PDB_API/#MIToS.Utils.print_file","page":"PDB","title":"MIToS.Utils.print_file","text":"print_file(io, res, format::Type{PDBFile}) print_file(res, format::Type{PDBFile})\n\nPrint a PDBResidue or a vector of PDBResidues in PDB format.\n\n\n\n\n\n","category":"function"},{"location":"References/","page":"References","title":"References","text":"@info \"References\"","category":"page"},{"location":"References/#References","page":"References","title":"References","text":"","category":"section"},{"location":"References/","page":"References","title":"References","text":"D. J. Zea, D. Anfossi, M. Nielsen and C. Marino-Buslje. MIToS. jl: mutual information tools for protein sequence analysis in the Julia language. Bioinformatics 33, 564–565 (2017).\n\n\n\nU. Hobohm, M. Scharf, R. Schneider and C. Sander. Selection of representative protein data sets. Protein Science 1, 409–417 (1992).\n\n\n\nC. M. Buslje, J. Santos, J. M. Delfino and M. Nielsen. Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information. 
Bioinformatics 25, 1125–1131 (2009).\n\n\n\nS. F. Altschul, T. L. Madden, A. A. Schäffer, J. Zhang, Z. Zhang, W. Miller and D. J. Lipman. Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. Nucleic acids research 25, 3389–3402 (1997).\n\n\n\nS. D. Dunn, L. M. Wahl and G. B. Gloor. Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction. Bioinformatics 24, 333–340 (2008).\n\n\n\nS. Velankar, J. M. Dana, J. Jacobsen, G. van Ginkel, P. J. Gane, J. Luo, T. J. Oldfield, C. O’Donovan, M.-J. Martin and G. J. Kleywegt. SIFTS: Structure Integration with Function, Taxonomy and Sequences resource. Nucleic Acids Research 41, D483-D489 (2012).\n\n\n\nP. Stothard. The sequence manipulation suite: JavaScript programs for analyzing and formatting protein and DNA sequences. Biotechniques 28, 1102–1104 (2000).\n\n\n\nB. J. Grant, A. P. Rodrigues, K. M. ElSawy, J. A. McCammon and L. S. Caves. Bio3d: an R package for the comparative analysis of protein structures. Bioinformatics 22, 2695–2696 (2006).\n\n\n\nW. Perks. Some observations on inverse probability including a new indifference rule. Journal of the Institute of Actuaries 73, 285–334 (1947).\n\n\n\nS. Trybula. Some problems of simultaneous minimax estimation. The Annals of Mathematical Statistics 29, 245–253 (1958).\n\n\n\nH. Jeffreys. An invariant form for the prior probability in estimation problems. Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186, 453–461 (1946).\n\n\n\nC. Marino Buslje, E. Teppa, T. Di Doménico, J. M. Delfino and M. Nielsen. Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification. 
PLoS computational biology 6, e1000978 (2010).\n\n\n\n","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"EditURL = \"cookbook/01_Change_B_factors.jl\"","category":"page"},{"location":"01_Change_B_factors/#Change-B-factors","page":"Change B-factors","title":"Change B-factors","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"(Image: ) (Image: )","category":"page"},{"location":"01_Change_B_factors/#Problem-description","page":"Change B-factors","title":"Problem description","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"It is a common practice to change the B-factors of a PDB to store information about atoms or residues to be used by other programs. In particular, values in the B-factor column can be easily used to colour residues with PyMOL or Chimera.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"We cannot simply assign a new value to the B field of a PDBAtom because this type is immutable. However, we can make use of the @set macro of the Setfield package to create a new PDBAtom with a different B-factor value.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"In a PDB file, B-factors are stored from the column 61 to 66. Therefore, new B-factors should be a String with 6 or fewer characters, normally using two characters for decimal values. 
We can use pyfmt and FormatSpec from the Format package to create a proper B-factor string.","category":"page"},{"location":"01_Change_B_factors/#MIToS-solution","page":"Change B-factors","title":"MIToS solution","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"For this example we are going to use the small heat shock protein AgsA from Salmonella typhimurium (PDB code: 4ZJ9) available in MIToS docs data:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using MIToS\npdbfile = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"4zj9.pdb\")\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"First, we need to read the PDB file using the MIToS.PDB module:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using MIToS.PDB\npdb_residues = read_file(pdbfile, PDBFile)\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"For this example, we are going to replace the B-factor of the alpha-carbons by the residue hydrophobicity according to the hydrophobicity scale of Kyte and Doolittle used by Chimera:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"hydrophobicity = Dict(\n \"ILE\" => 4.5,\n \"VAL\" => 4.2,\n \"LEU\" => 3.8,\n \"PHE\" => 2.8,\n \"CYS\" => 2.5,\n \"MET\" => 1.9,\n \"ALA\" => 1.8,\n \"GLY\" => -0.4,\n \"THR\" => -0.7,\n \"SER\" => -0.8,\n \"TRP\" => -0.9,\n \"TYR\" => -1.3,\n \"PRO\" => -1.6,\n \"HIS\" => -3.2,\n \"GLU\" => -3.5,\n \"GLN\" => -3.5,\n \"ASP\" => -3.5,\n \"ASN\" => -3.5,\n \"LYS\" => -3.9,\n \"ARG\" => -4.5,\n)\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change 
B-factors","title":"Change B-factors","text":"First, we define a helper function using Format to create a proper B-factor string with the PDB format; 6 characters and 2 digits after the decimal point. The PDB format description describe this field as:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"COLUMNS DATA TYPE FIELD DEFINITION\n------------------------------------------------------\n61 - 66 Real(6.2) tempFactor Temperature factor.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using Format\n\n\"\"\"\nReturn value as a string with the B factor format described in PDB. # e.g. 1.5 -> \" 1.50\"\n\"\"\"\nformat_b_factor(value) = pyfmt(FormatSpec(\"6.2f\"), value) # e.g. 1.5 -> \" 1.50\"\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Then, where are using that helper function to define a function that returns a new PDBAtom by changing the B factor field using the Setfield package.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"using Setfield\n\n\"\"\"\nReturn a new PDBAtom with the B-factor changed to value.\n\"\"\"\nfunction change_b_factor(atom::PDBAtom, value)\n b_factor_string = format_b_factor(value)\n b_factor_string = strip(b_factor_string) # e.g. 
\" 1.50\" -> \"1.50\"\n if length(b_factor_string) > 6\n throw(ErrorException(\"$b_factor_string has more than 6 characters.\"))\n end\n @set atom.B = b_factor_string\nend\nnothing # hide","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Now, we can use the change_b_factor function to change the B-factor of each \"CA\" atom:","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"for res in pdb_residues\n for i in eachindex(res.atoms)\n atom = res.atoms[i]\n if atom.atom == \"CA\"\n res.atoms[i] = change_b_factor(atom, hydrophobicity[res.id.name])\n end\n end\nend","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"Finally, we can save the changed residues in a new PDB file.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"write_file(\"4zj9_hydrophobicity.pdb\", pdb_residues, PDBFile)","category":"page"},{"location":"01_Change_B_factors/#Discussion","page":"Change B-factors","title":"Discussion","text":"","category":"section"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"While we have focused on changing the B-factor field of a PDBAtom, you can use the same approach to change other fields. However, if you want to change atom coordinates, it is better to use the change_coordinates function from the PDB module of MIToS.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"MIToS atoms and residues generally stores the string present in the input file without surrounding spaces. You can use the Format module to create these strings and strip to get rid of the spaces. 
You can see the PDB format description to know what is the format of the expected string or see the MIToS PDB print_file source code to get a quick idea.","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"","category":"page"},{"location":"01_Change_B_factors/","page":"Change B-factors","title":"Change B-factors","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"@info \"Example\"","category":"page"},{"location":"Example/#Example","page":"Example","title":"Example","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"In this simple demonstration, you will see how to calculate ZBLMIp (Z score of the corrected MIp using BLOSUM62 pseudo frequencies) for a Pfam(Image: ) MSA from the Julia REPL or using a MIToS script in the system command line.","category":"page"},{"location":"Example/#juliarepl","page":"Example","title":"MIToS in the Julia REPL","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"If you load the Pfam module from MIToS, you will get access to a set of functions that work with Pfam MSAs. In this case, we are going to use it for download a Stockholm(Image: ) MSA from the Pfam website and read it into Julia.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using MIToS.Pfam\npfam_file = downloadpfam(\"PF10660\")\nmsa = read_file(pfam_file, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"note: Generation of sequence and column mappings\nThe keyword argument generatemapping of read_file allows to generate sequence and column mappings for the MSA. Column mapping is the map between of each column on the MSA object and the column number in the file. 
Sequence mappings will use the start and end coordinates in the sequence ids for enumerate each residue in the sequence if useidcoordinates is true.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"You can plot this MSA and other MIToS’ objects using the Plots(Image: ) package. The installation of Plots is described in the Installation section of this site:","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using Plots\nplot(msa)\npng(\"msa.png\") # hide\nnothing # hide","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"(Image: )","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"The Information module of MIToS has functions to calculate measures from the Information Theory(Image: ), such as Shannon Entropy and Mutual Information (MI), on a MSA. In this example, we will estimate covariation between columns of the MSA with a corrected MI that use the BLOSUM62 matrix for calculate pseudo frequencies (BLMI).","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"using MIToS.Information\nZBLMIp, BLMIp = BLMI(msa)\nZBLMIp # shows ZBLMIp scores","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"Once the Plots package is installed and loaded, you can use its capabilities to visualize this results:","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"heatmap(ZBLMIp, yflip = true, c = :grays)\npng(\"blmi.png\") # hide\nnothing # hide","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"(Image: )","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"rm(pfam_file) # clean up","category":"page"},{"location":"Example/#commandline","page":"Example","title":"MIToS in system command 
line","text":"","category":"section"},{"location":"Example/","page":"Example","title":"Example","text":"Calculate ZBLMIp on the system shell is easy using the script called BLMI.jl in the MIToS_Scripts.jl(Image: ) package. This script reads a MSA file, and writes a file with the same base name of the input but with the .BLMI.csv extension.","category":"page"},{"location":"Example/","page":"Example","title":"Example","text":"julia BLMI.jl PF14972.stockholm.gz","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB docs\"","category":"page"},{"location":"PDB/#Module-PDB","page":"PDB","title":"PDB","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The module PDB defines types and methods to work with protein structures inside Julia. It is useful to link structural and sequential information, and needed for measure the predictive performance at protein contact prediction of mutual information scores.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB # to load the PDB module","category":"page"},{"location":"PDB/#Features","page":"PDB","title":"Features","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Read and parse mmCIF, PDB, and PDBML files.\nDownload structures from the PDB and AlphaFold databases.\nCalculate distance and contacts between atoms or residues.\nDetermine interaction between residues.","category":"page"},{"location":"PDB/#Contents","page":"PDB","title":"Contents","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Pages = [\"PDB.md\"]\nDepth = 4","category":"page"},{"location":"PDB/#Retrieve-information-from-PDB-database","page":"PDB","title":"Retrieve information from PDB database","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This module exports the downloadpdb function, to retrieve a PDB file from PDB database(Image: ). 
By default, this function downloads a gzipped mmCIF file (format=MMCIFFile), which could be easily read by MIToS. You are able to determine the format as PDBFile if you want to download a PDB file instead.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"PDB module also exports a getpdbdescription to access the header information of a PDB entry.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"getpdbdescription(\"1IVO\")","category":"page"},{"location":"PDB/#Retrieve-information-from-AlphaFold-database","page":"PDB","title":"Retrieve information from AlphaFold database","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This module provides functions to download and query protein structures from AlphaFold DB.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The download_alphafold_structure function downloads the structure file, in mmCIF format by default, for a given UniProt Accession ID. You can set format to PDBFile to download a PDB file instead.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\n# Get the structure for the human insulin\nfile = download_alphafold_structure(\"P01308\")","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"If you need more information about that entry, you can use the query_alphafolddb function. The query_alphafolddb function returns an JSON3.Object that works like a dictionary.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"json_result = query_alphafolddb(\"P01308\")","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"You can access the information in the JSON3.Object using the keys. 
For example, to get the URL to the PAE matrix image:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"pae_image_url = json_result[\"paeImageUrl\"]","category":"page"},{"location":"PDB/#Read-and-parse-PDB-files","page":"PDB","title":"Read and parse PDB files","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"This is easy using the read_file and parse_file functions, indicating the filename and the FileFormat: PDBML for PDB XML files or PDBFile for usual PDB files. These functions returns a Vector of PDBResidue objects with all the residues in the PDB. To return only a specific subset of residues/atoms you can use any of the following keyword arguments:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"keyword arguments default returns only ...\nchain All residues from a PDB chain, i.e. \"A\"\nmodel All residues from a determined model, i.e. \"1\"\ngroup All residues from a group: \"ATOM\", \"HETATM\" or All for both\natomname All atoms with a specific name, i.e. \"CA\"\nonlyheavy false heavy atoms (not hydrogens) if it's true\noccupancyfilter false only the atoms with the best occupancy are returned if it's true","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"note: Note\nFor PDBML files it is possible to use the keyword argument label to false (default to true) to get the auth_ attributes instead of the label_ attributes for chain, atom and residue name fields. The auth_ attributes are alternatives provided by an author in order to match the identification/values used in the publication that describes the structure.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Read α carbon of each residue from the 1ivo pdb file, in the model 1, chain A and in the ATOM group.\nCA_1ivo =\n read_file(pdbfile, PDBFile, model = \"1\", chain = \"A\", group = \"ATOM\", atomname = \"CA\")\n\nCA_1ivo[1] # First residue. 
It has only the α carbon.","category":"page"},{"location":"PDB/#Looking-for-particular-residues","page":"PDB","title":"Looking for particular residues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"MIToS parse PDB files to vector of residues, instead of using a hierarchical structure like other packages. This approach makes the search and selection of residues or atoms a little different. To make it easy, this module exports the select_residues and select_atoms functions. Given the fact that residue numbers from different chains, models, etc. can collide, we can indicate the model, chain, group, residue number and atom name using the keyword arguments of those functions. If you want to select all the residues in one of the categories, you are able to use the type All (this is the default value of such arguments). You can also use regular expressions or functions to make the selections.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)\nresidues_1ivo = read_file(pdbfile, PDBFile)\n# Select residue number 9 from model 1 and chain B (it looks in both ATOM and HETATM groups)\nselect_residues(residues_1ivo, model = \"1\", chain = \"B\", residue = \"9\")","category":"page"},{"location":"PDB/#Getting-a-Dict-of-PDBResidues","page":"PDB","title":"Getting a Dict of PDBResidues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"If you prefer a Dict of PDBResidue, indexed by their residue numbers, you can use the residuedict function.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Dict of residues from the model 1, chain A and from the ATOM group\nchain_a = residuesdict(residues_1ivo, model = \"1\", chain = \"A\", group = \"ATOM\")\nchain_a[\"9\"]","category":"page"},{"location":"PDB/#Select-particular-residues","page":"PDB","title":"Select particular 
residues","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Use the select_residues function to collect specific residues. It's possible to use a single residue number (i.e. \"2\") or even a function which should return true for the selected residue numbers. Also regular expressions can be used to select residues. Use All to select all the residues.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"residue_list = map(string, 2:5)\n\n# If the list is large, you can use a `Set` to gain performance\n# residue_set = Set(map(string, 2:5))","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"first_res = select_residues(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = resnum -> resnum in residue_list,\n)\n\nfor res in first_res\n println(res.id.name, \" \", res.id.number)\nend","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"A more complex example using an anonymous function:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Select all the residues of the model 1, chain A of the ATOM group with residue number less than 5\n\nfirst_res = select_residues(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = x -> parse(Int, match(r\"^(\\d+)\", x)[1]) <= 5,\n)\n# The anonymous function takes the residue number (string) and use a regular expression\n# to extract the number (without insertion code).\n# It converts the number to `Int` to test if the it is `<= 5`.\n\nfor res in first_res\n println(res.id.name, \" \", res.id.number)\nend","category":"page"},{"location":"PDB/#Select-particular-atoms","page":"PDB","title":"Select particular atoms","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The select_atoms function allow to select a particular set of atoms.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"# Select all the 
atoms with name starting with \"C\" using a regular expression\n# from all the residues of the model 1, chain A of the ATOM group\n\ncarbons = select_atoms(\n residues_1ivo,\n model = \"1\",\n chain = \"A\",\n group = \"ATOM\",\n residue = All,\n atom = r\"C.+\",\n)\n\ncarbons[1]","category":"page"},{"location":"PDB/#Protein-contact-map","page":"PDB","title":"Protein contact map","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"The PDB module offers a number of functions to measure distances between atoms or residues, to detect possible interactions or contacts. In particular the contact function calls the distance function using a threshold or limit in an optimized way. The measure can be done between alpha carbons (\"CA\"), beta carbons (\"CB\") (alpha carbon for glycine), any heavy atom (\"Heavy\") or any (\"All\") atom of the residues.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"In the following example, whe are going to plot a contact map for the 1ivo chain A. 
Two residues will be considered in contact if their β carbons (α carbon for glycine) have a distance of 8Å or less.","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"1IVO\", format = PDBFile)\n\nresidues_1ivo = read_file(pdbfile, PDBFile)\n\npdb = select_residues(residues_1ivo, model = \"1\", chain = \"A\", group = \"ATOM\")\n\ndmap = distance(pdb, criteria = \"All\") # Minimum distance between residues using all their atoms","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"Use the contact function to get a contact map:","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"cmap = contact(pdb, 8.0, criteria = \"CB\") # Contact map","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB: Cmap\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using Plots\ngr()\n\nheatmap(dmap, grid = false, yflip = true, ratio = :equal)\n\npng(\"pdb_dmap.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"heatmap(cmap, grid = false, yflip = true, ratio = :equal)\n\npng(\"pdb_cmap.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/#Structural-superposition","page":"PDB","title":"Structural superposition","text":"","category":"section"},{"location":"PDB/","page":"PDB","title":"PDB","text":"@info \"PDB: RMSD\"\nusing Plots\ngr() # Hide possible warnings","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"using MIToS.PDB\n\npdbfile = downloadpdb(\"2HHB\")\n\nres_2hhb = read_file(pdbfile, MMCIFFile)\n\nchain_A = select_residues(res_2hhb, model = \"1\", chain = \"A\", group = \"ATOM\", residue = All)\nchain_C = 
select_residues(res_2hhb, model = \"1\", chain = \"C\", group = \"ATOM\", residue = All)\n\nusing Plots\ngr()\n\nscatter3d(chain_A, label = \"A\", alpha = 0.5)\nscatter3d!(chain_C, label = \"C\", alpha = 0.5)\n\npng(\"pdb_unaligned.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"superimposed_A, superimposed_C, RMSD = superimpose(chain_A, chain_C)\n\nRMSD","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"scatter3d(superimposed_A, label = \"A\", alpha = 0.5)\nscatter3d!(superimposed_C, label = \"C\", alpha = 0.5)\npng(\"pdb_aligned.png\") # hide\nnothing # hide","category":"page"},{"location":"PDB/","page":"PDB","title":"PDB","text":"(Image: )","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"@info \"Pfam docs\"","category":"page"},{"location":"Pfam/#Module-Pfam","page":"Pfam","title":"Pfam","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"MIToS defines methods and types useful for any MSA. The Pfam module uses other MIToS modules in the context of Pfam MSAs, where it’s possible to us determine how structure and sequence information should be mapped. 
This module defines functions that go from a Pfam MSA to the protein contact prediction performance of pairwise scores estimated from that MSA.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.Pfam # to load the Pfam module","category":"page"},{"location":"Pfam/#Features","page":"Pfam","title":"Features","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Download and read Pfam MSAs.\nObtain PDB information from alignment annotations.\nMap between sequence/alignment residues/columns and PDB structures.\nMeasure of AUC (ROC curve) for protein contact prediction of MI scores.","category":"page"},{"location":"Pfam/#Contents","page":"Pfam","title":"Contents","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Pages = [\"Pfam.md\"]\nDepth = 4","category":"page"},{"location":"Pfam/#Getting-a-Pfam-MSA","page":"Pfam","title":"Getting a Pfam MSA","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The function downloadpfam takes a Pfam accession and downloads a Pfam MSA in Stockholm format. In that way, you can do","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"pfamfile = downloadpfam(\"PF18883\")","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"to get the MSA. But, we are going to use an already downloaded file in this case:","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS\npfamfile = joinpath(dirname(pathof(MIToS)), \"..\", \"docs\", \"data\", \"PF18883.stockholm.gz\");","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Use read_file function and the Stockholm FileFormat to get a AnnotatedMultipleSequenceAlignment object with the MSA and its Pfam annotations. You must set generatemapping and useidcoordinates to true the first time you read the downloaded MSA. 
This is necessary to some of the methods in the Pfam module.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"msa = read_file(pfamfile, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"Pfam/#Getting-PDB-information-from-an-MSA","page":"Pfam","title":"Getting PDB information from an MSA","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The function getseq2pdb parses the MSA annotations to return a Dict from the sequence identifier in the MSA to PDB and chain codes.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"getseq2pdb(msa)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"Once you know the association between PDB chains and sequences, you can use that information together with the msacolumn2pdbresidue function to get the PDB residue number that correspond to each MSA column for given a determined sequence and PDB chain. 
That function downloads information from SIFTS to generate the mapping.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"col2res = msacolumn2pdbresidue(msa, \"ICSA_SHIFL/611-720\", \"3ML3\", \"A\")","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The returned dictionary can be used to get the PDB residue associated to each column (using the msaresidues function)...","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.PDB\npdbfile = downloadpdb(\"3ML3\")\npdb = read_file(pdbfile, MMCIFFile)\nresdict = residuesdict(pdb, model = \"1\", chain = \"A\", group = \"ATOM\")\n\nmsaresidues(msa, resdict, col2res)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"...or to delete the columns without PDB residues (using the hasresidues function):","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.MSA\nfiltercolumns!(msa, hasresidues(msa, col2res))","category":"page"},{"location":"Pfam/#PDB-contacts-and-AUC","page":"Pfam","title":"PDB contacts and AUC","text":"","category":"section"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"The Dict between MSA columns and PDB residue number also can be used to generate a protein contact map associated to the MSA.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"cmap = msacontacts(msa, resdict, col2res)","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"That protein contact map can be used to calculate the Area Under the ROC Curve for a given score with the AUC function.","category":"page"},{"location":"Pfam/","page":"Pfam","title":"Pfam","text":"using MIToS.Information\nZMIp, MIp = buslje09(msa)\n\nusing ROCAnalysis # You need to load ROCAnalysis to use the AUC function\n\nAUC(ZMIp, cmap)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary 
information","title":"Linking structural and evolutionary information","text":"EditURL = \"cookbook/02_Linking_structural_and_evolutionary_information.jl\"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Linking-structural-and-evolutionary-information","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"(Image: ) (Image: )","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Problem-description","page":"Linking structural and evolutionary information","title":"Problem description","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"It is a very common task to map sequence to structure residue number. For example, to link structural information coming from PDB and evolutionary information calculated from multiple sequence alignments.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The naive way of mapping sequence and structure is to perform global pairwise alignment between the sequence and the PDB sequence (using the residues in ATOM). The problem with this approach is that the sequences can have missing regions and standard pairwise alignment algorithms often yield incorrect assignations around those regions (Velankar et.al. 2013 [6]). 
This is particularly important when aligning PDB sequences, that can have missing residues, and sequences coming from multiple sequence alignments, that can be incomplete or have unaligned regions (e.g. insert states).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The SIFTS (Structure Integration with Function, Taxonomy and Sequences) database solves this problem and provides residue level mapping between PDB and other databases (e.g. UniProt and Pfam).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The SIFTS module of MIToS has functions to access this residue level mapping between PDB and other databases. Also, MIToS keeps track of the residue number of each residue in a multiple sequence alignment (MSA) using its annotations. Both things together, allow the correct mapping of sequence and structure without performing error-prone pairwise alignments.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Particular solutions depend on problem details, here we show some common ways to use MIToS and SIFTS to map evolutionary information calculated in an MSA (e.g. Shannon entropy) with structural information (e.g. 
B-factors).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#PDB-and-Pfam-alignment-mapping","page":"Linking structural and evolutionary information","title":"PDB and Pfam alignment mapping","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This is the easiest problem to solve with the MIToS Pfam module because SIFTS already has a residue level mapping between PDB and Pfam.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"For this example, we are going to map the columns in the multiple sequence alignment of the PF09645 Pfam family and the residues in the chain A from the 2VQC PDB file. The needed files are available in the MIToS test suite:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS\npdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"2VQC.pdb\")\npfam_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"PF09645_full.stockholm\")\nsifts_file = abspath(pathof(MIToS), \"..\", \"..\", \"test\", \"data\", \"2vqc.xml.gz\")\nnothing # hide","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"You can also use downloadpdb from MIToS.PDB, downloadpfam from MIToS.Pfam and downloadsifts from MIToS.SIFTS to get the corresponding files from those 
databases.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"It is important to read the Pfam MSA file using generatemapping=true and useidcoordinates=true because that allows keeping track of the residue number using the MSA annotations.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Pfam\nmsa = read_file(pfam_file, Stockholm, generatemapping = true, useidcoordinates = true)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"First, we need to know what is the sequence in the MSA that correspond to the PDB we want to link. Luckily, Pfam Stockholm files store the mapping between sequences and PDB chains. You can access that mapping using the getseq2pdb function from MIToS.Pfam","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seq2pdbs = getseq2pdb(msa)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"The returned dictionary gives you all the PDB chains associated with a determined sequence in the MSA. But, in this case, we want to go in the other direction to find all the sequences associated with a determined PDB chain. We are going to use a list comprehension because it is possible for a single chain to be associated with more than one sequence in the Pfam MSA (e.g. 
domain repeats).","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"pdb_code = \"2VQC\"\npdb_chain = \"A\"\nseq_ids = [seq for (seq, pdbs) in seq2pdbs if (pdb_code, pdb_chain) in pdbs]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"In this example, we are going to use the only sequence we found for the A of 2VQC.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seq_id = seq_ids[1]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Finally, we can use the msacolumn2pdbresidue function from the Pfam module to get a dictionary from the MSA column index to the PDB residue number:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"pfam_id = \"PF09645\"\nmsacol2pdbres = msacolumn2pdbresidue(msa, seq_id, pdb_code, pdb_chain, pfam_id, sifts_file)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This dictionary has the mapping between MSA column and PDB residue that allows the mapping between evolutionary and structural information. 
For example, to measure the correlation between entropy (related to residue variation in an MSA column) and the mean B factor of the residue:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Information\nHx = mapcolfreq!(\n shannon_entropy,\n msa,\n Frequencies(ContingencyTable(Int, Val{1}, UngappedAlphabet())),\n)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To get quick access to each PDB residue based on its residue number, we can read the PDB file into a dictionary using the read_file and residuesdict functions from the MIToS PDB module:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.PDB\nres_dict = residuesdict(\n read_file(pdb_file, PDBFile, occupancyfilter = true),\n model = \"1\",\n chain = \"A\",\n)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we can iterate the mapping dictionary to link the MSA and PDB based values:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using Statistics\n\nx = Float64[]\ny = Float64[]\n\nfor (col_index, res_number) in msacol2pdbres\n if res_number != \"\" # i.e. 
MSA column has an associated PDB residue\n push!(x, Hx[col_index])\n push!(y, mean(parse(Float64, atom.B) for atom in res_dict[res_number].atoms))\n end\nend\n\ncor(x, y)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/#Unknown-sequence-coordinates","page":"Linking structural and evolutionary information","title":"Unknown sequence coordinates","text":"","category":"section"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"While Pfam alignments have the start and end of the aligned region indicated in the sequence name, other multiple sequence alignments don't give any hint about that. In those cases, we should use pairwise alignments. However, instead of aligning the sequence coming from the MSA and the PDB sequence, we can align the MSA sequence to the UniProt sequence to reduce the possibility of mapping errors. 
Once we have the mapping of the MSA sequence to the UniProt sequence, we can use SIFTS to map the PDB sequence to the MSA sequence using the UniProt numeration.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"For this example, we are going to use the following files included in MIToS documentation:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS\npdb_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"1dur.pdb\")\nmsa_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"blast_alignment.fa\")\nsifts_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"1dur.xml.gz\")\nuniprot_file = abspath(pathof(MIToS), \"..\", \"..\", \"docs\", \"data\", \"P00193.fasta\")\nnothing # hide","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"First, we are going to read the MSA file. In this case, we can not use useidcoordinates=true because the sequence names don't have the sequence coordinates in the Pfam format. 
However, we are going to use generatemapping=true to get the default mapping for each sequence in the alignment (from 1 to the length of the aligned region):","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.MSA\nmsa = read_file(msa_file, FASTA, generatemapping = true)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"After that, we get the first sequence of the MSA, the one we know that corresponds to the PDB of interest. We need the sequence as a String without gaps (unaligned), so we use the MIToS.MSA stringsequence function together with replace:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"msa_seq = replace(stringsequence(msa, 1), '-' => \"\")","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Also, we are going to read the UniProt sequence. 
You can easily download the sequence from UniProt by doing:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.Utils\ndownload_file(\"https://www.uniprot.org/uniprot/P00193.fasta\", \"P00193.fasta\")","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To read the FASTA file we are going to use the FastaIO package:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using FastaIO\nuniprot_sequences = readfasta(uniprot_file)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"And get the unique sequence:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"uniprot_seq = uniprot_sequences[1][2]","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"We can perform a pairwise sequence alignment between both sequences by using the BioAlignments package from the BioJulia suite. 
In this case, we use a semi-global alignment (no start/end gap penalty) because we know that the MSA sequence is a region of the UniProt sequence.","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using BioAlignments\ncostmodel = AffineGapScoreModel(BLOSUM62, gap_open = -10, gap_extend = -1)\naln = pairalign(SemiGlobalAlignment(), msa_seq, uniprot_seq, costmodel)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we only need to iterate the alignment to designate the positions and store the equivalences in a dictionary:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"function seq2refnumber(aln)\n seq_pos = 0\n ref_pos = 0\n last_seq_pos = 0\n seq2ref = Dict{Int,Int}()\n for (seq_res, ref_res) in alignment(aln)\n if seq_res != '-'\n seq_pos += 1\n end\n if ref_res != '-'\n ref_pos += 1\n end\n if seq_pos != last_seq_pos\n seq2ref[seq_pos] = ref_pos\n last_seq_pos = seq_pos\n end\n end\n seq2ref\nend\n\nseqnum2uniprotnum = seq2refnumber(aln)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"Then, we can use getsequencemapping to go from MSA column number to UniProt residue, and siftsmapping to go from UniProt to PDB:","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"seqmap = 
getsequencemapping(msa, 1)","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"colnum2uniprotnum = Dict{Int,Int}()\nfor (colnum, seqnum) in enumerate(seqmap)\n if seqnum != 0 # getsequencemapping returns 0 where there is a gap\n colnum2uniprotnum[colnum] = seqnum2uniprotnum[seqnum]\n end\nend\ncolnum2uniprotnum","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"using MIToS.SIFTS\n\nuniprotnum2pdbnum = siftsmapping(\n sifts_file,\n dbUniProt,\n \"P00193\",\n dbPDB,\n \"1dur\", # SIFTS stores PDB identifiers in lowercase\n chain = \"A\",\n missings = false,\n) # residues without coordinates aren't used in the mapping","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"To finally get the dictionary from MSA column index to PDB residue number","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"colnum2pdbnum = Dict{Int,String}()\nfor (colnum, uniprotnum) in colnum2uniprotnum\n pdbresnum = get(uniprotnum2pdbnum, string(uniprotnum), \"\")\n if 
pdbresnum != \"\"\n colnum2pdbnum[colnum] = pdbresnum\n end\nend\n\ncolnum2pdbnum","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"","category":"page"},{"location":"02_Linking_structural_and_evolutionary_information/","page":"Linking structural and evolutionary information","title":"Linking structural and evolutionary information","text":"This page was generated using Literate.jl.","category":"page"},{"location":"Information_API/","page":"Information","title":"Information","text":"@info \"Information API docs\"","category":"page"},{"location":"Information_API/#Information","page":"Information","title":"Information","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"MIToS.Information","category":"page"},{"location":"Information_API/#MIToS.Information","page":"Information","title":"MIToS.Information","text":"The Information module of MIToS defines types and functions useful to calculate information measures (e.g. Mutual Information (MI) and Entropy) over a Multiple Sequence Alignment (MSA). This module was designed to count Residues (defined in the MSA module) in special contingency tables (as fast as possible) and to derive probabilities from this counts. Also, includes methods for applying corrections to that tables, e.g. pseudocounts and pseudo frequencies. 
Finally, Information allows to use this probabilities and counts to estimate information measures and other frequency based values.\n\nFeatures\n\nEstimate multi dimensional frequencies and probabilities tables from sequences, MSAs, etc...\nCorrection for small number of observations\nCorrection for data redundancy on a MSA\nEstimate information measures\nCalculate corrected mutual information between residues\n\nusing MIToS.Information\n\n\n\n\n\n","category":"module"},{"location":"Information_API/#Contents","page":"Information","title":"Contents","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Pages = [\"Information_API.md\"]\nDepth = 2","category":"page"},{"location":"Information_API/#Types","page":"Information","title":"Types","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"Information_API/#MIToS.Information.AdditiveSmoothing","page":"Information","title":"MIToS.Information.AdditiveSmoothing","text":"Additive Smoothing or fixed pseudocount λ for ResidueCount (in order to estimate probabilities when the number of samples is low).\n\nCommon values of λ are:\n\n0 : No cell frequency prior, gives you the maximum likelihood estimator.\n0.05 is the optimum value for λ found in Buslje et al. 2009, similar results was obtained for λ in the range [0.025, 0.075].\n1 / p : Perks prior (Perks, 1947) where p the number of parameters (i.e. residues, pairs of residues) to estimate. If p is the number of residues (20 without counting gaps), this gives you 0.05.\nsqrt(n) / p : Minimax prior (Trybula, 1958) where n is the number of samples and p the number of parameters to estimate. If the number of samples n is 400 (minimum number of sequence clusters for achieve good performance in Buslje et al. 
2009) for estimating 400 parameters (pairs of residues without counting gaps) this gives you 0.05.\n0.5 : Jeffreys prior (Jeffreys, 1946).\n1 : Bayes-Laplace uniform prior, aka. Laplace smoothing.\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\nPerks, Wilfred. \"Some observations on inverse probability including a new indifference rule.\" Journal of the Institute of Actuaries 73.2 (1947): 285-334.\nTrybula, Stanislaw. \"Some problems of simultaneous minimax estimation.\" The Annals of Mathematical Statistics 29.1 (1958): 245-253.\nJeffreys, Harold. \"An invariant form for the prior probability in estimation problems.\" Proceedings of the Royal Society of London. Series A. Mathematical and Physical Sciences 186.1007 (1946): 453-461.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.BLOSUM_Pseudofrequencies","page":"Information","title":"MIToS.Information.BLOSUM_Pseudofrequencies","text":"BLOSUM_Pseudofrequencies type. It takes to arguments/fields:\n\nα : Usually the number of sequences or sequence clusters in the MSA.\nβ : The weight of the pseudofrequencies, a value close to 8.512 when α is the number of sequence clusters.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.ContingencyTable","page":"Information","title":"MIToS.Information.ContingencyTable","text":"A ContingencyTable is a multidimensional array. It stores the contingency matrix, its marginal values and total. The type also has an internal and private temporal array and an alphabet object. 
It's a parametric type, taking three ordered parameters:\n\nT : The element type of the multidimensional array.\nN : It's the dimension of the array and should be an Int.\nA : This should be a type, subtype of ResidueAlphabet, i.e.: UngappedAlphabet, GappedAlphabet or ReducedAlphabet.\n\nA ContingencyTable can be created from an alphabet if all the parameters are given. Otherwise, you need to give a type, a number (Val) and an alphabet. You can also create a ContingencyTable using a matrix and a alphabet. For example:\n\nContingencyTable{Float64,2,UngappedAlphabet}(UngappedAlphabet())\nContingencyTable(Float64, Val{2}, UngappedAlphabet())\nContingencyTable(zeros(Float64, 20, 20), UngappedAlphabet())\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Frequencies","page":"Information","title":"MIToS.Information.Frequencies","text":"A Frequencies object wraps a ContingencyTable storing counts/frequencies.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.NoPseudocount","page":"Information","title":"MIToS.Information.NoPseudocount","text":"You can use NoPseudocount() to avoid pseudocount corrections where a Pseudocount type is needed.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.NoPseudofrequencies","page":"Information","title":"MIToS.Information.NoPseudofrequencies","text":"You can use NoPseudofrequencies() to avoid pseudocount corrections where a Pseudofrequencies type is needed.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Probabilities","page":"Information","title":"MIToS.Information.Probabilities","text":"A Probabilities object wraps a ContingencyTable storing probabilities. It doesn't perform any check. 
If the total isn't one, you must use normalize or normalize!on the ContingencyTable before wrapping it to make the sum of the probabilities equal to one.\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Pseudocount","page":"Information","title":"MIToS.Information.Pseudocount","text":"Parametric abstract type to define pseudocount types\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#MIToS.Information.Pseudofrequencies","page":"Information","title":"MIToS.Information.Pseudofrequencies","text":"Parametric abstract type to define pseudofrequencies types\n\n\n\n\n\n","category":"type"},{"location":"Information_API/#Constants","page":"Information","title":"Constants","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"Information_API/#MIToS.Information.BLOSUM62_Pi","page":"Information","title":"MIToS.Information.BLOSUM62_Pi","text":"BLOSUM62 probabilities P(aa) for each residue on the UngappedAlphabet. SUM: 0.9987\n\n\n\n\n\n","category":"constant"},{"location":"Information_API/#MIToS.Information.BLOSUM62_Pij","page":"Information","title":"MIToS.Information.BLOSUM62_Pij","text":"Table with conditional probabilities of residues based on BLOSUM62. The normalization is done row based. 
The first row contains the P(aa|A) and so on.\n\n\n\n\n\n","category":"constant"},{"location":"Information_API/#Macros","page":"Information","title":"Macros","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"Information_API/#Methods-and-functions","page":"Information","title":"Methods and functions","text":"","category":"section"},{"location":"Information_API/","page":"Information","title":"Information","text":"Modules = [MIToS.Information]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"Information_API/#Base.count!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Any, MIToS.Information.Pseudocount, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"Base.count!","text":"It populates a ContingencyTable (first argument) using the frequencies in the sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights and pseudocounts as second and third positional arguments respectively. You can use NoClustering() and NoPseudocount() to avoid the use of sequence weighting and pseudocounts, respectively.\n\nDEPRECATED: Use frequencies! instead. Note that frequencies! defines the weights and pseudocounts using keyword arguments instead of positional arguments.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#Base.count-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"Base.count","text":"It returns a ContingencyTable wrapped in a Frequencies type with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. 
You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()) and the pseudocounts (default to NoPseudocount()) to be used during the estimation of the frequencies.\n\nDEPRECATED: Use frequencies instead. Note that frequencies defines the alphabet, weights and pseudocounts using keyword arguments instead of positional arguments.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#LinearAlgebra.normalize!-Union{Tuple{MIToS.Information.ContingencyTable{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"LinearAlgebra.normalize!","text":"normalize! makes the sum of the frequencies to be one, in place.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#LinearAlgebra.normalize-Union{Tuple{MIToS.Information.ContingencyTable{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"LinearAlgebra.normalize","text":"normalize returns another table where the sum of the frequencies is one.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.APC!-Union{Tuple{Matrix{T}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.APC!","text":"APC\n\nReferences\n\nDunn, Stanley D., Lindi M. Wahl, and Gregory B. Gloor. \"Mutual information without the influence of phylogeny or entropy dramatically improves residue contact prediction.\" Bioinformatics 24.3 (2008): 333-340.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.BLMI-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.BLMI","text":"BLMI takes an MSA and calculates a Z score (ZBLMI) and a corrected MI/MIp as described on Buslje et al. 
2009 but using BLOSUM62 pseudo frequencies instead of a fixed pseudocount.\n\nKeyword argument, type, default value and descriptions:\n\n - beta Float64 8.512 β for BLOSUM62 pseudo frequencies\n - lambda Float64 0.0 Low count value\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 50 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n\nThis function returns:\n\n - Z score (ZBLMI)\n - MI or MIp using BLOSUM62 pseudo frequencies (BLMI/BLMIp)\n\nReferences\n\nBuslje, Cristina Marino, et al. \"Correction for phylogeny, small number of observations and data redundancy improves the identification of coevolving amino acid pairs using mutual information.\" Bioinformatics 25.9 (2009): 1125-1131.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.apply_pseudocount!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, T}} where {T, N, A}","page":"Information","title":"MIToS.Information.apply_pseudocount!","text":"It adds the pseudocount value to the table cells.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.apply_pseudofrequencies!-Union{Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, 2, MIToS.MSA.UngappedAlphabet}, MIToS.Information.BLOSUM_Pseudofrequencies}} where T","page":"Information","title":"MIToS.Information.apply_pseudofrequencies!","text":"apply_pseudofrequencies!{T}(Pab::ContingencyTable{T,2,UngappedAlphabet}, pseudofrequencies::BLOSUM_Pseudofrequencies)\n\nWhen a BLOSUM_Pseudofrequencies(α,β) is used, this function applies pseudofrequencies Gab over Pab, as a weighted mean of both. It uses the conditional probability matrix BLOSUM62_Pij and the real frequencies/probabilities Pab to estimate the pseudofrequencies Gab. 
α is the weight of the real frequencies Pab and β the weight of the pseudofrequencies.\n\nGab = Σcd Pcd ⋅ BLOSUM62( a | c ) ⋅ BLOSUM62( b | d ) Pab = (α ⋅ Pab + β ⋅ Gab )/(α + β)\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.buslje09-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.buslje09","text":"buslje09 takes a MSA and calculates a Z score and a corrected MI/MIp as described on Busjle et al. 2009.\n\nkeyword argument, type, default value and descriptions:\n\n - lambda Float64 0.05 Low count value\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for clustering\n - maxgap Float64 0.5 Maximum fraction of gaps in positions included in calculation\n - apc Bool true Use APC correction (MIp)\n - samples Int 100 Number of samples for Z-score\n - fixedgaps Bool true Fix gaps positions for the random samples\n - alphabet ResidueAlphabet UngappedAlphabet() Residue alphabet to be used\n\nThis function returns:\n\n - Z score\n - MI or MIp\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.cumulative-Union{Tuple{VT}, Tuple{D}, Tuple{T}, Tuple{PairwiseListMatrices.PairwiseListMatrix{T, D, VT}, T}} where {T, D, VT}","page":"Information","title":"MIToS.Information.cumulative","text":"cumulative allows to calculate cumulative scores (i.e. cMI) as defined in Marino Buslje et al. 2010:\n\n\"We calculated a cumulative mutual information score (cMI) for each residue as the sum of MI values above a certain threshold for every amino acid pair where the particular residue appears. This value defines to what degree a given amino acid takes part in a mutual information network.\"\n\nReferences\n\nMarino Buslje, Cristina, et al. 
\"Networks of high mutual information define the structural proximity of catalytic sites: implications for catalytic residue identification.\" PLoS computational biology 6.11 (2010): e1000978.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.delete_dimensions!-Union{Tuple{A}, Tuple{S}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, S, A}, MIToS.Information.ContingencyTable{T, N, A}, Vararg{Int64}}} where {T, N, S, A}","page":"Information","title":"MIToS.Information.delete_dimensions!","text":"delete_dimensions!(out::ContingencyTable, in::ContingencyTable, dimensions::Int...)\n\nThis function fills a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.delete_dimensions-Union{Tuple{I}, Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Vararg{Int64, I}}} where {T, N, A, I}","page":"Information","title":"MIToS.Information.delete_dimensions","text":"delete_dimensions(in::ContingencyTable, dimensions::Int...)\n\nThis function creates a ContingencyTable with the counts/probabilities on in after the deletion of dimensions. i.e. This is useful for getting Pxy from Pxyz.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.frequencies!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"MIToS.Information.frequencies!","text":"frequencies!(table, seqs...; weights::WeightTypes, pseudocounts::Pseudocount)\n\nIt populates a ContingencyTable or Frequencies table (first argument) using the frequencies in the given sequences (last positional arguments). The dimension of the table must match the number of sequences and all the sequences must have the same length. 
You must indicate the used weights and pseudocounts as keyword arguments. Those arguments default to NoClustering() and NoPseudocount() respectively, to avoid the use of sequence weighting and pseudocounts.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.frequencies-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"MIToS.Information.frequencies","text":"frequencies(seqs...; alphabet=UngappedAlphabet(), weights=NoClustering(), pseudocounts=NoPseudocount()\n\nThis function returns a Frequencies object wrapping a ContingencyTable with the frequencies of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights and pseudocounts to indicate the alphabet of the table, a clustering result and the pseudocounts to be used during the estimation of the frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gap_intersection_percentage-Union{Tuple{MIToS.Information.Frequencies{T, 2, MIToS.MSA.GappedAlphabet}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.gap_intersection_percentage","text":"It calculates the gap intersection as percentage from a table of Frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gap_union_percentage-Union{Tuple{MIToS.Information.Frequencies{T, 2, MIToS.MSA.GappedAlphabet}}, Tuple{T}} where T","page":"Information","title":"MIToS.Information.gap_union_percentage","text":"It calculates the gap union as percentage from a table of Frequencies.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gaussdca-Tuple{Any}","page":"Information","title":"MIToS.Information.gaussdca","text":"Wrapper function to GaussDCA.gDCA. 
You need to install GaussDCA:\n\nusing Pkg\n\nPkg.add(PackageSpec(url = \"https://github.com/carlobaldassi/GaussDCA.jl\", rev = \"master\"))\n\nLook into GaussDCA.jl README for further information. If you use this wrapper, please cite the GaussDCA publication and the package's doi.\n\nIt's possible to indicate the path to the julia binary where GaussDCA is installed. However, it's recommended to use the same version where MIToS is installed. That is because this function use serialize/deserialize to transfer data between the processes.\n\nGaussDCA Publication: Baldassi, Carlo, Marco Zamparo, Christoph Feinauer, Andrea Procaccini, Riccardo Zecchina, Martin Weigt, and Andrea Pagnani. \"Fast and accurate multivariate Gaussian modeling of protein families: predicting residue contacts and protein-interaction partners.\" PloS one 9, no. 3 (2014): e92721.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getalphabet-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getalphabet","text":"getalphabet allows to access the stored alphabet object.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getcontingencytable-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.getcontingencytable","text":"getcontingencytable allows to access the wrapped ContingencyTable in a Probabilities or Frequencies object.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getmarginals-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getmarginals","text":"getmarginals allows to access the array with the marginal values 
(NamedArray).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.getmarginalsarray-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.getmarginalsarray","text":"getmarginalsarray allows to access the array with the marginal values (Array without names).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettable-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettable","text":"gettable allows to access the table (NamedArray).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettablearray-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettablearray","text":"gettablearray allows to access the table (Array without names).\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.gettotal-Tuple{MIToS.Information.ContingencyTable}","page":"Information","title":"MIToS.Information.gettotal","text":"gettotal allows to access the stored total value.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.kullback_leibler-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.kullback_leibler","text":"kullback_leibler(msa::AbstractArray{Residue}; background::Union{Array{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, base::Number=ℯ, kargs...)\n\nIt calculates the Kullback-Leibler (KL) divergence from a multiple sequence alignment (MSA). You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. 
You can use base = 2 to get the result in bits.\n\nThe other keyword arguments are passed to the mapfreq function.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.kullback_leibler-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T<:Number, N, A<:MIToS.MSA.ResidueAlphabet}","page":"Information","title":"MIToS.Information.kullback_leibler","text":"kullback_leibler(probabilities::Probabilities{T,N,A}, background::Union{\n AbstractArray{T,N}, Probabilities{T,N,A}, ContingencyTable{T,N,A}}=BLOSUM62_Pi, \n base::Number=ℯ)\n\nIt calculates the Kullback-Leibler (KL) divergence from a table of Probabilities. You can use the keyword argument background to set the background distribution. This argument can take an Array, Probabilities, or ContingencyTable object. The background distribution must have the same size and alphabet as the probabilities. The default is the BLOSUM62_Pi table. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapcolfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 1, A}, MIToS.Information.Probabilities{T, 1, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapcolfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). 
The table is filled in place with the counts or probabilities of each column from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapcolpairfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 2, A}, MIToS.Information.Probabilities{T, 2, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapcolpairfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of columns from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true): If true, the function will be also applied to the diagonal elements.\ndiagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapfreq-Tuple{Function, AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.mapfreq","text":"mapfreq(f, msa; rank = 1, dims = 2, alphabet = UngappedAlphabet(), \n weights = NoClustering(), pseudocounts = NoPseudocount(), \n pseudofrequencies = NoPseudofrequencies(), probabilities = true, \n usediagonal = false, diagonalvalue = NaN, kargs...)\n\nIt efficiently map a function (first argument) that takes a table of Frequencies 
or Probabilities (depending on the probabilities keyword argument) calculated on sequences (dims = 1) or columns (dims = 2, the default) of an msa (second argument). If rank = 1, the default, the function is applied to each sequence or column. If rank = 2, the function is applied to each pair of sequences or columns. In that case, we can set the usediagonal keyword argument to true to apply the function to pairs of the same sequence or column. The diagonalvalue keyword argument is used to set the value of the diagonal elements if usediagonal is false. By default, the function is not applied to the diagonal elements (i.e. usediagonal = false) and the diagonalvalue is set to NaN. The alphabet keyword argument can be used to set the alphabet used to construct the contingency table. The function also accepts the following keyword arguments:\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\nNote that the pseudofrequencies argument is only valid if probabilities = true. 
All the other keyword arguments are passed to the function f.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(1), Residue, 3, 6) # random MSA as an example\n3×6 Matrix{Residue}:\n F A F D E V\n T R R G F I\n N V S W Q T\n\njulia> mapfreq(sum, msa) # default: rank=1, dims=2, probabilities=true\n1×6 Named Matrix{Float64}\nFunction ╲ Col │ 1 2 3 4 5 6\n───────────────┼─────────────────────────────\nsum │ 1.0 1.0 1.0 1.0 1.0 1.0\n\njulia> mapfreq(sum, msa, probabilities=false)\n1×6 Named Matrix{Float64}\nFunction ╲ Col │ 1 2 3 4 5 6\n───────────────┼─────────────────────────────\nsum │ 3.0 3.0 3.0 3.0 3.0 3.0\n\njulia> mapfreq(sum, msa, dims=1)\n3×1 Named Matrix{Float64}\nSeq ╲ Function │ sum\n───────────────┼────\n1 │ 1.0\n2 │ 1.0\n3 │ 1.0\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapseqfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 1, A}, MIToS.Information.Probabilities{T, 1, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapseqfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). 
The table is filled in place with the counts or probabilities of each sequence from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mapseqpairfreq!-Union{Tuple{A}, Tuple{T}, Tuple{Function, AbstractMatrix{MIToS.MSA.Residue}, Union{MIToS.Information.Frequencies{T, 2, A}, MIToS.Information.Probabilities{T, 2, A}}}} where {T, A}","page":"Information","title":"MIToS.Information.mapseqpairfreq!","text":"It efficiently map a function (first argument) that takes a table of Frequencies or Probabilities (third argument). The table is filled in place with the counts or probabilities of each pair of sequences from the msa (second argument).\n\nweights (default: NoClustering()): Weights to be used for table counting.\npseudocounts (default: NoPseudocount()): Pseudocount object to be applied to table.\npseudofrequencies (default: NoPseudofrequencies()): Pseudofrequencies to be applied to the normalized (probabilities) table.\nusediagonal (default: true): If true, the function will be also applied to the diagonal elements.\ndiagonalvalue (default: zero): Value to fill diagonal elements if usediagonal is false.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.marginal_entropy-Union{Tuple{Union{MIToS.Information.Frequencies{T, N, A}, MIToS.Information.Probabilities{T, N, A}}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.marginal_entropy","text":"marginal_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; margin::Int=1, \n base::Number=ℯ)\n\nIt calculates marginal entropy (H) from a table of Frequencies or Probabilities. 
It takes two keyword arguments: margin and base. The first one is used to indicate the margin used to calculate the entropy, e.g. it estimates the entropy H(X) if margin is 1, H(Y) for 2, etc. The default value of margin is 1. The second keyword argument is used to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(msa::AbstractArray{Residue}; base::Number=ℯ, kargs...)\n\nIt calculates Mutual Information (MI) from a multiple sequence alignment (MSA). The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. The minimum value for rank is 2 (the default value). By default, it uses counts/frequencies to calculate the MI, as it's faster. You can use the keyword argument probabilities = true to calculate the MI from probabilities.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)\n3×4 Matrix{Residue}:\n T R F K\n S H C I\n G G R V\n\njulia> mi = mutual_information(msa);\n\njulia> mi[1, 2]\n1.0986122886681098\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Union{Tuple{MIToS.Information.Probabilities{T, 2, A}}, Tuple{A}, Tuple{T}} where {T, A}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(table::Union{Frequencies{T,2,A},Probabilities{T,2,A}}; base::Number=ℯ)\n\nIt calculates Mutual Information (MI) from a table of Frequencies or Probabilities. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. 
Note that calculating MI from Frequencies is faster than from Probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.mutual_information-Union{Tuple{Union{MIToS.Information.Frequencies{T, 3, A}, MIToS.Information.Probabilities{T, 3, A}}}, Tuple{A}, Tuple{T}} where {T, A}","page":"Information","title":"MIToS.Information.mutual_information","text":"mutual_information(table::Union{Frequencies{T,3,A},Probabilities{T,3,A}}; base::Number=ℯ)\n\nIt calculates Mutual Information (MI) from a table of Frequencies or Probabilities with three dimensions. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\njulia> using Random, MIToS.MSA, MIToS.Information\n\njulia> msa = rand(Random.MersenneTwister(37), Residue, 3, 4)\n3×4 Matrix{Residue}:\n T R F K\n S H C I\n G G R V\n\njulia> Nxyz = frequencies(msa[:, 1], msa[:, 2], msa[:, 3]);\n\njulia> mutual_information(Nxyz)\n1.0986122886681093\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.normalized_mutual_information-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.normalized_mutual_information","text":"normalized_mutual_information(msa::AbstractArray{Residue}; kargs...)\n\nThis function calculates the Normalized Mutual Information (nMI) from a multiple sequence alignment using the mapfreq function—all the keyword arguments are passed to mapfreq. 
The mutual information score is normalized by the joint entropy of the two variables: nMI(X, Y) = MI(X, Y) / H(X, Y) By default, it uses counts/frequencies to estimate the nMI, as it's faster than using probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.normalized_mutual_information-Union{Tuple{Union{MIToS.Information.Frequencies{T, N, A}, MIToS.Information.Probabilities{T, N, A}}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.normalized_mutual_information","text":"It calculates a Normalized Mutual Information (nMI) from a table of Frequencies or Probabilities. The mutual information score is normalized by the joint entropy of the two variables: nMI(X, Y) = MI(X, Y) / H(X, Y)\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.pairwisegapfraction-Tuple{AbstractMatrix{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.pairwisegapfraction","text":"It takes a MSA or a file and a FileFormat as first arguments. It calculates the percentage of gaps on columns pairs (union and intersection) using sequence clustering (Hobohm I).\n\nArgument, type, default value and descriptions:\n\n - clustering Bool true Sequence clustering (Hobohm I)\n - threshold 62 Percent identity threshold for sequence clustering (Hobohm I)\n\nThis function returns:\n\n - pairwise gap union as percentage\n - pairwise gap intersection as percentage\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.probabilities!-Union{Tuple{A}, Tuple{N}, Tuple{T}, Tuple{MIToS.Information.ContingencyTable{T, N, A}, Any, MIToS.Information.Pseudocount, MIToS.Information.Pseudofrequencies, Vararg{AbstractArray{MIToS.MSA.Residue}, N}}} where {T, N, A}","page":"Information","title":"MIToS.Information.probabilities!","text":"It populates a ContingencyTable (first argument) using the probabilities in the sequences (last positional arguments). 
The dimension of the table must match the number of sequences and all the sequences must have the same length. You must indicate the used weights, pseudocounts and pseudofrequencies as second, third and fourth positional arguments respectively. You can use NoClustering(), NoPseudocount() and NoPseudofrequencies() to avoid the use of sequence weighting, pseudocounts and pseudofrequencies, respectively.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.probabilities-Union{Tuple{Vararg{AbstractArray{MIToS.MSA.Residue}, N}}, Tuple{N}} where N","page":"Information","title":"MIToS.Information.probabilities","text":"It returns a ContingencyTable wrapped in a Probabilities type with the probabilities of residues in the sequences that takes as arguments. The dimension of the table is equal to the number of sequences. You can use the keyword arguments alphabet, weights, pseudocounts and pseudofrequencies to indicate the alphabet of the table (default to UngappedAlphabet()), a clustering result (default to NoClustering()), the pseudocounts (default to NoPseudocount()) and the pseudofrequencies (default to NoPseudofrequencies()) to be used during the estimation of the probabilities.\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.shannon_entropy-Tuple{AbstractArray{MIToS.MSA.Residue}}","page":"Information","title":"MIToS.Information.shannon_entropy","text":"shannon_entropy(msa::AbstractArray{Residue}; base::Number=ℯ, \n probabilities::Bool=false, usediagonal::Bool=true, kargs...)\n\nIt calculates the Shannon entropy (H) on a MSA. You can use the keyword argument base to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits. It uses mapfreq under the hood, so it takes the same keyword arguments. By default, it measures the entropy of each column in the MSA. You can use dims = 1 to measure the entropy of each sequence. 
You can also set rank = 2 to measure the joint entropy of each pair of sequences or columns. This function sets by default the probabilities keyword argument to false because it's faster to calculate the entropy from counts/frequencies. It also sets usediagonal = true to also calculate the entropy of the individual variables (sequences or columns).\n\njulia> using MIToS.MSA, MIToS.Information\n\njulia> msa = Residue['C' 'G'; 'C' 'L'; 'C' 'I']\n3×2 Matrix{Residue}:\n C G\n C L\n C I\n\njulia> shannon_entropy(msa)\n1×2 Named Matrix{Float64}\n Function ╲ Col │ 1 2\n────────────────┼─────────────────\nshannon_entropy │ 0.0 1.09861\n\n\n\n\n\n\n","category":"method"},{"location":"Information_API/#MIToS.Information.shannon_entropy-Union{Tuple{MIToS.Information.Probabilities{T, N, A}}, Tuple{A}, Tuple{N}, Tuple{T}} where {T, N, A}","page":"Information","title":"MIToS.Information.shannon_entropy","text":"shannon_entropy(table::Union{Frequencies{T,N,A},Probabilities{T,N,A}}; base::Number=ℯ)\n\nIt calculates the Shannon entropy (H) from a table of Frequencies or Probabilities. Use last and optional positional argument to change the base of the log. The default base for the log is ℯ (base=ℯ), so the result is in nats. You can use base = 2 to get the result in bits.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"@info \"SIFTS API docs\"","category":"page"},{"location":"SIFTS_API/#SIFTS","page":"SIFTS","title":"SIFTS","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"MIToS.SIFTS","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS","page":"SIFTS","title":"MIToS.SIFTS","text":"The SIFTS module of MIToS allows to obtain the residue-level mapping between databases stored in the SIFTS XML files. It makes easy to assign PDB residues to UniProt/Pfam positions. 
Given the fact that pairwise alignments can lead to misleading association between residues in both sequences, SIFTS offers more reliable association between sequence and structure residue numbers.\n\nFeatures\n\nDownload and parse SIFTS XML files\nStore residue-level mapping in Julia\nEasy generation of OrderedDicts between residues numbers\n\nusing MIToS.SIFTS\n\n\n\n\n\n","category":"module"},{"location":"SIFTS_API/#Contents","page":"SIFTS","title":"Contents","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Pages = [\"SIFTS_API.md\"]\nDepth = 2","category":"page"},{"location":"SIFTS_API/#Types","page":"SIFTS","title":"Types","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:type]","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS.SIFTSResidue","page":"SIFTS","title":"MIToS.SIFTS.SIFTSResidue","text":"A SIFTSResidue object stores the SIFTS residue level mapping for a residue. It has the following fields that you can access at any moment for query purposes:\n\n- `PDBe` : A `dbPDBe` object, it's present in all the `SIFTSResidue`s.\n- `UniProt` : A `dbUniProt` object or `missing`.\n- `Pfam` : A `dbPfam` object or `missing`.\n- `NCBI` : A `dbNCBI` object or `missing`.\n- `InterPro` : An array of `dbInterPro` objects.\n- `PDB` : A `dbPDB` object or `missing`.\n- `SCOP` : A `dbSCOP` object or `missing`.\n- `SCOP2` : An array of `dbSCOP2` objects.\n- `SCOP2B` : A `dbSCOP2B` object or `missing`.\n- `CATH` : A `dbCATH` object or `missing`.\n- `Ensembl` : An array of `dbEnsembl` objects.\n- `missing` : It's `true` if the residue is missing, i.e. 
not observed, in the structure.\n- `sscode` : A string with the secondary structure code of the residue.\n- `ssname` : A string with the secondary structure name of the residue.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbCATH","page":"SIFTS","title":"MIToS.SIFTS.dbCATH","text":"dbCATH stores the residue id, number, name and chain in CATH as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbEnsembl","page":"SIFTS","title":"MIToS.SIFTS.dbEnsembl","text":"dbEnsembl stores the residue (gene) accession id, the transcript, translation and exon ids in Ensembl as strings, together with the residue number and name using the UniProt coordinates.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbInterPro","page":"SIFTS","title":"MIToS.SIFTS.dbInterPro","text":"dbInterPro stores the residue id, number, name and evidence in InterPro as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbNCBI","page":"SIFTS","title":"MIToS.SIFTS.dbNCBI","text":"dbNCBI stores the residue id, number and name in NCBI as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPDB","page":"SIFTS","title":"MIToS.SIFTS.dbPDB","text":"dbPDB stores the residue id, number, name and chain in PDB as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPDBe","page":"SIFTS","title":"MIToS.SIFTS.dbPDBe","text":"dbPDBe stores the residue number and name in PDBe as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbPfam","page":"SIFTS","title":"MIToS.SIFTS.dbPfam","text":"dbPfam stores the residue id, number and name in Pfam as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP","text":"dbSCOP stores the residue id, number, name and chain in SCOP as 
strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP2","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP2","text":"dbSCOP2 stores the residue id, number, name and chain in SCOP2 as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbSCOP2B","page":"SIFTS","title":"MIToS.SIFTS.dbSCOP2B","text":"dbSCOP2B stores the residue id, number, name and chain in SCOP2B as strings. SCOP2B is expansion of SCOP2 domain annotations at superfamily level to every PDB with same UniProt accession having at least 80% SCOP2 domain coverage.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#MIToS.SIFTS.dbUniProt","page":"SIFTS","title":"MIToS.SIFTS.dbUniProt","text":"dbUniProt stores the residue id, number and name in UniProt as strings.\n\n\n\n\n\n","category":"type"},{"location":"SIFTS_API/#Constants","page":"SIFTS","title":"Constants","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:constant]","category":"page"},{"location":"SIFTS_API/#Macros","page":"SIFTS","title":"Macros","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:macro]","category":"page"},{"location":"SIFTS_API/#Methods-and-functions","page":"SIFTS","title":"Methods and functions","text":"","category":"section"},{"location":"SIFTS_API/","page":"SIFTS","title":"SIFTS","text":"Modules = [MIToS.SIFTS]\nPrivate = false\nOrder = [:function]","category":"page"},{"location":"SIFTS_API/#MIToS.SIFTS.downloadsifts-Tuple{String}","page":"SIFTS","title":"MIToS.SIFTS.downloadsifts","text":"downloadsifts(pdbcode::String; filename::String, source::String=\"https\")\n\nDownload the gzipped SIFTS XML file for the provided pdbcode. The downloaded file will have the default extension .xml.gz. While you can change the filename, it must include the .xml.gz ending. 
The source keyword argument is set to \"https\" by default. Alternatively, you can choose \"ftp\" as the source, which will retrieve the file from the EBI FTP server at ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/. However, please note that using \"https\" is highly recommended. This option will download the file from the EBI PDBe server at https://www.ebi.ac.uk/pdbe/files/sifts/.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/#MIToS.SIFTS.siftsmapping-Union{Tuple{T}, Tuple{F}, Tuple{String, Type{F}, String, Type{T}, String}} where {F, T}","page":"SIFTS","title":"MIToS.SIFTS.siftsmapping","text":"Parses a SIFTS XML file and returns a OrderedDict between residue numbers of two DataBases with the given identifiers. A chain could be specified (All by default). If missings is true (default) all the residues are used, even if they haven’t coordinates in the PDB file.\n\n\n\n\n\n","category":"method"},{"location":"SIFTS_API/#MIToS.Utils.parse_file-Tuple{LightXML.XMLDocument, Type{MIToS.SIFTS.SIFTSXML}}","page":"SIFTS","title":"MIToS.Utils.parse_file","text":"parse_file(document::LightXML.XMLDocument, ::Type{SIFTSXML}; chain=All, missings::Bool=true)\n\nReturns a Vector{SIFTSResidue} parsed from a SIFTSXML file. By default, parses all the chains and includes missing residues.\n\n\n\n\n\n","category":"method"},{"location":"","page":"Home","title":"Home","text":"\"MIToS\"/\n\"MIToS\"/","category":"page"},{"location":"","page":"Home","title":"Home","text":"A Julia Package to Analyze Protein Sequences, Structures, and Evolutionary Information","category":"page"},{"location":"#Modules","page":"Home","title":"Modules","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"MIToS tools are separated into different modules for different tasks.","category":"page"},{"location":"","page":"Home","title":"Home","text":"MSA: This module defines multiple functions and types for dealing with Multiple Sequence Alignments (MSAs) and their annotations. 
It also includes facilities for sequence clustering and shuffling, among others.\nPDB: This module defines types and methods to work with protein structures from different sources, such as the Protein Data Bank (PDB) or AlphaFold DB. It includes functions to superpose structures, measure the distance between residues, and much more.\nInformation: This module defines residue contingency tables and methods on them to estimate information measures. This allow to measure evolutionary information on MSAs positions. It includes functions to estimate corrected mutual information (ZMIp, ZBLMIp) between MSA columns, as well as conservation estimations using Shannon entropy and the Kullback-Leibler divergence.\nSIFTS: This module allows access to SIFTS residue-level mapping of UniProt, Pfam, and other databases with PDB entries.\nPfam: This module uses the previous modules to work with Pfam MSAs. It also has useful parameter optimization functions to be used with Pfam alignments.\nUtils: MIToS has also a Utils module with common utils functions and types used in different modules of this package.","category":"page"},{"location":"#Citation","page":"Home","title":"Citation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"If you use MIToS [1], please cite:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Diego J. Zea, Diego Anfossi, Morten Nielsen, Cristina Marino-Buslje; MIToS.jl: mutual information tools for protein sequence analysis in the Julia language, Bioinformatics, Volume 33, Issue 4, 15 February 2017, Pages 564–565, https://doi.org/10.1093/bioinformatics/btw646","category":"page"},{"location":"#Older-MIToS-versions","page":"Home","title":"Older MIToS versions","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"You can change the MIToS version of the documentation at the bottom left of this site—the older version available is MIToS 2.0. 
If you are using MIToS v1 in a version of Julia pre-1.0, please read this older documentation instead.","category":"page"},{"location":"#Acknowledgments","page":"Home","title":"Acknowledgments","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"MIToS was initially developed at the Structural Bioinformatics Unit of the Fundación Instituto Leloir (FIL) in Argentina. Its development now continues at the Molecular Assemblies and Genome Integrity group of the Institute for Integrative Biology of the Cell (I2BC) in France.","category":"page"},{"location":"","page":"Home","title":"Home","text":"We want to thank all contributors who have helped improve MIToS. We also thank the Julia community and all the MIToS users for their feedback and support.","category":"page"},{"location":"","page":"Home","title":"Home","text":"\"FIL\n\"FIL","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"@info \"Installation docs\"","category":"page"},{"location":"Installation/#Installation","page":"Installation","title":"Installation","text":"","category":"section"},{"location":"Installation/","page":"Installation","title":"Installation","text":"First you need to install Julia.(Image: ) MIToS' stable version can be installed by typing on the Julia REPL:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"MIToS\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"If everything goes well with the installation, MIToS will be loaded without errors by typing:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using MIToS","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"To update MIToS to the latest version, you can 
run:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.update(\"MIToS\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"tip: Ways to run Julia\nJulia REPL (Image: ): Built-in Julia command line. Start a Julia interactive session (REPL) by double-clicking the Julia executable or running julia from the system command line.\nIJulia (Image: ): Jupyter/IPython notebook for Julia.\nPluto (Image: ): A simple reactive notebook for Julia.\nVS Code Extension for Julia (Image: ): The Julia's Integrated Development Environment (IDE).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"info: Running the test suite\nOptionally, you can run the test suite to ensure everything works as expected. The test suite is extensive and can take several minutes to run. It is the same test suite used for MIToS' continuous integration (CI), so everything should pass. To run the test suite, execute using Pkg; Pkg.test(\"MIToS\") in the Julia REPL.","category":"page"},{"location":"Installation/#Plots-installation","page":"Installation","title":"Plots installation","text":"","category":"section"},{"location":"Installation/","page":"Installation","title":"Installation","text":"Julia plotting capabilities are available through external packages. MIToS makes use of RecipesBase to define plot recipes, which can be plotted using Plots(Image: ) and its different backends. You need to install Plots(Image: ) to plot MIToS objects:","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"Plots\")","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"Once it is installed, you need to load Plots in order to use the plot function. 
There is more information about it in the Plots documentation(Image: ).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Plots","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"To generate graph (network), arc and chord (circo) plots, you also need to install and load GraphRecipes(Image: ).","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"using Pkg\nPkg.add(\"GraphRecipes\")\n\nusing GraphRecipes","category":"page"},{"location":"Installation/","page":"Installation","title":"Installation","text":"You can look for examples in the GraphRecipes documentation(Image: ).","category":"page"}] }