Skip to content

Commit

Permalink
Merge pull request #59 from alneberg/explore_command
Browse files Browse the repository at this point in the history
First version of the explore command
  • Loading branch information
remiolsen authored Jan 29, 2024
2 parents b63e7d8 + 5cb657e commit 28e491b
Show file tree
Hide file tree
Showing 14 changed files with 569 additions and 70 deletions.
33 changes: 33 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
{
"name": "Anglerfish",
"build": {
// Sets the run context to one level up instead of the .devcontainer folder.
"context": "..",
// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
"dockerfile": "../Dockerfile",
// Use devcontainer target
"target": "devcontainer",
},
"initializeCommand": "echo 'Workaround, see github https://github.com/microsoft/vscode-remote-release/issues/9302#issuecomment-1854476541'",
"features": {},
"customizations": {
"vscode": {
"extensions": [
"esbenp.prettier-vscode",
"wholroyd.jinja",
"ms-python.python",
"[email protected]",
"ms-azuretools.vscode-docker",
],
},
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// "postCreateCommand": "pip3 install -e .",
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "devcontainer"
}
1 change: 1 addition & 0 deletions .github/workflows/anglerfish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:
python=${{ matrix.python-version }}
pip
environment-file: environment.yml
environment-name: anglerfish-dev

# Install Anglerfish
- shell: bash -l {0}
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# .pre-commit-config.yaml
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.6
rev: v0.1.13
hooks:
- id: ruff
- id: ruff-format
Expand Down
32 changes: 20 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM mambaorg/micromamba
FROM mambaorg/micromamba as base

LABEL author="Remi-Andre Olsen" \
description="Anglerfish development version" \
Expand All @@ -8,20 +8,13 @@ USER root

# Check for arm64 architecture to install minimap2
ARG TARGETARCH
ENV MINIMAP_VERSION=2.26
RUN if [ "$TARGETARCH" = "arm64" ]; then \
# Install compliation tools for minimap2
apt-get update;\
apt-get install -y curl build-essential libz-dev;\
fi

ENV MINIMAP_VERSION=2.26

RUN if [ "$TARGETARCH" = "arm64" ]; then \
# Download minimap2
curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\
fi

RUN if [ "$TARGETARCH" = "arm64" ]; then \
# Compile minimap2
cd minimap2-${MINIMAP_VERSION};\
make arm_neon=1 aarch64=1; \
Expand All @@ -32,6 +25,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /
COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt /

# Remove minimap2 from environment.yml for arm64
RUN if [ "$TARGETARCH" = "arm64" ]; then \
grep -v 'minimap2' /environment.yml > /environment.tmp.yml ;\
else \
Expand All @@ -45,7 +39,21 @@ WORKDIR /usr/src/anglerfish

# Activate the environment
ARG MAMBA_DOCKERFILE_ACTIVATE=1
RUN micromamba install -y -f /environment.tmp.yml && micromamba clean --all --yes

RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev]
RUN micromamba install -y -n base -f /environment.tmp.yml && micromamba clean --all --yes

#####
# Devcontainer
#####
FROM base as devcontainer

# Useful tools for devcontainer
RUN apt-get update;\
apt-get install -y git vim
RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install -e .[dev]

#####
# Main
#####
FROM base as main
USER $MAMBA_USER
RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev]
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,25 @@ conda install -c bioconda anglerfish
pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerfish.git
```

### For Arm64 processors (e.g. Apple M2)

Anglerfish depends on minimap2 which needs to be [compiled from source](https://github.com/lh3/minimap2?tab=readme-ov-file#installation) for Arm64 processors.
When minimap2 is compiled and available on $PATH, anglerfish can be installed with pip:

```shell
pip install bio-anglerfish
```

Additionaly, if Docker is your cup of tea, the Dockerfile supplied in this repository should also work on both arm64 and x86 processors.

## Source development

1. [Install miniconda](https://docs.conda.io/en/latest/miniconda.html).

2. Set up repo clone with editable install

For x86 processors (e.g. Intel/AMD):

```
git clone https://github.com/remiolsen/anglerfish.git
cd anglerfish
Expand All @@ -49,6 +62,18 @@ conda activate anglerfish-dev
pip install -e ".[dev]"
```

For Arm64 processors (e.g. Apple M1/M2). First [compile and install minimap2 manually](https://github.com/lh3/minimap2?tab=readme-ov-file#installation), then:

```
git clone https://github.com/remiolsen/anglerfish.git
cd anglerfish
# Create a the anglerfish conda environment (but remove minimap2)
conda env create -f <(grep -v minimap2 environment.yml)
# Install anglerfish
conda activate anglerfish-dev
pip install -e ".[dev]"
```

3. (Optional) Install pre-commit to prevent committing code that will fail linting

```
Expand Down Expand Up @@ -146,6 +171,16 @@ In folder `anglerfish_????_??_??_?????/`
- `anglerfish_stats.txt` Barcode statistics from anglerfish run
- `anglerfish_stats.json` Machine readable anglerfish statistics

## Anglerfish Explore (Experimental)

`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running:

```shell
python anglerfish/explore/cli.py
```

inside the anglerfish directory.

## Credits

The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin.
Expand Down
69 changes: 69 additions & 0 deletions anglerfish/demux/adaptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import importlib
import os

import yaml


class Adaptor:
def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None):
self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index)
self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index)
self.name = f"{adaptor}"
self.delim = delim

def get_i7_mask(self, insert_Ns=True):
return self.i7.get_mask(insert_Ns)

def get_i5_mask(self, insert_Ns=True):
return self.i5.get_mask(insert_Ns)

def get_fastastring(self, insert_Ns=True):
fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n"
fasta_i7 = f">{self.name}_i7\n{self.get_i7_mask(insert_Ns)}\n"
return fasta_i5 + fasta_i7


class AdaptorPart:
def __init__(self, sequence, name, delim, index):
self.sequence = sequence
self.name = name
self.delim = delim
self.index = index

def has_index(self):
return self.sequence.find(self.delim) > -1

def len_before_index(self):
return self.sequence.find(self.delim)

def len_after_index(self):
return len(self.sequence) - self.sequence.find(self.delim) - len(self.delim)

def get_mask(self, insert_Ns):
if self.has_index():
if not insert_Ns:
return self.sequence.replace(self.delim, "")
else:
return self.sequence.replace(self.delim, "N" * len(self.index))
else:
return self.sequence


# Fetch all adaptors
def load_adaptors(raw=False):
p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
assert isinstance(p, os.PathLike)

adaptors_raw = []
with open(p) as f:
adaptors_raw = yaml.safe_load(f)

if raw:
return adaptors_raw
adaptors = []
for adaptor in adaptors_raw:
adaptors.append(
Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None)
)

return adaptors
46 changes: 31 additions & 15 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def parse_cs(cs_string, index, max_distance):
return nts, lev.distance(index.lower(), nts)


def run_minimap2(fastq_in, indexfile, output_paf, threads):
def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
"""
Runs Minimap2
"""
Expand All @@ -36,24 +36,29 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads):
"10",
"-w",
"5",
"-B1",
"-A6",
"--dual=no",
"-A",
"6",
"-B",
str(minimap_b),
"-c",
"-t",
str(threads),
indexfile,
fastq_in,
]

with open(output_paf, "ab") as ofile:
subprocess.run(cmd, stdout=ofile, check=True)
run_log = f"{output_paf}.log"
with open(output_paf, "ab") as ofile, open(run_log, "ab") as log_file:
p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=log_file)
subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


def parse_paf_lines(paf, min_qual=10):
def parse_paf_lines(paf, min_qual=10, complex_identifier=False):
"""
Read and parse one paf alignment lines.
Returns a dict with the import values for later use
If complex_identifier is True (default False), the keys will be on the form
"{read}_{i5_or_i7}_{strand_str}"
"""
entries = {}
with open(paf) as paf:
Expand All @@ -62,17 +67,28 @@ def parse_paf_lines(paf, min_qual=10):
try:
# TODO: objectify this
entry = {
"read": aln[0],
"adapter": aln[5],
"rlen": int(aln[1]), # read length
"rstart": int(aln[2]), # start alignment on read
"rend": int(aln[3]), # end alignment on read
"strand": aln[4],
"cg": aln[-2], # cigar string
"cs": aln[-1], # cs string
"q": int(aln[11]), # Q score
"iseq": None,
"sample": None,
}
read = aln[0]
read = entry["read"]
if complex_identifier:
i5_or_i7 = entry["adapter"].split("_")[-1]
if entry["strand"] == "+":
strand_str = "positive"
else:
strand_str = "negative"
ix = f"{read}_{i5_or_i7}_{strand_str}"
else:
ix = read
except IndexError:
log.debug(f"Could not find all paf columns: {read}")
continue
Expand All @@ -81,10 +97,10 @@ def parse_paf_lines(paf, min_qual=10):
log.debug(f"Low quality alignment: {read}")
continue

if read in entries.keys():
entries[read].append(entry)
if ix in entries.keys():
entries[ix].append(entry)
else:
entries[read] = [entry]
entries[ix] = [entry]

return entries

Expand Down Expand Up @@ -166,14 +182,14 @@ def cluster_matches(
fi7 = ""
for _, adaptor, _ in sample_adaptor:
try:
i5_seq = adaptor.i5_index
i5_seq = adaptor.i5.index
if i5_reversed and i5_seq is not None:
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance)
except AttributeError:
d1 = 0 # presumably it's single index, so no i5

i7_seq = adaptor.i7_index
i7_seq = adaptor.i7.index
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance)
Expand All @@ -189,8 +205,8 @@ def cluster_matches(
continue
if dists[index_min] > max_distance:
# Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len(
adaptor.i5_index or ""
if len(fi7) + len(fi5) == len(adaptor.i7.index or "") + len(
adaptor.i5.index or ""
):
fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
Expand Down
4 changes: 2 additions & 2 deletions anglerfish/demux/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def write_dataframe(self, outdir, samplesheet):
sen_dict = asdict(sentry)
if sen_dict["sample_name"] == s_dict["sample_name"]:
s_dict["adaptor_name"] = sen_dict["adaptor"].name
s_dict["i7_index"] = sen_dict["adaptor"].i7_index
s_dict["i5_index"] = sen_dict["adaptor"].i5_index
s_dict["i7_index"] = sen_dict["adaptor"].i7.index
s_dict["i5_index"] = sen_dict["adaptor"].i5.index
out_list.append(s_dict)
for key, unmatch in self.unmatched_stats.items():
for unmatch_sample in unmatch:
Expand Down
Loading

0 comments on commit 28e491b

Please sign in to comment.