Skip to content

Commit

Permalink
fix: explicitly ignore repeats in the ld command (#218)
Browse files Browse the repository at this point in the history
  • Loading branch information
aryarm authored Jun 2, 2023
1 parent cb18970 commit b9d0da1
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 6 deletions.
5 changes: 4 additions & 1 deletion docs/commands/ld.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ Compute the pair-wise LD (`Pearson's correlation coefficient <https://numpy.org/

The ``ld`` command takes as input a set of genotypes in VCF and a list of haplotypes (specified as a :doc:`.hap file </formats/haplotypes>`) and outputs a new :doc:`.hap file </formats/haplotypes>` with the computed LD values in an extra field.

By default, LD is computed with each haplotype in the **.hap** file. To compute LD with the variants in the genotypes file instead, you should use the `--from-gts <#cmdoption-haptools-ld-from-gts>`_ switch. When this mode is enabled, the **.hap** output will be replaced by an :doc:`.ld file </formats/ld>`.
By default, LD is computed with each haplotype in the ``.hap`` file. To compute LD with the variants in the genotypes file instead, you should use the `--from-gts <#cmdoption-haptools-ld-from-gts>`_ switch. When this mode is enabled, the ``.hap`` output will be replaced by an :doc:`.ld file </formats/ld>`.

.. note::
Repeats are not currently supported by the ``ld`` command. Any repeats in your ``.hap`` file will be ignored.

You may also specify genotypes in PLINK2 PGEN format instead of VCF format. See the documentation for genotypes in :ref:`the format docs <formats-genotypesplink>` for more information.

Expand Down
19 changes: 14 additions & 5 deletions haptools/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ def calc_ld(
haplotype_ids.add(target)
hp.read(region=region, haplotypes=haplotype_ids)

# remove all repeats from the haplotypes object since we don't yet support them
for repeat_id in hp.type_ids["R"]:
del hp.data[repeat_id]
num_repeats = len(hp.type_ids["R"])
if num_repeats:
log.info(f"Ignoring {num_repeats} repeats in .hap file")
hp.type_ids["R"] = []

if from_gts:
variants = None
if target in hp.data and ids:
Expand All @@ -123,18 +131,19 @@ def calc_ld(
variants = {var.id for h in hp.type_ids["H"] for var in hp.data[h].variants}

# check to see whether the target was a haplotype
if target in hp.type_ids["H"]:
log.info(f"Identified target '{target}' as a haplotype")
try:
target = hp.data.pop(target)
except:
# the target is a variant, instead
pass
else:
log.info(f"Identified target '{target}' as a haplotype")
hp.index(force=True)
if len(hp.data) == 0 and not from_gts:
log.error(
"There must be at least one more haplotype in the .hap file "
"than the TARGET haplotype specified."
)
else:
# TODO: handle other line types
pass

# check that all of the haplotypes were loaded successfully and warn otherwise
if ids is not None and not from_gts and len(ids) > len(hp.data):
Expand Down
23 changes: 23 additions & 0 deletions tests/test_ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,29 @@ def test_basic(capfd):
assert result.exit_code == 0


def test_simple_with_repeat(capfd):
gt_file = DATADIR / "simple.vcf"
hp_file = DATADIR / "simple.hap"

cmd = f"ld H1 {gt_file} {hp_file}"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert captured.out
expected = captured.out
assert result.exit_code == 0

# now try again. The result should be the same because it ignores repeats
hp_file = DATADIR / "simple_tr.hap"

cmd = f"ld H1 {gt_file} {hp_file}"
runner = CliRunner()
result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
captured = capfd.readouterr()
assert expected == captured.out
assert result.exit_code == 0


def test_basic_variant(capfd):
expected = """#\torderH\tld
#\tversion\t0.2.0
Expand Down

0 comments on commit b9d0da1

Please sign in to comment.