From 03a980fe9513daf9ce08ac4e93afa0c2ba1ba51d Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 19 Jun 2024 07:04:02 -0700 Subject: [PATCH] Use `relaxed_combine_rows` when merging mcols (#99) * As noted in #98, using `relaxed_combine_rows` makes it possible to perform operations on granges objects that may contain different metadata columns. * Set numpy version to < 2.0, since a few operations are incompatible with the new release * Add tests --- setup.cfg | 2 +- src/genomicranges/GenomicRanges.py | 2 +- src/genomicranges/GenomicRangesList.py | 2 +- tests/test_gr_methods_basic.py | 42 ++++++++++++++++++++++++++ tests/test_gr_set_ops.py | 40 ++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index fd5a46b..4ab37bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,7 @@ install_requires = biocframe>=0.5.11 iranges[optional]>=0.2.6 biocutils>=0.1.3 - numpy + numpy<2.0.0 [options.packages.find] where = src diff --git a/src/genomicranges/GenomicRanges.py b/src/genomicranges/GenomicRanges.py index 23f6fd8..3ac3536 100644 --- a/src/genomicranges/GenomicRanges.py +++ b/src/genomicranges/GenomicRanges.py @@ -2848,7 +2848,7 @@ def _combine_GenomicRanges(*x: GenomicRanges) -> GenomicRanges: seqnames=ut.combine_sequences(*[y._seqnames for y in x]), strand=ut.combine_sequences(*[y._strand for y in x]), names=all_names, - mcols=ut.combine_rows(*[y._mcols for y in x]), + mcols=ut.relaxed_combine_rows(*[y._mcols for y in x]), seqinfo=merge_SeqInfo([y._seqinfo for y in x]), metadata=x[0]._metadata, validate=False, diff --git a/src/genomicranges/GenomicRangesList.py b/src/genomicranges/GenomicRangesList.py index f6d91cd..b5ee808 100644 --- a/src/genomicranges/GenomicRangesList.py +++ b/src/genomicranges/GenomicRangesList.py @@ -873,7 +873,7 @@ def _combine_grl(*x: GenomicRangesList): ranges=ut.combine_sequences(*[y._ranges for y in x]), range_lengths=ut.combine_sequences(*[y._range_lengths for y in x]), names=all_names, - 
mcols=ut.combine_rows(*[y._mcols for y in x]), + mcols=ut.relaxed_combine_rows(*[y._mcols for y in x]), metadata=x[0]._metadata, validate=False, ) diff --git a/tests/test_gr_methods_basic.py b/tests/test_gr_methods_basic.py index 65cf2d8..74c85e2 100644 --- a/tests/test_gr_methods_basic.py +++ b/tests/test_gr_methods_basic.py @@ -3,6 +3,7 @@ import numpy as np from genomicranges import GenomicRanges from biocframe import BiocFrame +import biocutils as ut from iranges import IRanges from random import random import genomicranges @@ -102,3 +103,44 @@ def test_export(): "chr3", ] assert df["strand"].tolist() == ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] + + +def test_combine(): + g_src = GenomicRanges( + seqnames=["chr1", "chr2", "chr1", "chr3", "chr2"], + ranges=IRanges( + start=[101, 102, 103, 104, 109], width=[112, 103, 128, 134, 111] + ), + strand=["*", "-", "*", "+", "-"], + ) + + g_tgt = GenomicRanges( + seqnames=[ + "chr1", + "chr2", + "chr2", + "chr2", + "chr1", + "chr1", + "chr3", + "chr3", + "chr3", + "chr3", + ], + ranges=IRanges(start=range(101, 111), width=range(121, 131)), + strand=["*", "-", "-", "*", "*", "+", "+", "+", "-", "-"], + mcols=BiocFrame( + { + "score": range(0, 10), + "GC": [random() for _ in range(10)], + } + ), + ) + assert g_src is not None + assert g_tgt is not None + + out: GenomicRanges = ut.combine_sequences(g_src, g_tgt) + + assert out is not None + assert len(out) == 15 + assert len(out.get_mcols().get_column_names()) == 2 diff --git a/tests/test_gr_set_ops.py b/tests/test_gr_set_ops.py index 0d1166b..17deb04 100644 --- a/tests/test_gr_set_ops.py +++ b/tests/test_gr_set_ops.py @@ -70,3 +70,43 @@ def test_intersect(): assert (out.start == np.array([9])).all() assert (out.width == np.array([2])).all() assert (out.strand == np.array([-1])).all() + + +def test_intersect_mismatched_mcols(): + g_src = GenomicRanges( + seqnames=["chr1", "chr2", "chr1", "chr3", "chr2"], + ranges=IRanges( + start=[101, 102, 103, 104, 109], width=[112, 103, 128, 134, 
111] + ), + strand=["*", "-", "*", "+", "-"], + ) + + g_tgt = GenomicRanges( + seqnames=[ + "chr1", + "chr2", + "chr2", + "chr2", + "chr1", + "chr1", + "chr3", + "chr3", + "chr3", + "chr3", + ], + ranges=IRanges(start=range(101, 111), width=range(121, 131)), + strand=["*", "-", "-", "*", "*", "+", "+", "+", "-", "-"], + mcols=BiocFrame( + { + "score": range(0, 10), + "GC": [random() for _ in range(10)], + } + ), + ) + assert g_src is not None + assert g_tgt is not None + + out = g_src.intersect(g_tgt) + + assert out is not None + assert len(out) == 3