Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support reading and writing to/from file-like objects #148

Merged
merged 4 commits into from
Apr 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ _static
_templates
setup-requires/*
.cache/v/cache/lastfailed
.idea
136 changes: 91 additions & 45 deletions cyvcf2/cyvcf2.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ import numpy as np
from array import array
import math
import ctypes

try:
from pathlib import Path
except ImportError:
from pathlib2 import Path # python 2 backport
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose for python2 this makes pathlib or pathlib2 a dependency? Can we just not support python2 after a certain version of cyvcf2? It would make things so much nicer, especially if we can use the typing library for python 3?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to keep python2 support.


from libc cimport stdlib
cimport numpy as np
Expand Down Expand Up @@ -130,7 +133,87 @@ cdef set_constants(VCF v):
v.HOM_ALT = 3


cdef class VCF:

cdef class HTSFile:

cdef htsFile *hts
cdef bytes fname
cdef bytes mode
cdef bint from_path
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's probably more in VCF that's needed across both VCF and Writer that we can move here.


cdef _open_htsfile(self, fname, mode):
"""Opens an htsfile for reading or writing.
Parameters
----------
fname: str
filename (str or Path), file descriptor (int), or file-like object (has fileno method).
mode: str
the mode to pass to hts_open.
"""
cdef hFILE *hf
self.mode = to_bytes(mode)
reading = b"r" in self.mode
if not reading and b"w" not in self.mode:
raise IOError("No 'r' or 'w' in mode %s" % str(self.mode))
self.from_path = False
# for htslib, wbu seems to not work
if mode == b"wbu":
mode = to_bytes(b"wb0")
if isinstance(fname, (basestring, Path)):
self.from_path = True
self.fname = to_bytes(str(fname))
if self.fname == b"-":
self.fname = to_bytes(b"/dev/stdin") if reading else to_bytes(b"/dev/stdout")
if self.fname.endswith(b".gz") and self.mode == b"w":
self.mode = b"wz"
elif self.fname.endswith(b".bcf") and self.mode == b"w":
self.mode = b"wb"
self.fname = to_bytes(str(fname))
self.mode = to_bytes(mode)
self.hts = hts_open(self.fname, self.mode)
# from a file descriptor
elif isinstance(fname, int):
self.mode = to_bytes(mode)
hf = hdopen(int(fname), self.mode)
self.hts = hts_hopen(hf, "<file>", self.mode)
self.fname = None
# reading from a File object or other object with fileno
elif hasattr(fname, "fileno"):
if fname.closed:
raise IOError('I/O operation on closed file')
self.mode = to_bytes(mode)
hf = hdopen(fname.fileno(), self.mode)
self.hts = hts_hopen(hf, "<file>", self.mode)
# .name can be TextIOWrapper
try:
self.fname = to_bytes(fname.name)
except AttributeError:
self.fname = None
else:
raise IOError("Cannot open '%s' for writing." % str(type(fname)))

if self.hts == NULL:
raise IOError("Error opening %s" % str(fname))
if reading:
if self.hts.format.format != vcf and self.hts.format.format != bcf:
raise IOError(
"%s is not valid bcf or vcf (format: %s mode: %s)" % (fname, self.hts.format.format, mode)
)
else:
if self.hts.format.format != text_format and self.hts.format.format != binary_format:
raise IOError(
"%s is not valid text_format or binary_format (format: %s mode: %s)" % (fname, self.hts.format.format, mode)
)

def close(self):
if self.hts != NULL:
if self.from_path:
hts_close(self.hts)
self.hts = NULL


cdef class VCF(HTSFile):
"""
VCF class holds methods to iterate over and query a VCF.
Expand All @@ -146,20 +229,20 @@ cdef class VCF:
if True, then any '.' present in a genotype will classify the corresponding element in the gt_types array as UNKNOWN.
samples: list
list of samples to extract from full set in file.
threads: int
the number of threads to use including this reader.
Returns
-------
VCF object for iterating and querying.
"""

cdef htsFile *hts
cdef const bcf_hdr_t *hdr
cdef tbx_t *idx
cdef hts_idx_t *hidx
cdef int n_samples
cdef int PASS
cdef bytes fname
cdef bint gts012
cdef bint lazy
cdef bint strict_gt
Expand All @@ -178,25 +261,8 @@ cdef class VCF:
cdef readonly int UNKNOWN

def __init__(self, fname, mode="r", gts012=False, lazy=False, strict_gt=False, samples=None, threads=None):
cdef hFILE *hf

if isinstance(fname, basestring):
if fname == b"-" or fname == "-":
fname = b"/dev/stdin"
fname, mode = to_bytes(fname), to_bytes(mode)
self.hts = hts_open(fname, mode)
self.fname = fname
else:
mode = to_bytes(mode)
hf = hdopen(int(fname), mode)
self.hts = hts_hopen(hf, "<file>", mode)

if self.hts == NULL:
raise IOError("Error opening %s" % fname)
if self.hts.format.format != vcf and self.hts.format.format != bcf:
raise IOError("%s if not valid bcf or vcf" % fname)

cdef bcf_hdr_t *hdr
self._open_htsfile(fname, mode)
hdr = self.hdr = bcf_hdr_read(self.hts)
if samples is not None:
self.set_samples(samples)
Expand Down Expand Up @@ -491,13 +557,6 @@ cdef class VCF:

contains = __contains__

def close(self):
if self.hts != NULL:
if self.fname != "-":
# TODO flush
hts_close(self.hts)
self.hts = NULL

def __dealloc__(self):
if self.hts != NULL and self.hdr != NULL:
bcf_hdr_destroy(self.hdr)
Expand Down Expand Up @@ -2155,6 +2214,7 @@ cdef from_bytes(s):
return s


# TODO: make Writer extend HTSFile not VCF by moving common methods into HTSFile
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^this should be a tech-debt issue

cdef class Writer(VCF):
"""
Writer class makes a VCF Writer.
Expand All @@ -2177,15 +2237,7 @@ cdef class Writer(VCF):
cdef const bcf_hdr_t *ohdr

def __init__(Writer self, fname, VCF tmpl, mode="w"):
self.name = to_bytes(fname)
if fname.endswith(".gz") and mode == "w":
mode = "wz"
if fname.endswith(".bcf") and mode == "w":
mode = "wb"
self.hts = hts_open(self.name, to_bytes(mode))
if self.hts == NULL:
raise Exception("error opening file: %s" % self.name)

self._open_htsfile(fname, mode)
bcf_hdr_sync(tmpl.hdr)
self.ohdr = tmpl.hdr
self.hdr = bcf_hdr_dup(tmpl.hdr)
Expand All @@ -2195,14 +2247,8 @@ cdef class Writer(VCF):
@classmethod
def from_string(Writer cls, fname, header_string, mode="w"):
cdef Writer self = Writer.__new__(Writer)

self.name = to_bytes(fname)
if fname.endswith(".gz") and mode == "w":
mode = "wz"
if fname.endswith(".bcf") and mode == "w":
mode = "wb"
self.hts = hts_open(self.name, to_bytes(mode))
cdef char *hmode = "w"
self._open_htsfile(fname, mode)
self.hdr = bcf_hdr_init(hmode)
self.ohdr = bcf_hdr_dup(self.hdr)
if bcf_hdr_parse(self.hdr, to_bytes(header_string)) != 0:
Expand Down
68 changes: 54 additions & 14 deletions cyvcf2/tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
import sys
import os
import atexit
try:
from pathlib import Path
except ImportError:
from pathlib2 import Path # python 2 backport


HERE = os.path.dirname(__file__)
VCF_PATH = os.path.join(HERE, "test.vcf.gz")
Expand All @@ -20,8 +25,29 @@
basestring = (str, bytes)

def test_init():
# string
v = VCF(VCF_PATH)
assert v
expected_count = sum(1 for _ in v)
v.close()

# Path
v = VCF(Path(VCF_PATH))
value = sum(1 for _ in v)
assert value == expected_count

# file descriptor
with open(VCF_PATH) as fp:
fd = fp.fileno()
v = VCF(fd)
assert sum(1 for _ in v) == expected_count
v.close() # this should not close the file descriptor originally opened

# file-like object
with open(VCF_PATH) as fp:
v = VCF(fp)
assert sum(1 for _ in v) == expected_count
v.close() # this should not close the file descriptor originally opened

def test_type():
vcf = VCF(VCF_PATH)
Expand Down Expand Up @@ -237,32 +263,46 @@ def test_writer_from_string():
w.close()


def test_writer():

v = VCF(VCF_PATH)
f = tempfile.mktemp(suffix=".vcf")
atexit.register(os.unlink, f)

o = Writer(f, v)
rec = next(v)
def run_writer(writer, filename, rec):
rec.INFO["AC"] = "3"
rec.FILTER = ["LowQual"]
o.write_record(rec)
writer.write_record(rec)

rec.FILTER = ["LowQual", "VQSRTrancheSNP99.90to100.00"]
o.write_record(rec)

writer.write_record(rec)

rec.FILTER = "PASS"
o.write_record(rec)
writer.write_record(rec)

o.close()
writer.close()

expected = ["LowQual", "LowQual;VQSRTrancheSNP99.90to100.00", None]

for i, variant in enumerate(VCF(f)):
for i, variant in enumerate(VCF(filename)):
assert variant.FILTER == expected[i], (variant.FILTER, expected[i])

def test_writer():
v = VCF(VCF_PATH)
f = tempfile.mktemp(suffix=".vcf")
atexit.register(os.unlink, f)
rec = next(v)

# string
run_writer(Writer(f, v), f, rec)

# Path
path = Path(f)
run_writer(Writer(path, v), f, rec)

# file descriptor
with open(VCF_PATH) as fp:
fd = fp.fileno()
run_writer(Writer(fd, v), f, rec)

# file-like object
with open(VCF_PATH) as fp:
run_writer(Writer(fp, v), f, rec)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm all ears for having a regression test for stdin/stdout support in the reader/writer, but I don't have a good solution


def test_add_info_to_header():
v = VCF(VCF_PATH)
v.add_info_to_header({'ID': 'abcdefg', 'Description': 'abcdefg',
Expand Down