diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2c49b56a..fd6c2500 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,4 +1,8 @@
-version: 2
+version: 2.1
+
+orbs:
+ codecov: codecov/codecov@3.2.4
+
jobs:
build:
docker:
@@ -44,8 +48,11 @@ jobs:
name: Run Python tests and upload coverage
command: |
python3 -m pytest --cov=tsinfer --cov-report=xml --cov-branch -xvs tests
- python3 -m codecov -X gcov -F python
rm .coverage
+
+ - codecov/upload:
+ flags: python
+ token: CODECOV_TOKEN
- run:
name: Compile C with gcc
@@ -77,7 +84,10 @@ jobs:
gcov -pb ./libtsinfer.a.p/object_heap.c.gcno ../lib/object_heap.c
gcov -pb ./libtsinfer.a.p/err.c.gcno ../lib/err.c
cd ..
- codecov -X gcov -F C
+
+ - codecov/upload:
+ flags: C
+ token: CODECOV_TOKEN
- run:
name: Valgrind for C tests.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf2708f9..9fd93067 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,13 +11,17 @@ In development
**Performance improvements**
-- Reduce memory usage when running `match_samples` against large cohorts
- containing sequences with substantial amounts of error.
+- Reduce memory usage when running `match_samples` against large cohorts
+ containing sequences with substantial amounts of error.
({pr}`761`, {user}`jeromekelleher`)
- `truncate_ancestors` no longer requires loading all the ancestors into RAM.
({pr}`811`, {user}`benjeffery`)
+- Reduce memory requirements of the `generate_ancestors` function by providing
+ the `genotype_encoding` ({pr}`809`) and `mmap_temp_dir` ({pr}`808`) options
+ ({user}`jeromekelleher`).
+
## [0.3.0] - 2022-10-25
**Features**
@@ -29,11 +33,11 @@ In development
- The CLI interface now allows recombination rate (or rate maps) and mismatch ratios
to be specified ({pr}`731`, {issue}`435` {user}`hyanwong`)
-
+
- The calls to match-ancestors and match-samples via the CLI are now logged
in the provenance entries of the output tree sequence ({pr}`732` and `741`,
{issue}`730` {user}`hyanwong`)
-
+
- The CLI interface allows `--no-post-process` to be specified (for details of post-
processing, see "Breaking changes" below) ({pr}`727`, {issue}`721` {user}`hyanwong`)
@@ -45,7 +49,7 @@ In development
- `sample_data.subset()` now accepts a sequence_length ({pr}`681`, {user}`hyanwong`)
- `verify` no longer raises error when comparing a genotype to missingness.
- ({pr}`716`, {issue}`625`, {user}`benjeffery`)
+ ({pr}`716`, {issue}`625`, {user}`benjeffery`)
**Breaking changes**:
@@ -54,8 +58,8 @@ In development
tsinfer to aid the matching process) then splits the ultimate ancestor into separate
pieces. If splitting is not required, the `post_process` step can also be called as a
separate function with the parameter `split_ultimate=False` ({pr}`687`, {pr}`750`,
- {issue}`673`, {user}`hyanwong`)
-
+ {issue}`673`, {user}`hyanwong`)
+
- Post-processing by default erases tree topology that exists before the first site
and one unit after the last site, to avoid extrapolating into regions with no data.
This can be disabled by calling `post_process` step as a separate function with the
@@ -94,7 +98,7 @@ In development
- Oldest nodes in a standard inferred tree sequence are no longer set to frequencies ~2
and ~3 (i.e. 2 or 3 times as old as all the other nodes), but are spaced above the
others by the mean time between unique ancestor ages ({pr}`485`, {user}`hyanwong`)
-
+
- The `tsinfer.SampleData.from_tree_sequence()` function now defaults to setting
`use_sites_time` and `use_individuals_time` to `False` rather than `True`
({pr}`599`, {user}`hyanwong`)
diff --git a/_tsinfermodule.c b/_tsinfermodule.c
index 20424655..0e6b62f7 100644
--- a/_tsinfermodule.c
+++ b/_tsinfermodule.c
@@ -92,13 +92,14 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
{
int ret = -1;
int err;
- static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", NULL};
+ static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", "mmap_fd", NULL};
int num_samples, max_sites, genotype_encoding;
int flags = 0;
+ int mmap_fd = -1;
self->builder = NULL;
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|i", kwlist,
- &num_samples, &max_sites, &genotype_encoding)) {
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|ii", kwlist,
+ &num_samples, &max_sites, &genotype_encoding, &mmap_fd)) {
goto out;
}
self->builder = PyMem_Malloc(sizeof(ancestor_builder_t));
@@ -108,7 +109,7 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
}
flags = genotype_encoding;
Py_BEGIN_ALLOW_THREADS
- err = ancestor_builder_alloc(self->builder, num_samples, max_sites, flags);
+ err = ancestor_builder_alloc(self->builder, num_samples, max_sites, mmap_fd, flags);
Py_END_ALLOW_THREADS
if (err != 0) {
handle_library_error(err);
diff --git a/lib/ancestor_builder.c b/lib/ancestor_builder.c
index 7e98ce44..5af66d7c 100644
--- a/lib/ancestor_builder.c
+++ b/lib/ancestor_builder.c
@@ -16,6 +16,15 @@
** You should have received a copy of the GNU General Public License
** along with tsinfer. If not, see .
*/
+/* It's not worth trying to get mmap'd genotypes working on windows,
+ * and is just a silent no-op if it's tried.
+ */
+#if defined(_WIN32)
+#else
+/* Needed for ftruncate */
+#define _XOPEN_SOURCE 700
+#define MMAP_GENOTYPES 1
+#endif
#include "tsinfer.h"
#include "err.h"
@@ -25,6 +34,17 @@
#include
#include
+#include
+#include
+#include
+#include
+
+#ifdef MMAP_GENOTYPES
+#include
+#include
+#include
+#endif
+
#include "avl.h"
/* Note: using an unoptimised version of bit packing here because we're
@@ -135,6 +155,7 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)
fprintf(out, "Ancestor builder\n");
fprintf(out, "flags = %d\n", (int) self->flags);
+ fprintf(out, "mmap_fd = %d\n", self->mmap_fd);
fprintf(out, "num_samples = %d\n", (int) self->num_samples);
fprintf(out, "num_sites = %d\n", (int) self->num_sites);
fprintf(out, "num_ancestors = %d\n", (int) self->num_ancestors);
@@ -181,23 +202,62 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)
return 0;
}
-int
-ancestor_builder_alloc(
- ancestor_builder_t *self, size_t num_samples, size_t max_sites, int flags)
+#ifdef MMAP_GENOTYPES
+
+static int
+ancestor_builder_make_genotype_mmap(ancestor_builder_t *self)
{
+
int ret = 0;
- unsigned long max_size = 1024 * 1024;
- memset(self, 0, sizeof(ancestor_builder_t));
- if (num_samples <= 1) {
- ret = TSI_ERR_BAD_NUM_SAMPLES;
+ self->mmap_size = self->max_sites * self->encoded_genotypes_size;
+ if (ftruncate(self->mmap_fd, (off_t) self->mmap_size) != 0) {
+ ret = TSI_ERR_IO;
goto out;
}
+ self->mmap_buffer = mmap(
+ NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, self->mmap_fd, 0);
+ if (self->mmap_buffer == MAP_FAILED) {
+ self->mmap_buffer = NULL;
+ ret = TSI_ERR_IO;
+ goto out;
+ }
+ self->mmap_offset = 0;
+out:
+ return ret;
+}
+static int
+ancestor_builder_free_genotype_mmap(ancestor_builder_t *self)
+{
+ if (self->mmap_buffer != NULL) {
+ /* There's nothing we can do about it here, so don't check errors. */
+ munmap(self->mmap_buffer, self->mmap_size);
+ }
+ /* Try to truncate to zero so we don't flush out all the data */
+ ftruncate(self->mmap_fd, 0);
+ return 0;
+}
+#endif
+
+int
+ancestor_builder_alloc(ancestor_builder_t *self, size_t num_samples, size_t max_sites,
+ int mmap_fd, int flags)
+{
+ int ret = 0;
+ unsigned long max_size = 1024 * 1024;
+
+ memset(self, 0, sizeof(ancestor_builder_t));
self->num_samples = num_samples;
self->max_sites = max_sites;
+ self->mmap_fd = mmap_fd;
self->num_sites = 0;
self->flags = flags;
+
+ if (num_samples <= 1) {
+ ret = TSI_ERR_BAD_NUM_SAMPLES;
+ goto out;
+ }
if (self->flags & TSI_GENOTYPE_ENCODING_ONE_BIT) {
self->encoded_genotypes_size = (num_samples / 8) + ((num_samples % 8) != 0);
self->decoded_genotypes_size = self->encoded_genotypes_size * 8;
@@ -228,6 +288,14 @@ ancestor_builder_alloc(
if (ret != 0) {
goto out;
}
+#if MMAP_GENOTYPES
+ if (self->mmap_fd != -1) {
+ ret = ancestor_builder_make_genotype_mmap(self);
+ if (ret != 0) {
+ goto out;
+ }
+ }
+#endif
avl_init_tree(&self->time_map, cmp_time_map, NULL);
out:
return ret;
@@ -236,13 +304,19 @@ ancestor_builder_alloc(
size_t
ancestor_builder_get_memsize(const ancestor_builder_t *self)
{
- /* Ignore the other allocs as insignificant */
+ /* Ignore the other allocs as insignificant, and don't report the
+ * size of the mmap'd region */
return self->main_allocator.total_size + self->indexing_allocator.total_size;
}
int
ancestor_builder_free(ancestor_builder_t *self)
{
+#if MMAP_GENOTYPES
+ if (self->mmap_fd != -1) {
+ ancestor_builder_free_genotype_mmap(self);
+ }
+#endif
tsi_safe_free(self->sites);
tsi_safe_free(self->descriptors);
tsk_safe_free(self->genotype_encode_buffer);
@@ -558,7 +632,18 @@ ancestor_builder_encode_genotypes(
static uint8_t *
ancestor_builder_allocate_genotypes(ancestor_builder_t *self)
{
- return tsk_blkalloc_get(&self->main_allocator, self->encoded_genotypes_size);
+ uint8_t *ret = NULL;
+ void *p;
+
+ if (self->mmap_buffer == NULL) {
+ ret = tsk_blkalloc_get(&self->main_allocator, self->encoded_genotypes_size);
+ } else {
+ p = (char *) self->mmap_buffer + self->mmap_offset;
+ self->mmap_offset += self->encoded_genotypes_size;
+ assert(self->mmap_offset <= self->mmap_size);
+ ret = (uint8_t *) p;
+ }
+ return ret;
}
int WARN_UNUSED
diff --git a/lib/err.c b/lib/err.c
index b4fc8104..f94b524d 100644
--- a/lib/err.c
+++ b/lib/err.c
@@ -18,6 +18,7 @@
*/
#include "err.h"
+#include
const char *
tsi_strerror(int err)
@@ -105,6 +106,9 @@ tsi_strerror(int err)
case TSI_ERR_ONE_BIT_NON_BINARY:
ret = "One-bit genotype encoding only supports binary 0/1 data";
break;
+ case TSI_ERR_IO:
+ ret = tsk_strerror(TSK_ERR_IO);
+ break;
}
return ret;
}
diff --git a/lib/err.h b/lib/err.h
index 90cd7a51..5b847c12 100644
--- a/lib/err.h
+++ b/lib/err.h
@@ -26,6 +26,7 @@
#define TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA -22
#define TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION -23
#define TSI_ERR_ONE_BIT_NON_BINARY -24
+#define TSI_ERR_IO -25
// clang-format on
#ifdef __GNUC__
diff --git a/lib/tests/tests.c b/lib/tests/tests.c
index 1b8db218..5ded462c 100644
--- a/lib/tests/tests.c
+++ b/lib/tests/tests.c
@@ -331,7 +331,8 @@ initialise_builder(tree_sequence_builder_t *tsb, double oldest_time)
static void
run_random_data(size_t num_samples, size_t num_sites, int seed,
- double recombination_rate, double mismatch_rate, int ancestor_builder_options)
+ double recombination_rate, double mismatch_rate, int ancestor_builder_mmap_fd,
+ int ancestor_builder_options)
{
tsk_table_collection_t tables;
ancestor_builder_t ancestor_builder;
@@ -359,8 +360,8 @@ run_random_data(size_t num_samples, size_t num_sites, int seed,
}
CU_ASSERT_FATAL(num_samples >= 2);
- ret = ancestor_builder_alloc(
- &ancestor_builder, num_samples, num_sites, ancestor_builder_options);
+ ret = ancestor_builder_alloc(&ancestor_builder, num_samples, num_sites,
+ ancestor_builder_mmap_fd, ancestor_builder_options);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tree_sequence_builder_alloc(&tsb, num_sites, NULL, 1, 1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
@@ -451,23 +452,37 @@ test_ancestor_builder_errors(void)
allele_t genotypes_zeros[4] = { 0, 0, 0, 0 };
tsk_id_t start, end;
allele_t haplotype[4];
+ FILE *mmap_file;
- ret = ancestor_builder_alloc(&ancestor_builder, 0, 1, 0);
+ /* Bad file descriptor for mmap FD */
+ ret = ancestor_builder_alloc(&ancestor_builder, 2, 1, -2, 0);
+ CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_IO);
+ ancestor_builder_free(&ancestor_builder);
+
+ /* File is opened in the wrong mode */
+ mmap_file = fopen(_tmp_file_name, "w");
+ CU_ASSERT_FATAL(mmap_file != NULL);
+ ret = ancestor_builder_alloc(&ancestor_builder, 2, 1, fileno(mmap_file), 0);
+ CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_IO);
+ ancestor_builder_free(&ancestor_builder);
+ fclose(mmap_file);
+
+ ret = ancestor_builder_alloc(&ancestor_builder, 0, 1, -1, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_BAD_NUM_SAMPLES);
ancestor_builder_free(&ancestor_builder);
- ret = ancestor_builder_alloc(&ancestor_builder, 1, 1, 0);
+ ret = ancestor_builder_alloc(&ancestor_builder, 1, 1, -1, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_BAD_NUM_SAMPLES);
ancestor_builder_free(&ancestor_builder);
- ret = ancestor_builder_alloc(&ancestor_builder, 2, 0, 0);
+ ret = ancestor_builder_alloc(&ancestor_builder, 2, 0, -1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL_FATAL(ancestor_builder.num_sites, 0);
ret = ancestor_builder_add_site(&ancestor_builder, 4, genotypes_ones);
CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_TOO_MANY_SITES);
ancestor_builder_free(&ancestor_builder);
- ret = ancestor_builder_alloc(&ancestor_builder, 4, 2, 0);
+ ret = ancestor_builder_alloc(&ancestor_builder, 4, 2, -1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = ancestor_builder_add_site(&ancestor_builder, 4, genotypes_zeros);
CU_ASSERT_EQUAL_FATAL(ret, 0);
@@ -480,7 +495,6 @@ test_ancestor_builder_errors(void)
ancestor_builder.descriptors[0].num_focal_sites,
ancestor_builder.descriptors[0].focal_sites, &start, &end, haplotype);
CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_BAD_FOCAL_SITE);
-
ancestor_builder_free(&ancestor_builder);
}
@@ -493,7 +507,7 @@ test_ancestor_builder_one_site(void)
allele_t ancestor[1];
tsk_id_t start, end, focal_sites[1];
- ret = ancestor_builder_alloc(&ancestor_builder, 4, 1, 0);
+ ret = ancestor_builder_alloc(&ancestor_builder, 4, 1, -1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = ancestor_builder_add_site(&ancestor_builder, 4, genotypes);
CU_ASSERT_EQUAL_FATAL(ret, 0);
@@ -808,8 +822,8 @@ test_random_data_n5_m3(void)
for (seed = 1; seed < 100; seed++) {
/* printf("seed = %d\n", seed); */
- run_random_data(5, 3, seed, 1e-3, 1e-20, 0);
- run_random_data(5, 3, seed, 1e-20, 1e-3, 0);
+ run_random_data(5, 3, seed, 1e-3, 1e-20, -1, 0);
+ run_random_data(5, 3, seed, 1e-20, 1e-3, -1, 0);
}
}
@@ -822,8 +836,8 @@ test_random_data_n5_m20(void)
for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
for (seed = 1; seed < 10; seed++) {
- run_random_data(5, 20, seed, 1e-3, 1e-20, options[j]);
- run_random_data(5, 20, seed, 1e-20, 1e-3, options[j]);
+ run_random_data(5, 20, seed, 1e-3, 1e-20, -1, options[j]);
+ run_random_data(5, 20, seed, 1e-20, 1e-3, -1, options[j]);
}
}
}
@@ -835,8 +849,8 @@ test_random_data_n10_m10(void)
int options[] = { 0, TSI_GENOTYPE_ENCODING_ONE_BIT };
for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
- run_random_data(10, 10, 43, 1e-3, 1e-20, options[j]);
- run_random_data(10, 10, 43, 1e-20, 1e-3, options[j]);
+ run_random_data(10, 10, 43, 1e-3, 1e-20, -1, options[j]);
+ run_random_data(10, 10, 43, 1e-20, 1e-3, -1, options[j]);
}
}
@@ -847,8 +861,8 @@ test_random_data_n10_m100(void)
int options[] = { 0, TSI_GENOTYPE_ENCODING_ONE_BIT };
for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
- run_random_data(10, 100, 43, 1e-3, 1e-20, options[j]);
- run_random_data(10, 100, 43, 1e-20, 1e-3, options[j]);
+ run_random_data(10, 100, 43, 1e-3, 1e-20, -1, options[j]);
+ run_random_data(10, 100, 43, 1e-20, 1e-3, -1, options[j]);
}
}
@@ -859,8 +873,8 @@ test_random_data_n100_m10(void)
int options[] = { 0, TSI_GENOTYPE_ENCODING_ONE_BIT };
for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
- run_random_data(10, 10, 1243, 1e-3, 1e-20, options[j]);
- run_random_data(10, 10, 1243, 1e-20, 1e-3, options[j]);
+ run_random_data(10, 10, 1243, 1e-3, 1e-20, -1, options[j]);
+ run_random_data(10, 10, 1243, 1e-20, 1e-3, -1, options[j]);
}
}
@@ -871,9 +885,24 @@ test_random_data_n100_m100(void)
int options[] = { 0, TSI_GENOTYPE_ENCODING_ONE_BIT };
for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
- run_random_data(100, 100, 42, 1e-3, 1e-20, options[j]);
- run_random_data(100, 100, 42, 1e-20, 1e-3, options[j]);
+ run_random_data(100, 100, 42, 1e-3, 1e-20, -1, options[j]);
+ run_random_data(100, 100, 42, 1e-20, 1e-3, -1, options[j]);
+ }
+}
+
+static void
+test_random_data_ab_mmap(void)
+{
+ size_t j;
+ int options[] = { 0, TSI_GENOTYPE_ENCODING_ONE_BIT };
+ FILE *mmap_file = fopen(_tmp_file_name, "w+");
+
+ CU_ASSERT_FATAL(mmap_file != NULL);
+
+ for (j = 0; j < sizeof(options) / sizeof(*options); j++) {
+ run_random_data(100, 100, 42, 1e-3, 1e-20, fileno(mmap_file), options[j]);
}
+ fclose(mmap_file);
}
static void
@@ -1040,6 +1069,7 @@ main(int argc, char **argv)
{ "test_random_data_n10_m100", test_random_data_n10_m100 },
{ "test_random_data_n100_m10", test_random_data_n100_m10 },
{ "test_random_data_n100_m100", test_random_data_n100_m100 },
+ { "test_random_data_ab_mmap", test_random_data_ab_mmap },
{ "test_packbits_1", test_packbits_1 },
{ "test_packbits_2", test_packbits_2 },
diff --git a/lib/tsinfer.h b/lib/tsinfer.h
index bb1ec71b..628a5519 100644
--- a/lib/tsinfer.h
+++ b/lib/tsinfer.h
@@ -116,6 +116,12 @@ typedef struct {
size_t encoded_genotypes_size;
size_t decoded_genotypes_size;
uint8_t *genotype_encode_buffer;
+ /* Optional file-descriptor for the file to be used as the mmap memory
+ * store for encoded genotypes */
+ int mmap_fd;
+ void *mmap_buffer;
+ size_t mmap_offset;
+ size_t mmap_size;
} ancestor_builder_t;
typedef struct _mutation_list_node_t {
@@ -203,8 +209,8 @@ typedef struct {
} output;
} ancestor_matcher_t;
-int ancestor_builder_alloc(
- ancestor_builder_t *self, size_t num_samples, size_t num_sites, int flags);
+int ancestor_builder_alloc(ancestor_builder_t *self, size_t num_samples,
+ size_t num_sites, int mmap_fd, int flags);
int ancestor_builder_free(ancestor_builder_t *self);
int ancestor_builder_print_state(ancestor_builder_t *self, FILE *out);
int ancestor_builder_add_site(
diff --git a/tests/test_inference.py b/tests/test_inference.py
index 02e5c2c9..3d32ed27 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -27,6 +27,7 @@
import random
import re
import string
+import sys
import tempfile
import unittest
import unittest.mock as mock
@@ -42,6 +43,8 @@
import tsinfer
import tsinfer.eval_util as eval_util
+IS_WINDOWS = sys.platform == "win32"
+
def get_random_data_example(num_samples, num_sites, seed=42, num_states=2):
np.random.seed(seed)
@@ -1721,6 +1724,51 @@ def test_encodings_identical_results(self, genotype_encoding, num_threads):
)
a1.assert_data_equal(a2)
+ @pytest.mark.parametrize("genotype_encoding", tsinfer.GenotypeEncoding)
+ @pytest.mark.parametrize("num_threads", [0, 1, 10])
+ def test_mmap_identical_results(self, genotype_encoding, num_threads):
+ n = 27
+ m = 182
+ G, positions = get_random_data_example(n, m)
+ sample_data = tsinfer.SampleData(sequence_length=m)
+ for genotypes, position in zip(G, positions):
+ sample_data.add_site(position, genotypes)
+ sample_data.finalise()
+ # Do we get precisely the same result as the C engine with 8 bit encoding
+ # and non-mmaped storage?
+ a1 = tsinfer.generate_ancestors(sample_data)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ a2 = tsinfer.generate_ancestors(
+ sample_data,
+ genotype_encoding=genotype_encoding,
+ num_threads=num_threads,
+ mmap_temp_dir=tmpdir,
+ )
+ # Temporary file should be deleted
+ assert len(os.listdir(tmpdir)) == 0
+ a1.assert_data_equal(a2)
+
+ def test_mmap_missing_dir(self):
+ m = 10
+ G, positions = get_random_data_example(2, m)
+ with tsinfer.SampleData(sequence_length=m) as sample_data:
+ for genotypes, position in zip(G, positions):
+ sample_data.add_site(position, genotypes)
+ with pytest.raises(FileNotFoundError):
+ tsinfer.generate_ancestors(sample_data, mmap_temp_dir="/does_not_exist")
+
+ @pytest.mark.skipif(IS_WINDOWS, reason="Windows is annoying")
+ def test_mmap_unwriteable_dir(self):
+ m = 10
+ G, positions = get_random_data_example(2, m)
+ with tsinfer.SampleData(sequence_length=m) as sample_data:
+ for genotypes, position in zip(G, positions):
+ sample_data.add_site(position, genotypes)
+
+ with pytest.raises(PermissionError):
+ # Assuming /bin is unwriteable here
+ tsinfer.generate_ancestors(sample_data, mmap_temp_dir="/bin")
+
def test_one_bit_encoding_missing_data(self):
m = 10
G, positions = get_random_data_example(5, m, seed=1234, num_states=3)
diff --git a/tests/test_low_level.py b/tests/test_low_level.py
index 8b414f0a..550a8994 100644
--- a/tests/test_low_level.py
+++ b/tests/test_low_level.py
@@ -26,6 +26,9 @@
import _tsinfer
+IS_WINDOWS = sys.platform == "win32"
+
+
class TestOutOfMemory:
"""
Make sure we raise the correct error when out of memory occurs in
@@ -116,11 +119,17 @@ def test_init(self):
_tsinfer.AncestorBuilder(
num_samples=2, max_sites=2, genotype_encoding=bad_value
)
-
+ with pytest.raises(TypeError):
+ _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=bad_value)
for bad_num_samples in [0, 1]:
with pytest.raises(_tsinfer.LibraryError):
_tsinfer.AncestorBuilder(num_samples=bad_num_samples, max_sites=0)
+ @pytest.mark.skipif(IS_WINDOWS, reason="mmap_fd is a no-op on Windows")
+ def test_bad_fd(self):
+ with pytest.raises(_tsinfer.LibraryError, match="Bad file desc"):
+ _tsinfer.AncestorBuilder(num_samples=2, max_sites=2, mmap_fd=-2)
+
def test_add_site(self):
ab = _tsinfer.AncestorBuilder(num_samples=2, max_sites=10)
for bad_type in ["sdf", {}, None]:
diff --git a/tsinfer/inference.py b/tsinfer/inference.py
index 43e33108..c4c6247a 100644
--- a/tsinfer/inference.py
+++ b/tsinfer/inference.py
@@ -27,6 +27,7 @@
import json
import logging
import queue
+import tempfile
import threading
import time
@@ -349,6 +350,7 @@ def generate_ancestors(
exclude_positions=None,
num_threads=0,
genotype_encoding=None,
+ mmap_temp_dir=None,
# Deliberately undocumented parameters below
engine=constants.C_ENGINE,
progress_monitor=None,
@@ -357,7 +359,7 @@ def generate_ancestors(
):
"""
generate_ancestors(sample_data, *, path=None, exclude_positions=None,\
- num_threads=0, genotype_encoding=None, **kwargs)
+ num_threads=0, genotype_encoding=None, mmap_temp_dir=None, **kwargs)
Runs the ancestor generation :ref:`algorithm `
on the specified :class:`SampleData` instance and returns the resulting
@@ -372,6 +374,30 @@ def generate_ancestors(
Other keyword arguments are passed to the :class:`AncestorData` constructor,
which may be used to control the storage properties of the generated file.
+ Ancestor generation involves loading the entire genotype matrix into
+ memory, by default using one byte per haploid genotype, which can be
+ prohibitively large when working with sample sizes of 100,000 or more.
+ There are two options to help mitigate memory usage. The
+ ``genotype_encoding`` parameter allows the user to specify a more compact
+ encoding scheme, which reduces storage space for datasets with small
+ numbers of alleles. Currently, the :attr:`.GenotypeEncoding.ONE_BIT`
+ encoding is supported, which provides 8-fold compression of biallelic,
+ non-missing data. An error is raised if an encoding that does not support
+ the range of values present in a given dataset is provided.
+
+ The second option for reducing the RAM footprint of this function is to
+ use the ``mmap_temp_dir`` parameter. This allows the genotype data to be
+ cached on file, transparently using the operating system's virtual memory
+ subsystem to swap in and out the data. This can work well if the encoded
+ genotype matrix *almost* fits in RAM and fast local storage is available.
+ However, if the size of the encoded genotype matrix is more than, say,
+ twice the available RAM it is unlikely that this function will complete
+ in a reasonable time-frame. A temporary file is created in the specified
+ ``mmap_temp_dir``, which is automatically deleted when the function
+ completes.
+
+ .. warning:: The ``mmap_temp_dir`` option is a silent no-op on Windows!
+
:param SampleData sample_data: The :class:`SampleData` instance that we are
genering putative ancestors from.
:param str path: The path of the file to store the sample data. If None,
@@ -385,6 +411,11 @@ def generate_ancestors(
:param int genotype_encoding: The encoding to use for genotype data internally
when generating ancestors. See the :class:`.GenotypeEncoding` class for
the available options. Defaults to one-byte per genotype.
+ :param str mmap_temp_dir: The directory within which to create the
+ temporary backing file when using mmaped memory for bulk genotype
+ storage. If None (the default) allocate memory directly using the
+ standard mechanism. This is an advanced option, usually only relevant
+ when working with very large datasets (see above for more information).
:return: The inferred ancestors stored in an :class:`AncestorData` instance.
:rtype: AncestorData
"""
@@ -408,6 +439,7 @@ def generate_ancestors(
num_threads=num_threads,
engine=engine,
genotype_encoding=genotype_encoding,
+ mmap_temp_dir=mmap_temp_dir,
progress_monitor=progress_monitor,
)
generator.add_sites(exclude_positions)
@@ -857,6 +889,7 @@ def __init__(
num_threads=0,
engine=constants.C_ENGINE,
genotype_encoding=constants.GenotypeEncoding.EIGHT_BIT,
+ mmap_temp_dir=None,
progress_monitor=None,
):
self.sample_data = sample_data
@@ -870,13 +903,27 @@ def __init__(
self.inference_site_ids = []
self.num_samples = sample_data.num_samples
self.num_threads = num_threads
-
+ self.mmap_temp_file = None
+ mmap_fd = -1
+
+ genotype_matrix_size = self.max_sites * self.num_samples
+ if genotype_encoding == constants.GenotypeEncoding.ONE_BIT:
+ genotype_matrix_size /= 8
+ genotype_mem = humanize.naturalsize(genotype_matrix_size, binary=True)
+ logging.info(f"Max encoded genotype matrix size={genotype_mem}")
+ if mmap_temp_dir is not None:
+ self.mmap_temp_file = tempfile.NamedTemporaryFile(
+ dir=mmap_temp_dir, prefix="tsinfer-mmap-genotypes-"
+ )
+ logging.info(f"Using mmapped {self.mmap_temp_file.name} for genotypes")
+ mmap_fd = self.mmap_temp_file.fileno()
if engine == constants.C_ENGINE:
logger.debug("Using C AncestorBuilder implementation")
self.ancestor_builder = _tsinfer.AncestorBuilder(
self.num_samples,
self.max_sites,
genotype_encoding=genotype_encoding,
+ mmap_fd=mmap_fd,
)
elif engine == constants.PY_ENGINE:
logger.debug("Using Python AncestorBuilder implementation")
@@ -1087,6 +1134,11 @@ def run(self):
self._run_threaded(progress)
progress.close()
logger.info("Finished building ancestors")
+ if self.mmap_temp_file is not None:
+ try:
+ self.mmap_temp_file.close()
+ except: # noqa
+ pass
return self.ancestor_data