Skip to content

Commit

Permalink
Add mmap_temp_dir option to generate_ancestors
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromekelleher committed Apr 5, 2023
1 parent 77e3466 commit 99fb936
Show file tree
Hide file tree
Showing 10 changed files with 267 additions and 47 deletions.
20 changes: 12 additions & 8 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,17 @@ In development

**Performance improvements**

- Reduce memory usage when running `match_samples` against large cohorts
containing sequences with substantial amounts of error.
- Reduce memory usage when running `match_samples` against large cohorts
containing sequences with substantial amounts of error.
({pr}`761`, {user}`jeromekelleher`)

- `truncate_ancestors` no longer requires loading all the ancestors into RAM.
({pr}`811`, {user}`benjeffery`)

- Reduce memory requirements of the `generate_ancestors` function by providing
the `genotype_encoding` ({pr}`809`) and `mmap_temp_dir` ({pr}`808`) options
({user}`jeromekelleher`).

## [0.3.0] - 2022-10-25

**Features**
Expand All @@ -29,11 +33,11 @@ In development

- The CLI interface now allows recombination rate (or rate maps) and mismatch ratios
to be specified ({pr}`731`, {issue}`435`, {user}`hyanwong`)

- The calls to match-ancestors and match-samples via the CLI are now logged
in the provenance entries of the output tree sequence ({pr}`732`, {pr}`741`,
{issue}`730`, {user}`hyanwong`)

- The CLI interface allows `--no-post-process` to be specified (for details of post-
processing, see "Breaking changes" below) ({pr}`727`, {issue}`721` {user}`hyanwong`)

Expand All @@ -45,7 +49,7 @@ In development
- `sample_data.subset()` now accepts a sequence_length ({pr}`681`, {user}`hyanwong`)

- `verify` no longer raises an error when comparing a genotype to missingness.
({pr}`716`, {issue}`625`, {user}`benjeffery`)
({pr}`716`, {issue}`625`, {user}`benjeffery`)

**Breaking changes**:

Expand All @@ -54,8 +58,8 @@ In development
tsinfer to aid the matching process) then splits the ultimate ancestor into separate
pieces. If splitting is not required, the `post_process` step can also be called as a
separate function with the parameter `split_ultimate=False` ({pr}`687`, {pr}`750`,
{issue}`673`, {user}`hyanwong`)
{issue}`673`, {user}`hyanwong`)

- Post-processing by default erases tree topology that exists before the first site
and one unit after the last site, to avoid extrapolating into regions with no data.
This can be disabled by calling `post_process` step as a separate function with the
Expand Down Expand Up @@ -94,7 +98,7 @@ In development
- Oldest nodes in a standard inferred tree sequence are no longer set to frequencies ~2
and ~3 (i.e. 2 or 3 times as old as all the other nodes), but are spaced above the
others by the mean time between unique ancestor ages ({pr}`485`, {user}`hyanwong`)

- The `tsinfer.SampleData.from_tree_sequence()` function now defaults to setting
`use_sites_time` and `use_individuals_time` to `False` rather than `True`
({pr}`599`, {user}`hyanwong`)
Expand Down
9 changes: 5 additions & 4 deletions _tsinfermodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,14 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
{
int ret = -1;
int err;
static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", NULL};
static char *kwlist[] = {"num_samples", "max_sites", "genotype_encoding", "mmap_fd", NULL};
int num_samples, max_sites, genotype_encoding;
int flags = 0;
int mmap_fd = -1;

self->builder = NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|i", kwlist,
&num_samples, &max_sites, &genotype_encoding)) {
if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|ii", kwlist,
&num_samples, &max_sites, &genotype_encoding, &mmap_fd)) {
goto out;
}
self->builder = PyMem_Malloc(sizeof(ancestor_builder_t));
Expand All @@ -108,7 +109,7 @@ AncestorBuilder_init(AncestorBuilder *self, PyObject *args, PyObject *kwds)
}
flags = genotype_encoding;
Py_BEGIN_ALLOW_THREADS
err = ancestor_builder_alloc(self->builder, num_samples, max_sites, flags);
err = ancestor_builder_alloc(self->builder, num_samples, max_sites, mmap_fd, flags);
Py_END_ALLOW_THREADS
if (err != 0) {
handle_library_error(err);
Expand Down
103 changes: 94 additions & 9 deletions lib/ancestor_builder.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
** You should have received a copy of the GNU General Public License
** along with tsinfer. If not, see <http://www.gnu.org/licenses/>.
*/
/* Memory-mapped genotype storage is only built on non-Windows platforms.
 * It's not worth trying to get it working on Windows: there MMAP_GENOTYPES
 * is left undefined, so requesting mmap'd genotypes is a silent no-op.
 */
#if !defined(_WIN32)
/* Needed for ftruncate */
#define _XOPEN_SOURCE 700
#define MMAP_GENOTYPES 1
#endif

#include "tsinfer.h"
#include "err.h"
Expand All @@ -25,6 +34,17 @@
#include <string.h>
#include <stdbool.h>

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>

#ifdef MMAP_GENOTYPES
#include <sys/mman.h>
#include <unistd.h>
#include <sys/types.h>
#endif

#include "avl.h"

/* Note: using an unoptimised version of bit packing here because we're
Expand Down Expand Up @@ -135,6 +155,7 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)

fprintf(out, "Ancestor builder\n");
fprintf(out, "flags = %d\n", (int) self->flags);
fprintf(out, "mmap_fd = %d\n", self->mmap_fd);
fprintf(out, "num_samples = %d\n", (int) self->num_samples);
fprintf(out, "num_sites = %d\n", (int) self->num_sites);
fprintf(out, "num_ancestors = %d\n", (int) self->num_ancestors);
Expand Down Expand Up @@ -181,23 +202,62 @@ ancestor_builder_print_state(ancestor_builder_t *self, FILE *out)
return 0;
}

int
ancestor_builder_alloc(
ancestor_builder_t *self, size_t num_samples, size_t max_sites, int flags)
#ifdef MMAP_GENOTYPES

/* Backs the genotype store with the file referred to by self->mmap_fd.
 *
 * The file is resized to max_sites * encoded_genotypes_size bytes and then
 * mapped read/write with MAP_SHARED. On success mmap_buffer points at the
 * mapping and mmap_offset is reset to zero.
 *
 * Returns 0 on success, or TSI_ERR_IO if either ftruncate or mmap fails.
 * When mmap fails, mmap_buffer is set to NULL.
 */
static int
ancestor_builder_make_genotype_mmap(ancestor_builder_t *self)
{
    int ret = 0;

    /* max_sites, encoded_genotypes_size and mmap_fd must already be set on
     * self by the caller; nothing here may reset the builder state. */
    self->mmap_size = self->max_sites * self->encoded_genotypes_size;
    if (ftruncate(self->mmap_fd, (off_t) self->mmap_size) != 0) {
        ret = TSI_ERR_IO;
        goto out;
    }
    self->mmap_buffer = mmap(
        NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, self->mmap_fd, 0);
    if (self->mmap_buffer == MAP_FAILED) {
        /* Store NULL rather than MAP_FAILED so that NULL checks on
         * mmap_buffer see that no mapping exists. */
        self->mmap_buffer = NULL;
        ret = TSI_ERR_IO;
        goto out;
    }
    self->mmap_offset = 0;
out:
    return ret;
}

/* Releases the genotype mapping, if one exists, and truncates the backing
 * file referred to by self->mmap_fd to zero length.
 *
 * This is best-effort cleanup: failures of munmap and ftruncate are
 * deliberately ignored. Always returns 0.
 */
static int
ancestor_builder_free_genotype_mmap(ancestor_builder_t *self)
{
    if (self->mmap_buffer != NULL) {
        /* There's nothing we can do about it here, so don't check errors. */
        munmap(self->mmap_buffer, self->mmap_size);
        /* Forget the mapping so that a repeated call cannot unmap it twice
         * and no stale pointer into the released region remains. */
        self->mmap_buffer = NULL;
        self->mmap_size = 0;
        self->mmap_offset = 0;
    }
    /* Try to truncate to zero so we don't flush out all the data */
    ftruncate(self->mmap_fd, 0);
    return 0;
}
#endif

int
ancestor_builder_alloc(ancestor_builder_t *self, size_t num_samples, size_t max_sites,
int mmap_fd, int flags)
{
int ret = 0;
unsigned long max_size = 1024 * 1024;

memset(self, 0, sizeof(ancestor_builder_t));
self->num_samples = num_samples;
self->max_sites = max_sites;
self->mmap_fd = mmap_fd;
self->num_sites = 0;
self->flags = flags;

if (num_samples <= 1) {
ret = TSI_ERR_BAD_NUM_SAMPLES;
goto out;
}
if (self->flags & TSI_GENOTYPE_ENCODING_ONE_BIT) {
self->encoded_genotypes_size = (num_samples / 8) + ((num_samples % 8) != 0);
self->decoded_genotypes_size = self->encoded_genotypes_size * 8;
Expand Down Expand Up @@ -228,6 +288,14 @@ ancestor_builder_alloc(
if (ret != 0) {
goto out;
}
#if MMAP_GENOTYPES
if (self->mmap_fd != -1) {
ret = ancestor_builder_make_genotype_mmap(self);
if (ret != 0) {
goto out;
}
}
#endif
avl_init_tree(&self->time_map, cmp_time_map, NULL);
out:
return ret;
Expand All @@ -236,13 +304,19 @@ ancestor_builder_alloc(
/* Returns the combined total_size of the main and indexing block
 * allocators. The other allocations are ignored as insignificant, and the
 * size of the mmap'd region is not reported.
 */
size_t
ancestor_builder_get_memsize(const ancestor_builder_t *self)
{
    size_t total = self->main_allocator.total_size;

    total += self->indexing_allocator.total_size;
    return total;
}

int
ancestor_builder_free(ancestor_builder_t *self)
{
#if MMAP_GENOTYPES
if (self->mmap_fd != -1) {
ancestor_builder_free_genotype_mmap(self);
}
#endif
tsi_safe_free(self->sites);
tsi_safe_free(self->descriptors);
tsk_safe_free(self->genotype_encode_buffer);
Expand Down Expand Up @@ -558,7 +632,18 @@ ancestor_builder_encode_genotypes(
/* Returns storage for one site's encoded genotypes
 * (encoded_genotypes_size bytes).
 *
 * When no mmap buffer is active (mmap_buffer is NULL) the memory comes from
 * the main block allocator. Otherwise the next unused slice of the mmap'd
 * region is handed out and mmap_offset is advanced past it.
 */
static uint8_t *
ancestor_builder_allocate_genotypes(ancestor_builder_t *self)
{
    uint8_t *ret = NULL;
    void *p;

    if (self->mmap_buffer == NULL) {
        ret = tsk_blkalloc_get(&self->main_allocator, self->encoded_genotypes_size);
    } else {
        p = (char *) self->mmap_buffer + self->mmap_offset;
        self->mmap_offset += self->encoded_genotypes_size;
        /* The mapping is sized up front, so running past it is a logic
         * error rather than a recoverable condition. */
        assert(self->mmap_offset <= self->mmap_size);
        ret = (uint8_t *) p;
    }
    return ret;
}

int WARN_UNUSED
Expand Down
4 changes: 4 additions & 0 deletions lib/err.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/

#include "err.h"
#include <tskit.h>

const char *
tsi_strerror(int err)
Expand Down Expand Up @@ -105,6 +106,9 @@ tsi_strerror(int err)
case TSI_ERR_ONE_BIT_NON_BINARY:
ret = "One-bit genotype encoding only supports binary 0/1 data";
break;
case TSI_ERR_IO:
ret = tsk_strerror(TSK_ERR_IO);
break;
}
return ret;
}
1 change: 1 addition & 0 deletions lib/err.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#define TSI_ERR_MATCH_IMPOSSIBLE_EXTREME_MUTATION_PROBA -22
#define TSI_ERR_MATCH_IMPOSSIBLE_ZERO_RECOMB_PRECISION -23
#define TSI_ERR_ONE_BIT_NON_BINARY -24
#define TSI_ERR_IO -25
// clang-format on

#ifdef __GNUC__
Expand Down
Loading

0 comments on commit 99fb936

Please sign in to comment.