From 770e6753675643966f7952d10fe75e2d4a881566 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Mon, 4 Mar 2019 21:55:16 -0500 Subject: [PATCH 1/4] Changed sampling of denoised copy ratios to address memory spike and updated output formats and filenames. --- .../hellbender/gcnvkernel/io/io_consts.py | 4 ++-- .../gcnvkernel/io/io_denoising_calling.py | 20 +++++++++---------- .../hellbender/gcnvkernel/models/commons.py | 8 +++++--- .../models/model_denoising_calling.py | 13 ++++++++---- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_consts.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_consts.py index deabf936017..5f422acb3c0 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_consts.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_consts.py @@ -54,8 +54,8 @@ default_class_log_posterior_tsv_filename = "log_q_tau_tk.tsv" default_baseline_copy_number_tsv_filename = "baseline_copy_number_t.tsv" default_copy_number_segments_tsv_filename = "copy_number_segments.tsv" -default_denoised_copy_ratios_mean_tsv_filename = "denoised_copy_ratios_mu.tsv" -default_denoised_copy_ratios_std_tsv_filename = "denoised_copy_ratios_std.tsv" +default_denoised_copy_ratios_mean_tsv_filename = "mu_denoised_copy_ratio_t.tsv" +default_denoised_copy_ratios_std_tsv_filename = "std_denoised_copy_ratio_t.tsv" default_denoising_config_json_filename = "denoising_config.json" default_calling_config_json_filename = "calling_config.json" diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_denoising_calling.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_denoising_calling.py index 35288f1b8ff..ab051e2aff3 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_denoising_calling.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/io/io_denoising_calling.py @@ -154,12 +154,12 @@ def __call__(self): self.denoising_model_approx) # compute approximate denoised copy ratios - denoising_copy_ratios_approx_generator = commons.get_sampling_generator_for_model_approximation( - model_approx=self.denoising_model_approx, model_var_name='denoised_copy_ratios') - denoised_copy_ratios_mean, denoised_copy_ratios_variance =\ - math.calculate_mean_and_variance_online(denoising_copy_ratios_approx_generator) - denoised_copy_ratios_mean = np.transpose(denoised_copy_ratios_mean) - denoised_copy_ratios_std = np.transpose(np.sqrt(denoised_copy_ratios_variance)) + _logger.info("Sampling and approximating posteriors for denoised copy ratios...") + denoising_copy_ratios_st_approx_generator = commons.get_sampling_generator_for_model_approximation( + model_approx=self.denoising_model_approx, node=self.denoising_model['denoised_copy_ratio_st']) + mu_denoised_copy_ratio_st, var_denoised_copy_ratio_st =\ + math.calculate_mean_and_variance_online(denoising_copy_ratios_st_approx_generator) + std_denoised_copy_ratio_st = np.sqrt(var_denoised_copy_ratio_st) for si, sample_name in enumerate(self.denoising_calling_workspace.sample_names): sample_name_comment_line = [io_consts.sample_name_sam_header_prefix + sample_name] @@ -201,20 +201,20 @@ def __call__(self): write_shape_info=False) # write denoised copy ratio means - denoised_copy_ratio_mu_s = denoised_copy_ratios_mean[:, si] + mu_denoised_copy_ratio_t = mu_denoised_copy_ratio_st[si, :] io_commons.write_ndarray_to_tsv( os.path.join(sample_posterior_path, io_consts.default_denoised_copy_ratios_mean_tsv_filename), - denoised_copy_ratio_mu_s, + mu_denoised_copy_ratio_t, extra_comment_lines=sample_name_comment_line, header=io_consts.denoised_copy_ratio_mean_column_name, write_shape_info=False ) # write denoised copy ratio standard deviations - denoised_copy_ratio_std_s = denoised_copy_ratios_std[:, si] + std_denoised_copy_ratio_t = std_denoised_copy_ratio_st[si, :] io_commons.write_ndarray_to_tsv( os.path.join(sample_posterior_path, io_consts.default_denoised_copy_ratios_std_tsv_filename), - denoised_copy_ratio_std_s, + std_denoised_copy_ratio_t, extra_comment_lines=sample_name_comment_line, header=io_consts.denoised_copy_ratio_std_column_name, write_shape_info=False diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py index 3a483db4192..a52e48d89ae 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py @@ -226,16 +226,18 @@ def add_sample_to_cum_sum(posterior_sample, _cum_sum): return outputs[-1] / size -def get_sampling_generator_for_model_approximation(model_approx: pm.MeanField, model_var_name: str, +def get_sampling_generator_for_model_approximation(model_approx: pm.MeanField, node, num_samples: int = 20) -> Generator: """Get a generator that returns samples of a precomputed model approximation for a specific variable in that model Args: model_approx: an instance of PyMC3 mean-field approximation - model_var_name: a stochastic node in the model + node: a stochastic node in the model num_samples: number of samples to draw Returns: A generator that will yield `num_samples` samples from an approximation to a posterior """ - return (model_approx.sample()[model_var_name] for _ in range(num_samples)) + + sample = stochastic_node_mean_symbolic(model_approx, node, size=1) + return (sample.eval() for _ in range(num_samples)) diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py index 81d3fa62356..f557aab6db3 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py @@ -437,13 +437,18 @@ def __init__(self, self.log_trans_tkk: Optional[np.ndarray] = None # GC bias factors - # (to be initialize by calling `initialize_bias_inference_vars`) + # (to be initialized by calling `initialize_bias_inference_vars`) self.W_gc_tg: Optional[tst.SparseConstant] = None # auxiliary data structures for hybrid q_c_expectation_mode calculation - # (to be initialize by calling `initialize_bias_inference_vars`) + # (to be initialized by calling `initialize_bias_inference_vars`) self.interval_neighbor_index_list: Optional[List[List[int]]] = None + # denoised copy ratios + denoised_copy_ratio_st = np.zeros((self.num_samples, self.num_intervals), dtype=types.floatX) + self.denoised_copy_ratio_st: types.TensorSharedVariable = th.shared( + denoised_copy_ratio_st, name="denoised_copy_ratio_st", borrow=config.borrow_numpy) + # initialize posterior posterior_initializer.initialize_posterior(denoising_config, calling_config, self) self.initialize_bias_inference_vars() @@ -778,10 +783,10 @@ def __init__(self, # the expected number of erroneously mapped reads mean_mapping_error_correction_s = eps * read_depth_s * shared_workspace.average_ploidy_s - denoised_copy_ratios = ((shared_workspace.n_st - mean_mapping_error_correction_s.dimshuffle(0, 'x')) + denoised_copy_ratio_st = ((shared_workspace.n_st - mean_mapping_error_correction_s.dimshuffle(0, 'x')) / ((1.0 - eps) * read_depth_s.dimshuffle(0, 'x') * bias_st)) - Deterministic(name='denoised_copy_ratios', var=denoised_copy_ratios) + Deterministic(name='denoised_copy_ratio_st', var=denoised_copy_ratio_st) mu_stc = ((1.0 - eps) * read_depth_s.dimshuffle(0, 'x', 'x') * bias_st.dimshuffle(0, 1, 'x') From 5823f508b4293c8f3dba220721def6d04b017cc8 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Fri, 8 Mar 2019 10:22:57 -0500 Subject: [PATCH 2/4] Updated theano version to 1.0.4 and changed numpy install source to conda defaults to enable MKL. --- scripts/gatkcondaenv.yml.template | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template index a62e0d9bbcf..74ee61bb093 100644 --- a/scripts/gatkcondaenv.yml.template +++ b/scripts/gatkcondaenv.yml.template @@ -8,6 +8,7 @@ dependencies: - intel-openmp=2018.0.0 - mkl=2018.0.1 - mkl-service=1.1.2 +- defaults::numpy==1.13.3 - openssl=1.0.2l=0 - pip=9.0.1=py36_1 - python=3.6.2=0 @@ -29,7 +30,6 @@ dependencies: - keras==2.2.0 - markdown==2.6.9 - matplotlib==2.1.0 - - numpy==1.13.3 - pandas==0.21.0 - patsy==0.4.1 - protobuf==3.5.0.post1 @@ -44,7 +44,7 @@ dependencies: - scipy==1.0.0 - six==1.11.0 - $tensorFlowDependency - - theano==0.9.0 + - theano==1.0.4 - tqdm==4.19.4 - werkzeug==0.12.2 - gatkPythonPackageArchive.zip From 96b2f8cdf1957fc1d58cdcd04565399152315334 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Sat, 9 Mar 2019 20:03:11 -0500 Subject: [PATCH 3/4] Updated theano flags to use MKL and OpenMP elemwise. --- .../hellbender/tools/copynumber/case_denoising_calling.py | 3 ++- .../tools/copynumber/case_determine_ploidy_and_depth.py | 3 ++- .../hellbender/tools/copynumber/cohort_denoising_calling.py | 3 ++- .../tools/copynumber/cohort_determine_ploidy_and_depth.py | 3 ++- .../hellbender/tools/copynumber/segment_gcnv_calls.py | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_denoising_calling.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_denoising_calling.py index ce04b66ac3a..54cbd2a834e 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_denoising_calling.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_denoising_calling.py @@ -1,7 +1,8 @@ import os # set theano flags -os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true" +os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \ + "openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10" import logging import argparse diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_determine_ploidy_and_depth.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_determine_ploidy_and_depth.py index 4c4600702d6..0929dbf2048 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_determine_ploidy_and_depth.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/case_determine_ploidy_and_depth.py @@ -1,7 +1,8 @@ import os # set theano flags -os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true" +os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \ + "openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10" import argparse import gcnvkernel diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_denoising_calling.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_denoising_calling.py index 9fa4768b3d4..aa1ccfd2348 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_denoising_calling.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_denoising_calling.py @@ -2,7 +2,8 @@ import shutil # set theano flags -os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true" +os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \ + "openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10" import logging import argparse diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_determine_ploidy_and_depth.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_determine_ploidy_and_depth.py index e5b1190b948..1c557aac27d 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_determine_ploidy_and_depth.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/cohort_determine_ploidy_and_depth.py @@ -1,7 +1,8 @@ import os # set theano flags -os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true" +os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \ + "openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10" import argparse import gcnvkernel diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py index f38631a7c9f..87f4557fd6e 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/copynumber/segment_gcnv_calls.py @@ -1,7 +1,8 @@ import os # set theano flags -os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore,openmp=true" +os.environ["THEANO_FLAGS"] = "device=cpu,floatX=float64,optimizer=fast_run,compute_test_value=ignore," + \ + "openmp=true,blas.ldflags=-lmkl_rt,openmp_elemwise_minsize=10" import logging import argparse From f772b3ba6b895b9b32222fd1396c45a17e9daf83 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Fri, 15 Mar 2019 17:45:11 -0400 Subject: [PATCH 4/4] Addressed PR comments. --- .../org/broadinstitute/hellbender/gcnvkernel/models/commons.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py index a52e48d89ae..b0bb3c2aeb9 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/commons.py @@ -239,5 +239,5 @@ def get_sampling_generator_for_model_approximation(model_approx: pm.MeanField, n A generator that will yield `num_samples` samples from an approximation to a posterior """ - sample = stochastic_node_mean_symbolic(model_approx, node, size=1) + sample = model_approx.sample_node(node, size=1)[0] return (sample.eval() for _ in range(num_samples))