diff --git a/README.md b/README.md index 32dbfd1..43f8821 100644 --- a/README.md +++ b/README.md @@ -232,8 +232,8 @@ strategy** `sourcetracker2 gibbs -i otu_table.biom -m map.txt -o example4/ --burnin 100` **Calculate the proportion of each source in each sink, using a sink -rarefaction depth of 2500** -`sourcetracker2 gibbs -i otu_table.biom -m map.txt -o example5/ --sink_rarefaction_depth 2500` +rarefaction depth of 1700 and source rarefaction depth of 2000** +`sourcetracker2 gibbs -i otu_table.biom -m map.txt -o example5/ --sink_rarefaction_depth 1700 --source_rarefaction_depth 2000` **Calculate the proportion of each source in each sink, using ipyparallel to run in parallel with 5 jobs** `sourcetracker2 gibbs -i otu_table.biom -m map.txt -o example6/ --jobs 5` diff --git a/sourcetracker/_cli/gibbs.py b/sourcetracker/_cli/gibbs.py index fd21459..4a3dbc4 100755 --- a/sourcetracker/_cli/gibbs.py +++ b/sourcetracker/_cli/gibbs.py @@ -173,12 +173,35 @@ def gibbs(table_fp, mapping_fp, output_dir, loo, jobs, alpha1, alpha2, beta, # Rarify collapsed source data if requested. if source_rarefaction_depth > 0: - csources = subsample_dataframe(csources, source_rarefaction_depth) + d = (csources.sum(1) >= source_rarefaction_depth) + if not d.all(): + print(csources.sum(1)) + too_shallow = (~d).sum() + shallowest = csources.sum(1).min() + raise ValueError(('You requested rarefaction of source samples at ' + '%s, but there are %s collapsed source samples ' + 'that have less sequences than that. The ' + 'shallowest of these is %s sequences.') % + (source_rarefaction_depth, too_shallow, + shallowest)) + else: + csources = subsample_dataframe(csources, source_rarefaction_depth) # Prepare sink data and rarify if requested. sinks = feature_table.loc[sink_samples, :] if sink_rarefaction_depth > 0: - sinks = subsample_dataframe(sinks, sink_rarefaction_depth) + d = (sinks.sum(1) >= sink_rarefaction_depth) + if not d.all(): + too_shallow = (~d).sum() + shallowest = sinks.sum(1).min() + raise ValueError(('You requested rarefaction of sink samples at ' + '%s, but there are %s sink samples that have ' + 'less sequences than that. The shallowest of ' + 'these is %s sequences.') % + (sink_rarefaction_depth, too_shallow, + shallowest)) + else: + sinks = subsample_dataframe(sinks, sink_rarefaction_depth) # If we've been asked to do multiple jobs, we need to spin up a cluster. if jobs > 1: