Skip to content

Commit

Permalink
Merge pull request #355 from jeromekelleher/remove-ucsc-problematic
Browse files Browse the repository at this point in the history
Remove UCSC problematic sites list
  • Loading branch information
jeromekelleher authored Oct 10, 2024
2 parents 6f9df5c + 4dbe849 commit f2f2b19
Show file tree
Hide file tree
Showing 8 changed files with 405 additions and 752 deletions.
465 changes: 300 additions & 165 deletions notebooks/test_ts.ipynb

Large diffs are not rendered by default.

47 changes: 25 additions & 22 deletions sc2ts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,10 +259,14 @@ def add_provenance(ts, output_file):
@click.argument("ts", type=click.Path(dir_okay=False))
@click.argument("match_db", type=click.Path(dir_okay=False))
@click.option(
"--additional-problematic-sites",
"--problematic-sites",
default=None,
type=click.Path(exists=True, dir_okay=False),
help="File containing the list of additional problematic sites to exclude.",
help=(
"File containing the list of problematic sites to exclude. "
"Note this is combined with the sites defined by --mask-flanks "
"and --mask-problematic-regions options"
),
)
@click.option(
"--mask-flanks",
Expand All @@ -277,39 +281,38 @@ def add_provenance(ts, output_file):
"--mask-problematic-regions",
is_flag=True,
flag_value=True,
help=(
"If true, add the problematic regions problematic sites"
),
help=("If true, add the problematic regions problematic sites"),
)
@click.option("-v", "--verbose", count=True)
@click.option("-l", "--log-file", default=None, type=click.Path(dir_okay=False))
def initialise(
ts, match_db, additional_problematic_sites, mask_flanks, mask_problematic_regions, verbose, log_file
ts,
match_db,
problematic_sites,
mask_flanks,
mask_problematic_regions,
verbose,
log_file,
):
"""
Initialise a new base tree sequence to begin inference.
"""
setup_logging(verbose, log_file)

additional_problematic = np.array([], dtype=int)
if additional_problematic_sites is not None:
additional_problematic = np.loadtxt(
additional_problematic_sites, ndmin=1
).astype(int)
logger.info(
f"Excluding additional {len(additional_problematic)} problematic sites"
)
problematic = np.array([], dtype=int)
if problematic_sites is not None:
problematic = np.loadtxt(problematic_sites, ndmin=1).astype(int)
logger.info(f"Loaded {len(problematic)} problematic sites")
if mask_flanks:
additional_problematic = np.concatenate(
(core.get_flank_coordinates(), additional_problematic)
)
flanks = core.get_flank_coordinates()
logger.info(f"Masking {len(flanks)} sites in flanks")
problematic = np.concatenate((flanks, problematic))
if mask_problematic_regions:
additional_problematic = np.concatenate(
(core.get_problematic_regions(), additional_problematic)
)
known_regions = core.get_problematic_regions()
logger.info(f"Masking {len(known_regions)} sites in known problematic regions")
problematic = np.concatenate((known_regions, problematic))

additional_problematic = np.unique(additional_problematic)
base_ts = sc2ts.initial_ts(additional_problematic.tolist(), use_ucsc=False)
base_ts = sc2ts.initial_ts(np.unique(problematic))
add_provenance(base_ts, ts)
logger.info(f"New base ts at {ts}")
sc2ts.MatchDb.initialise(match_db)
Expand Down
14 changes: 10 additions & 4 deletions sc2ts/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,6 @@ def __len__(self):
data_path = pathlib.Path(__file__).parent / "data"


def get_problematic_sites():
return np.loadtxt(data_path / "problematic_sites.txt", dtype=np.int64)


def get_problematic_regions():
"""
These regions have been reported to have highly recurrent or unusual
Expand Down Expand Up @@ -96,6 +92,16 @@ def get_flank_coordinates():
)


def get_masked_sites(ts):
    """
    Return the positions along the sequence that carry no site in *ts*.

    Builds a boolean mask over ``[0, ts.sequence_length)``, clears the
    entries at every used site position (and at position 0, which is
    never a valid site), and returns the remaining positions.
    """
    masked = np.ones(int(ts.sequence_length), dtype=bool)
    # Position 0 is excluded by convention (1-based genome coordinates).
    masked[0] = False
    masked[ts.sites_position.astype(int)] = False
    return np.flatnonzero(masked)


@dataclasses.dataclass
class CovLineage:
name: str
Expand Down
Loading

0 comments on commit f2f2b19

Please sign in to comment.