# mvp: Motif-Variant Probe

#!/usr/bin/env python

from __future__ import print_function

from argparse import ArgumentParser
from functools import reduce
import re
import sys

from Bio import SeqIO
from Bio.Seq import Seq
import pysam


def main(infile, outfile, reference, motif_list, seq_type='dna'):
    '''
    Identify variants that cause the gain or loss of motifs.

    infile     -- path to a .vcf or .vcf.gz file of variants
    outfile    -- where to write the results table; '-' (or None) means stdout
    reference  -- path to a FASTA file, loaded with SeqIO.read
    motif_list -- motifs given as simple regular expressions and/or IUPAC codes
    seq_type   -- 'dna' or 'aa'; reverse-complement partners only exist for DNA
    '''

    # Open the VCF once up front purely to validate the path early (before the
    # reference is loaded), and release the handle straight away.
    read_input(infile).close()

    sequence = str(SeqIO.read(reference, "fasta").seq)
    motif_list = validate_motifs(motif_list, seq_type)

    # A motif identical to its own reverse complement has no distinct partner.
    partner_motif_list = []
    for motif in motif_list:
        revcomp = motif_revcomp(motif, seq_type)
        partner_motif_list.append(revcomp if revcomp != motif else None)

    motif_counts = dict()

    for motif, partner in zip(motif_list, partner_motif_list):
        # A fresh handle is needed per motif because iterating the VCF
        # consumes it; close each one once the motif has been processed.
        vcf_file = read_input(infile)
        try:
            motif_counts[motif] = process_motif(vcf_file, sequence, motif, partner)
        finally:
            vcf_file.close()

    if outfile in (None, '-'):
        print_motif_summary(motif_counts)
        return

    # print_motif_summary() writes to sys.stdout, so point stdout at the
    # requested file for the duration of the call and always restore it.
    with open(outfile, 'w') as handle:
        original_stdout = sys.stdout
        sys.stdout = handle
        try:
            print_motif_summary(motif_counts)
        finally:
            sys.stdout = original_stdout

def process_motif(vcf_file, sequence, motif, partner):
    '''
    Count occurrences of one motif around every cluster of variants.

    Variants are clustered with gerrymander() using a radius of one less than
    the motif length. For each cluster the motif (and its partner, if any) is
    counted in the reference segment and in the variant segment.

    Returns a dictionary keyed by the position of the first variant in each
    cluster. Each value maps a strand ('.', '+' or '-') to a dictionary with
    'reference' and 'query' counts; strands where both counts are zero are
    left out.
    '''
    flank = motif_len(motif) - 1
    counts_by_position = {}

    for cluster in gerrymander(vcf_file, flank):
        before = motifs(reference_segment(cluster, sequence, flank), motif, partner)
        after = motifs(variant_segment(cluster, sequence, flank), motif, partner)

        counts_by_position[cluster[0].pos] = {
            strand: dict(reference=before[strand], query=after[strand])
            for strand in ('.', '+', '-')
            if before[strand] != 0 or after[strand] != 0
        }

    return counts_by_position

def read_input(infile):
    '''
    Open a VCF file for reading, based on its file extension.

    Names ending in '.vcf.gz' are opened with mode 'rb' and names ending in
    '.vcf' with mode 'r'. Any other name prints an error to stderr and exits
    with status 1.
    '''
    if infile.endswith('.vcf.gz'):
        return pysam.VariantFile(infile, 'rb')
    if infile.endswith('.vcf'):
        return pysam.VariantFile(infile, 'r')

    print("Input file must be VCF or compressed VCF",file=sys.stderr)
    sys.exit(1)

def validate_motifs(motif_list, seq_type):
    '''
    Check validity of the input motifs and standardize their format

    Takes a list of motif strings as input and returns them as a new list
    of regular expressions (IUPAC ambiguity codes expanded). The input list
    is not modified.

    A motif that is empty, or that contains digits, '.', braces, parentheses
    or backslashes, is reported as invalid and the program exits with status 1
    once all motifs have been checked. A motif whose converted form is not a
    compilable regular expression is reported and skipped. The program also
    exits with status 1 if no motif survives.
    '''

    # The input regular expression should be simple, containing only brackets
    # and characters within our target alphabet.
    # This check can be improved.
    forbidden = re.compile(r'[\d\.\{\}\(\)\\]')

    found_bad_motif = False
    validated = []

    for original_string in motif_list:
        # An empty motif would match everywhere and has length zero.
        if not original_string.strip() or forbidden.search(original_string):
            print(("Motif submitted as {} is invalid."
                  " Please use simple regular expressions and/or IUPAC ambiguity codes.").format(original_string),
                  file=sys.stderr)
            found_bad_motif = True
            continue

        converted = iupac2regex(original_string, seq_type)
        try:
            re.compile(converted)
        except re.error:
            print(("Error: Motif submitted as {} was converted to the invalid regular expression {}."
                  " Please report this problem. The motif will be skipped for now.").format(original_string,converted),
                  file=sys.stderr)
            # Skip by not collecting it; deleting from the list being
            # iterated would shift later entries and break the loop.
            continue

        validated.append(converted)

    if found_bad_motif:
        print("At least one motif provided was invalid. Please correct your inputs and try again.",
              file=sys.stderr)
        sys.exit(1)
    elif not validated:
        print("There are no valid motifs to process.",
              file=sys.stderr)
        sys.exit(1)

    return validated


# Lookup tables from IUPAC ambiguity codes to the letters each code stands
# for, one table per sequence type. iupac2regex() wraps these letters in
# square brackets to build a regular-expression character class.
iupac_map = {
    'dna': {
        'R': "AG",
        'Y': "CT",
        'S': "GC",
        'W': "AT",
        'K': "GT",
        'M': "AC",
        'B': "CGT",
        'D': "AGT",
        'H': "ACT",
        'V': "ACG",
        'N': "ACGT",
    },
    'aa': {
        'B': "DN",
        'X': "ACDEFGHIKLMNPQRSTVWY",
        'Z': "EQ",
    },
}

# Reverse lookup used by regex2iupac(). Each key is the sorted tuple of the
# unique letters a code stands for (a tuple, because dictionary keys must be
# hashable), so a character class matches regardless of letter order or
# repeated letters.
inv_iupac_map = {
    alphabet: {
        tuple(sorted(set(letters))): code
        for code, letters in codes.items()
    }
    for alphabet, codes in iupac_map.items()
}

def iupac2regex(seq, seq_type):
    '''
    Convert sequence with IUPAC ambiguity codes to a regular expression

    The sequence is upper-cased first. Each ambiguity code found in
    iupac_map[seq_type] becomes a bracketed character class, e.g. 'R'
    becomes '[AG]' for DNA. A code that already sits inside a bracketed
    class is expanded without extra brackets, so '[AR]' becomes '[AAG]'
    rather than the malformed '[A[AG]]'. All other characters are kept.

    Raises KeyError if seq_type is not a key of iupac_map.
    '''
    codes = iupac_map[seq_type]
    pieces = []
    inside_class = False

    for character in seq.upper():
        if character == '[':
            inside_class = True
        elif character == ']':
            inside_class = False

        expansion = codes.get(character)
        if expansion is None:
            pieces.append(character)
        elif inside_class:
            # Already within [...]: adding brackets would nest them.
            pieces.append(expansion)
        else:
            pieces.append('[' + expansion + ']')

    return ''.join(pieces)

def regex2iupac(seq, seq_type):
    '''
    Convert a sequence with simple regular expressions to its IUPAC representation

    The sequence is upper-cased and every bracketed character class is
    replaced by the ambiguity code that inv_iupac_map[seq_type] lists for
    that set of letters. Letter order and repeats inside the class do not
    matter. A class holding a single distinct letter collapses to that letter.

    Raises KeyError when a class with several letters has no ambiguity code
    for the given sequence type.
    '''
    def collapse(match):
        members = tuple(sorted(set(match.group(1))))
        if len(members) == 1:
            # '[A]' means just 'A'; there is no ambiguity code to look up.
            return members[0]
        return inv_iupac_map[seq_type][members]

    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes.
    return re.sub(r'\[([^\]]*)\]', collapse, seq.upper())

def motif_revcomp(motif, seq_type):
    '''
    Return the reverse complement of a DNA motif as a regular expression

    The motif is converted to IUPAC notation, reverse-complemented through
    Bio.Seq.Seq, and converted back to a regular expression. For any sequence
    type other than DNA (compared case-insensitively) the result is None.
    '''
    if seq_type.lower() == 'dna':
        iupac_motif = regex2iupac(motif, seq_type)
        flipped = str(Seq(iupac_motif).reverse_complement())
        return iupac2regex(flipped, seq_type)

    return None

def motif_len(motif):
    '''
    Count the sequence positions covered by a simple-regex motif

    Every bracketed character class stands for one position, whatever its
    written length; every other character stands for one position as well.
    '''
    bracketed_class = r'\[[^\[]*\]'

    # Each class contributes a single position, so subtract the characters
    # it occupies beyond the first.
    surplus = sum(len(found) - 1 for found in re.findall(bracketed_class, motif))
    return len(motif) - surplus

def gerrymander(vcf_file, radius):
    '''
    Return variants positioned within a certain range of each other

    Walks the records in order and groups consecutive ones into blocks: a
    record starts a new block when its position lies more than `radius`
    past the previous record's position. Returns a list of non-empty
    lists of records; an input without records gives an empty list.

    Assumes the records arrive sorted by position on a single sequence.
    '''
    districts = []
    block = []
    last_position = None

    for record in vcf_file:
        # Only an existing block can be closed. Testing `block` instead of
        # measuring the first record against position 0 keeps a leading
        # variant with pos <= radius from being discarded.
        if block and record.pos - last_position > radius:
            districts.append(block)
            block = []

        block.append(record)
        last_position = record.pos

    # The last block didn't get added yet
    if block:
        districts.append(block)

    return districts

def _segment_start(neighborhood, radius):
    '''
    Zero-based index in the reference where a neighborhood's window begins
    '''
    # 1 must be subtracted from the VCF position because VCF positions are
    # one-based and Python's string indexing is zero-based.
    #
    # The result is clamped at zero: a variant closer than `radius` to the
    # start of the sequence would otherwise give a negative index, which
    # Python slicing interprets as counting from the end of the string.
    return max(0, neighborhood[0].pos - 1 - radius)

def reference_segment(neighborhood, sequence, radius):
    '''
    Return the upper-cased reference bases spanning a neighborhood

    The window runs from `radius` bases before the first variant to `radius`
    bases after the position of the last variant, truncated at either end of
    the sequence.
    '''
    start = _segment_start(neighborhood, radius)
    # The last variant sits at zero-based index pos-1. Python slices are
    # intervals of the form [a, b), so the exclusive end is
    # (pos - 1) + radius + 1.
    end = neighborhood[-1].pos + radius
    return sequence[start:end].upper()

def variant_segment(neighborhood, sequence, radius):
    '''
    Apply variants to get the consensus sequence for the given segment

    Starts from reference_segment() and substitutes, for every variant in
    the neighborhood, the first alternate allele for the reference allele at
    the variant's position. Variants without an alternate allele are left as
    reference. Returns the upper-cased result.

    The reference allele is replaced by position and length; its text is not
    compared against the sequence.
    '''
    segment = reference_segment(neighborhood, sequence, radius)
    start = _segment_start(neighborhood, radius)

    # Work from the rightmost variant to the leftmost so that a length
    # change caused by an insertion or deletion never shifts the position
    # of a variant that still has to be applied.
    for variant in reversed(neighborhood):
        if not variant.alts:
            continue
        local_position = variant.pos - 1 - start
        segment = (
            segment[:local_position]
            + variant.alts[0]
            + segment[local_position + len(variant.ref):]
        )

    return segment.upper()

def motifs(sequence, motif, partner):
    '''
    Count matches of a motif, and of its partner motif, in a sequence

    Matching is case-insensitive and uses re.findall, so matches are
    non-overlapping. Returns a dictionary with the keys '.', '+' and '-'.
    Without a partner, the motif count is stored under '.'. With a partner,
    the motif count is stored under '+' and the partner count under '-'.
    '''
    motif_hits = len(re.findall(motif, sequence, flags=re.IGNORECASE))

    if not partner:
        return {'.': motif_hits, '+': 0, '-': 0}

    partner_hits = len(re.findall(partner, sequence, flags=re.IGNORECASE))
    return {'.': 0, '+': motif_hits, '-': partner_hits}

def print_motif_summary(motif_counts):
    '''
    Print the motif counts as a tab-separated table on standard output

    A header line comes first, followed by one line per motif, position and
    strand. Positions are listed in sorted order within each motif.
    '''
    header = ('motif', 'strand', 'position', 'reference', 'variant')
    print("\t".join(header))

    for motif, positions in motif_counts.items():
        for position in sorted(positions):
            for strand, tally in positions[position].items():
                row = (
                    motif,
                    strand,
                    str(position),
                    str(tally['reference']),
                    str(tally['query']),
                )
                print("\t".join(row))

# Command-line entry point: parse the arguments, collect the motifs from
# either a file or a comma-delimited string, and hand everything to main().
if __name__ == '__main__':
    parser = ArgumentParser(description='Motif-Variant Probe: detect motif gain and loss due to mutations')
    # The input is a required positional argument and read_input() only
    # accepts .vcf / .vcf.gz paths, so no stdin default is offered here.
    parser.add_argument('infile',
                        help='vcf or vcf.gz file containing mutations')
    parser.add_argument('-o','--outfile',
                        help='results table (default: stdout)',
                        default='-')
    parser.add_argument('-r','--reference',
                        help='reference sequence in fasta format',
                        required=True)
    # Exactly one source of motifs must be given.
    motif_input = parser.add_mutually_exclusive_group(required=True)
    motif_input.add_argument('-f','--motif-file',
                             help='file containing a list of motifs to check')
    motif_input.add_argument('-m','--motif-list',
                             help='a comma-delimited string of motifs to check')
    parser.add_argument('-t','--sequence-type',
                        help='DNA or amino acid (default: dna)',
                        default='dna',
                        choices=['dna','aa'])
    arguments = parser.parse_args()

    if arguments.motif_file:
        with open(arguments.motif_file) as motif_handle:
            raw_motifs = motif_handle.read().splitlines()
    else:
        raw_motifs = arguments.motif_list.split(',')

    # Drop surrounding whitespace and blank entries (empty lines in the motif
    # file, stray commas) so they are not treated as motifs.
    motif_list = [motif.strip() for motif in raw_motifs if motif.strip()]

    main(infile=arguments.infile,
         outfile=arguments.outfile,
         reference=arguments.reference,
         motif_list=motif_list,
         seq_type=arguments.sequence_type,
    )