Module autostreamtree.sequence

Expand source code
from typing import List


def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False,
           as_tuple: bool = False, as_list: bool = False) -> str:
    """
    Decode a genotype from a VCF file and return the corresponding DNA sequence

    Args:
        gt (tuple): The genotype as a tuple of integers representing the
                    indices of the reference and alternate alleles.
        ref (str): The reference allele.
        alts (list): A list of alternate alleles.
        as_iupac (bool): Whether to return the consensus IUPAC symbol if the
                         genotype is heterozygous. Default is False.
        as_tuple (bool): Whether to return the result as a tuple of two strings
                         Default is False.
        as_list (bool): Whether to return the result as a list of two strings.
                        Default is False.

    Returns:
        ret (str): The decoded genotype as a string. By default, this is a
                   string in the format "ref/alt", but the format can be
                   customized using the optional arguments.
    """
    # initialize return value
    ret = [None, None]

    # if genotype is missing, set to "N"
    if gt[0] is None or gt[1] is None:
        ret = ["N", "N"]
    else:
        # decode each allele
        if gt[0] == 0:
            ret[0] = ref
        else:
            ret[0] = alts[gt[0] - 1]
        if gt[1] == 0:
            ret[1] = ref
        else:
            ret[1] = alts[gt[1] - 1]

    # apply optional formatting
    if as_iupac:
        return dna_consensus("/".join(ret))
    elif as_tuple:
        return tuple(ret)
    elif as_list:
        return ret
    else:
        return "/".join(ret)


def get_nuc_freqs(seqs: dict, ploidy: int) -> list:
    """
    Compute the nucleotide frequencies of a set of DNA sequences.

    Args:
        seqs (dict): A dictionary of DNA sequences. The keys are sample names
                     and the values are strings of nucleotides.
        ploidy (int): The ploidy of the sequences. If ploidy=1, ambiguities
                      will be skipped. If ploidy=2, ambiguities will be
                      resolved.

    Returns:
        freqs (list): A list of dictionaries, where each dictionary contains
                      the nucleotide frequencies for a single position in the
                      sequences.

    Raises:
        ValueError: If any sequence is empty or contains only invalid
                    characters, or if sequences are not all the same length.
    """
    if not all(seqs):
        raise ValueError("One or more sequences are empty.")

    sequence_length = len(seqs[list(seqs.keys())[0]])
    if not all(len(seq) == sequence_length for seq in seqs.values()):
        raise ValueError("Sequences are not all the same length.")

    freqs = []
    for loc in range(sequence_length):
        allnucs = ""
        for samp in seqs.keys():
            allnucs += dna_consensus(seqs[samp][loc]).lower()

        badchars = ["?", "-", "n"]
        if ploidy == 1:
            badchars += ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
        allnucs = ''.join([nuc for nuc in allnucs if nuc not in badchars])

        if ploidy == 2:
            iupacs = ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
            for ambig in iupacs:
                allnucs = allnucs.replace(
                    ambig, "".join(get_iupac_caseless(ambig))
                )
            for nuc in ["a", "c", "t", "g"]:
                allnucs = allnucs.replace(nuc, nuc + nuc)

        total = len(allnucs)
        counts = {"a": 0, "g": 0, "c": 0, "t": 0}
        for c in allnucs:
            if c in counts:
                counts[c] += 1
        if total <= 0:
            freqs.append({"a": 0, "g": 0, "c": 0, "t": 0})
        else:
            for nuc in counts.keys():
                counts[nuc] = float(counts[nuc] / total)
            freqs.append(counts)
    return freqs


def dna_consensus(seq: str) -> str:
    """
    Make a consensus DNA sequence from alleles separated by a "/" character.

    Args:
        seq (str): A string of DNA sequences separated by a "/" character.

    Returns:
        consens (str): A consensus DNA sequence derived from the input
                       sequences.
    """
    if not seq:
        return None
    if len(seq) <= 0:
        return None
    alleles = seq.split("/")
    consens = ""
    if len(alleles) < 1:
        return None
    elif not all(len(x) == len(alleles[0]) for x in alleles):
        raise ValueError(
            "Not all alleles are the same length: " + str(alleles)
        )
    elif len(alleles) == 1:
        return alleles[0]
    else:
        for i in range(len(alleles[0])):
            nucs = ""
            for a in alleles:
                nucs += a[i]
            temp = list_to_sort_unique_string(nucs.upper())
            consens += reverse_iupac_case(temp)
    return consens


def reverse_iupac_case(char: str) -> str:
    """
    Translate a string of DNA bases to an IUPAC ambiguity code, retaining case.

    Args:
        char (str): A string of DNA bases.

    Returns:
        iupac (str): An IUPAC ambiguity code.
    """
    iupac = {
        'A': 'A',
        'N': 'N',
        '-': '-',
        'C': 'C',
        'G': 'G',
        'T': 'T',
        'AG': 'R',
        'CT': 'Y',
        'AC': 'M',
        'GT': 'K',
        'AT': 'W',
        'CG': 'S',
        'CGT': 'B',
        'AGT': 'D',
        'ACT': 'H',
        'ACG': 'V',
        'ACGT': 'N',
        'a': 'a',
        'n': 'n',
        'c': 'c',
        'g': 'g',
        't': 't',
        'ag': 'r',
        'ct': 'y',
        'ac': 'm',
        'gt': 'k',
        'at': 'w',
        'cg': 's',
        'cgt': 'b',
        'agt': 'd',
        'act': 'h',
        'acg': 'v',
        'acgt': 'n'
    }
    return iupac[char]


def list_to_sort_unique_string(input: list) -> str:
    """
    Convert a list of characters to a sorted, unique string.

    Args:
        l (list): A list of characters.

    Returns:
        s (str): A string containing the characters from the input list, sorted
                 and with duplicates removed.
    """
    s = ''.join(sorted(set(input)))
    return s


def get_iupac_caseless(char: str) -> List[str]:
    """
    Split a character to IUPAC codes, assuming diploidy.

    Args:
        char (str): A character to be split.

    Returns:
        codes (list): A list of IUPAC codes corresponding to the input
                      character.
    """
    lower = False
    if char.islower():
        lower = True
        char = char.upper()
    iupac = {
        "A": ["A"],
        "G": ["G"],
        "C": ["C"],
        "T": ["T"],
        "N": ["A", "C", "G", "T"],
        "?": ["A", "C", "G", "T"],
        "-": ["A", "C", "G", "T", "-"],
        "R": ["A", "G"],
        "Y": ["C", "T"],
        "S": ["G", "C"],
        "W": ["A", "T"],
        "K": ["G", "T"],
        "M": ["A", "C"],
        "B": ["C", "G", "T"],
        "D": ["A", "G", "T"],
        "H": ["A", "C", "T"],
        "V": ["A", "C", "G"]
    }
    codes = iupac[char]
    if lower:
        codes = [c.lower() for c in codes]
    return codes


def phase_snp(snp: str) -> str:
    """
    Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code
    into two alleles.

    Args:
        snp (str): The IUPAC code for the SNP.

    Returns:
        phase (str): The SNP alleles separated by a "/" character.
    """
    nucs = get_iupac_caseless(snp)
    if len(nucs) > 2 or len(nucs) < 1:
        return "n/n"
    elif len(nucs) == 1:
        phase = nucs[0] + '/' + nucs[0]
        return phase
    else:
        return "/".join(nucs)

Functions

def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False, as_tuple: bool = False, as_list: bool = False) -> str

Decode a genotype from a VCF file and return the corresponding DNA sequence

Args

gt : tuple
The genotype as a tuple of integers representing the indices of the reference and alternate alleles.
ref : str
The reference allele.
alts : list
A list of alternate alleles.
as_iupac : bool
Whether to return the consensus IUPAC symbol if the genotype is heterozygous. Default is False.
as_tuple : bool
Whether to return the result as a tuple of two strings Default is False.
as_list : bool
Whether to return the result as a list of two strings. Default is False.

Returns

ret (str): The decoded genotype as a string. By default, this is a
string in the format "ref/alt", but the format can be customized using the optional arguments.
Expand source code
def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False,
           as_tuple: bool = False, as_list: bool = False) -> str:
    """
    Decode a genotype from a VCF file and return the corresponding DNA sequence

    Args:
        gt (tuple): The genotype as a tuple of integers representing the
                    indices of the reference and alternate alleles.
        ref (str): The reference allele.
        alts (list): A list of alternate alleles.
        as_iupac (bool): Whether to return the consensus IUPAC symbol if the
                         genotype is heterozygous. Default is False.
        as_tuple (bool): Whether to return the result as a tuple of two strings
                         Default is False.
        as_list (bool): Whether to return the result as a list of two strings.
                        Default is False.

    Returns:
        ret (str): The decoded genotype as a string. By default, this is a
                   string in the format "ref/alt", but the format can be
                   customized using the optional arguments.
    """
    # initialize return value
    ret = [None, None]

    # if genotype is missing, set to "N"
    if gt[0] is None or gt[1] is None:
        ret = ["N", "N"]
    else:
        # decode each allele
        if gt[0] == 0:
            ret[0] = ref
        else:
            ret[0] = alts[gt[0] - 1]
        if gt[1] == 0:
            ret[1] = ref
        else:
            ret[1] = alts[gt[1] - 1]

    # apply optional formatting
    if as_iupac:
        return dna_consensus("/".join(ret))
    elif as_tuple:
        return tuple(ret)
    elif as_list:
        return ret
    else:
        return "/".join(ret)
def dna_consensus(seq: str) -> str

Make a consensus DNA sequence from alleles separated by a "/" character.

Args

seq : str
A string of DNA sequences separated by a "/" character.

Returns

consens (str): A consensus DNA sequence derived from the input
sequences.
Expand source code
def dna_consensus(seq: str) -> str:
    """
    Make a consensus DNA sequence from alleles separated by a "/" character.

    Args:
        seq (str): A string of DNA sequences separated by a "/" character.

    Returns:
        consens (str): A consensus DNA sequence derived from the input
                       sequences.
    """
    if not seq:
        return None
    if len(seq) <= 0:
        return None
    alleles = seq.split("/")
    consens = ""
    if len(alleles) < 1:
        return None
    elif not all(len(x) == len(alleles[0]) for x in alleles):
        raise ValueError(
            "Not all alleles are the same length: " + str(alleles)
        )
    elif len(alleles) == 1:
        return alleles[0]
    else:
        for i in range(len(alleles[0])):
            nucs = ""
            for a in alleles:
                nucs += a[i]
            temp = list_to_sort_unique_string(nucs.upper())
            consens += reverse_iupac_case(temp)
    return consens
def get_iupac_caseless(char: str) -> List[str]

Split a character to IUPAC codes, assuming diploidy.

Args

char : str
A character to be split.

Returns

codes (list): A list of IUPAC codes corresponding to the input
character.
Expand source code
def get_iupac_caseless(char: str) -> List[str]:
    """
    Split a character to IUPAC codes, assuming diploidy.

    Args:
        char (str): A character to be split.

    Returns:
        codes (list): A list of IUPAC codes corresponding to the input
                      character.
    """
    lower = False
    if char.islower():
        lower = True
        char = char.upper()
    iupac = {
        "A": ["A"],
        "G": ["G"],
        "C": ["C"],
        "T": ["T"],
        "N": ["A", "C", "G", "T"],
        "?": ["A", "C", "G", "T"],
        "-": ["A", "C", "G", "T", "-"],
        "R": ["A", "G"],
        "Y": ["C", "T"],
        "S": ["G", "C"],
        "W": ["A", "T"],
        "K": ["G", "T"],
        "M": ["A", "C"],
        "B": ["C", "G", "T"],
        "D": ["A", "G", "T"],
        "H": ["A", "C", "T"],
        "V": ["A", "C", "G"]
    }
    codes = iupac[char]
    if lower:
        codes = [c.lower() for c in codes]
    return codes
def get_nuc_freqs(seqs: dict, ploidy: int) -> list

Compute the nucleotide frequencies of a set of DNA sequences.

Args

seqs : dict
A dictionary of DNA sequences. The keys are sample names and the values are strings of nucleotides.
ploidy : int
The ploidy of the sequences. If ploidy=1, ambiguities will be skipped. If ploidy=2, ambiguities will be resolved.

Returns

freqs (list): A list of dictionaries, where each dictionary contains
the nucleotide frequencies for a single position in the sequences.

Raises

ValueError
If any sequence is empty or contains only invalid characters, or if sequences are not all the same length.
Expand source code
def get_nuc_freqs(seqs: dict, ploidy: int) -> list:
    """
    Compute the nucleotide frequencies of a set of DNA sequences.

    Args:
        seqs (dict): A dictionary of DNA sequences. The keys are sample names
                     and the values are strings of nucleotides.
        ploidy (int): The ploidy of the sequences. If ploidy=1, ambiguities
                      will be skipped. If ploidy=2, ambiguities will be
                      resolved.

    Returns:
        freqs (list): A list of dictionaries, where each dictionary contains
                      the nucleotide frequencies for a single position in the
                      sequences.

    Raises:
        ValueError: If any sequence is empty or contains only invalid
                    characters, or if sequences are not all the same length.
    """
    if not all(seqs):
        raise ValueError("One or more sequences are empty.")

    sequence_length = len(seqs[list(seqs.keys())[0]])
    if not all(len(seq) == sequence_length for seq in seqs.values()):
        raise ValueError("Sequences are not all the same length.")

    freqs = []
    for loc in range(sequence_length):
        allnucs = ""
        for samp in seqs.keys():
            allnucs += dna_consensus(seqs[samp][loc]).lower()

        badchars = ["?", "-", "n"]
        if ploidy == 1:
            badchars += ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
        allnucs = ''.join([nuc for nuc in allnucs if nuc not in badchars])

        if ploidy == 2:
            iupacs = ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
            for ambig in iupacs:
                allnucs = allnucs.replace(
                    ambig, "".join(get_iupac_caseless(ambig))
                )
            for nuc in ["a", "c", "t", "g"]:
                allnucs = allnucs.replace(nuc, nuc + nuc)

        total = len(allnucs)
        counts = {"a": 0, "g": 0, "c": 0, "t": 0}
        for c in allnucs:
            if c in counts:
                counts[c] += 1
        if total <= 0:
            freqs.append({"a": 0, "g": 0, "c": 0, "t": 0})
        else:
            for nuc in counts.keys():
                counts[nuc] = float(counts[nuc] / total)
            freqs.append(counts)
    return freqs
def list_to_sort_unique_string(input: list) -> str

Convert a list of characters to a sorted, unique string.

Args

l : list
A list of characters.

Returns

s (str): A string containing the characters from the input list, sorted
and with duplicates removed.
Expand source code
def list_to_sort_unique_string(input: list) -> str:
    """
    Convert a list of characters to a sorted, unique string.

    Args:
        l (list): A list of characters.

    Returns:
        s (str): A string containing the characters from the input list, sorted
                 and with duplicates removed.
    """
    s = ''.join(sorted(set(input)))
    return s
def phase_snp(snp: str) -> str

Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code into two alleles.

Args

snp : str
The IUPAC code for the SNP.

Returns

phase (str): The SNP alleles separated by a "/" character.

Expand source code
def phase_snp(snp: str) -> str:
    """
    Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code
    into two alleles.

    Args:
        snp (str): The IUPAC code for the SNP.

    Returns:
        phase (str): The SNP alleles separated by a "/" character.
    """
    nucs = get_iupac_caseless(snp)
    if len(nucs) > 2 or len(nucs) < 1:
        return "n/n"
    elif len(nucs) == 1:
        phase = nucs[0] + '/' + nucs[0]
        return phase
    else:
        return "/".join(nucs)
def reverse_iupac_case(char: str) -> str

Translate a string of DNA bases to an IUPAC ambiguity code, retaining case.

Args

char : str
A string of DNA bases.

Returns

iupac (str): An IUPAC ambiguity code.

Expand source code
def reverse_iupac_case(char: str) -> str:
    """
    Translate a string of DNA bases to an IUPAC ambiguity code, retaining case.

    Args:
        char (str): A string of DNA bases.

    Returns:
        iupac (str): An IUPAC ambiguity code.
    """
    iupac = {
        'A': 'A',
        'N': 'N',
        '-': '-',
        'C': 'C',
        'G': 'G',
        'T': 'T',
        'AG': 'R',
        'CT': 'Y',
        'AC': 'M',
        'GT': 'K',
        'AT': 'W',
        'CG': 'S',
        'CGT': 'B',
        'AGT': 'D',
        'ACT': 'H',
        'ACG': 'V',
        'ACGT': 'N',
        'a': 'a',
        'n': 'n',
        'c': 'c',
        'g': 'g',
        't': 't',
        'ag': 'r',
        'ct': 'y',
        'ac': 'm',
        'gt': 'k',
        'at': 'w',
        'cg': 's',
        'cgt': 'b',
        'agt': 'd',
        'act': 'h',
        'acg': 'v',
        'acgt': 'n'
    }
    return iupac[char]