Module autostreamtree.sequence
Expand source code
from typing import List
def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False,
as_tuple: bool = False, as_list: bool = False) -> str:
"""
Decode a genotype from a VCF file and return the corresponding DNA sequence
Args:
gt (tuple): The genotype as a tuple of integers representing the
indices of the reference and alternate alleles.
ref (str): The reference allele.
alts (list): A list of alternate alleles.
as_iupac (bool): Whether to return the consensus IUPAC symbol if the
genotype is heterozygous. Default is False.
as_tuple (bool): Whether to return the result as a tuple of two strings
Default is False.
as_list (bool): Whether to return the result as a list of two strings.
Default is False.
Returns:
ret (str): The decoded genotype as a string. By default, this is a
string in the format "ref/alt", but the format can be
customized using the optional arguments.
"""
# initialize return value
ret = [None, None]
# if genotype is missing, set to "N"
if gt[0] is None or gt[1] is None:
ret = ["N", "N"]
else:
# decode each allele
if gt[0] == 0:
ret[0] = ref
else:
ret[0] = alts[gt[0] - 1]
if gt[1] == 0:
ret[1] = ref
else:
ret[1] = alts[gt[1] - 1]
# apply optional formatting
if as_iupac:
return dna_consensus("/".join(ret))
elif as_tuple:
return tuple(ret)
elif as_list:
return ret
else:
return "/".join(ret)
def get_nuc_freqs(seqs: dict, ploidy: int) -> list:
"""
Compute the nucleotide frequencies of a set of DNA sequences.
Args:
seqs (dict): A dictionary of DNA sequences. The keys are sample names
and the values are strings of nucleotides.
ploidy (int): The ploidy of the sequences. If ploidy=1, ambiguities
will be skipped. If ploidy=2, ambiguities will be
resolved.
Returns:
freqs (list): A list of dictionaries, where each dictionary contains
the nucleotide frequencies for a single position in the
sequences.
Raises:
ValueError: If any sequence is empty or contains only invalid
characters, or if sequences are not all the same length.
"""
if not all(seqs):
raise ValueError("One or more sequences are empty.")
sequence_length = len(seqs[list(seqs.keys())[0]])
if not all(len(seq) == sequence_length for seq in seqs.values()):
raise ValueError("Sequences are not all the same length.")
freqs = []
for loc in range(sequence_length):
allnucs = ""
for samp in seqs.keys():
allnucs += dna_consensus(seqs[samp][loc]).lower()
badchars = ["?", "-", "n"]
if ploidy == 1:
badchars += ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
allnucs = ''.join([nuc for nuc in allnucs if nuc not in badchars])
if ploidy == 2:
iupacs = ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"]
for ambig in iupacs:
allnucs = allnucs.replace(
ambig, "".join(get_iupac_caseless(ambig))
)
for nuc in ["a", "c", "t", "g"]:
allnucs = allnucs.replace(nuc, nuc + nuc)
total = len(allnucs)
counts = {"a": 0, "g": 0, "c": 0, "t": 0}
for c in allnucs:
if c in counts:
counts[c] += 1
if total <= 0:
freqs.append({"a": 0, "g": 0, "c": 0, "t": 0})
else:
for nuc in counts.keys():
counts[nuc] = float(counts[nuc] / total)
freqs.append(counts)
return freqs
def dna_consensus(seq: str) -> str:
"""
Make a consensus DNA sequence from alleles separated by a "/" character.
Args:
seq (str): A string of DNA sequences separated by a "/" character.
Returns:
consens (str): A consensus DNA sequence derived from the input
sequences.
"""
if not seq:
return None
if len(seq) <= 0:
return None
alleles = seq.split("/")
consens = ""
if len(alleles) < 1:
return None
elif not all(len(x) == len(alleles[0]) for x in alleles):
raise ValueError(
"Not all alleles are the same length: " + str(alleles)
)
elif len(alleles) == 1:
return alleles[0]
else:
for i in range(len(alleles[0])):
nucs = ""
for a in alleles:
nucs += a[i]
temp = list_to_sort_unique_string(nucs.upper())
consens += reverse_iupac_case(temp)
return consens
def reverse_iupac_case(char: str) -> str:
"""
Translate a string of DNA bases to an IUPAC ambiguity code, retaining case.
Args:
char (str): A string of DNA bases.
Returns:
iupac (str): An IUPAC ambiguity code.
"""
iupac = {
'A': 'A',
'N': 'N',
'-': '-',
'C': 'C',
'G': 'G',
'T': 'T',
'AG': 'R',
'CT': 'Y',
'AC': 'M',
'GT': 'K',
'AT': 'W',
'CG': 'S',
'CGT': 'B',
'AGT': 'D',
'ACT': 'H',
'ACG': 'V',
'ACGT': 'N',
'a': 'a',
'n': 'n',
'c': 'c',
'g': 'g',
't': 't',
'ag': 'r',
'ct': 'y',
'ac': 'm',
'gt': 'k',
'at': 'w',
'cg': 's',
'cgt': 'b',
'agt': 'd',
'act': 'h',
'acg': 'v',
'acgt': 'n'
}
return iupac[char]
def list_to_sort_unique_string(input: list) -> str:
"""
Convert a list of characters to a sorted, unique string.
Args:
l (list): A list of characters.
Returns:
s (str): A string containing the characters from the input list, sorted
and with duplicates removed.
"""
s = ''.join(sorted(set(input)))
return s
def get_iupac_caseless(char: str) -> List[str]:
"""
Split a character to IUPAC codes, assuming diploidy.
Args:
char (str): A character to be split.
Returns:
codes (list): A list of IUPAC codes corresponding to the input
character.
"""
lower = False
if char.islower():
lower = True
char = char.upper()
iupac = {
"A": ["A"],
"G": ["G"],
"C": ["C"],
"T": ["T"],
"N": ["A", "C", "G", "T"],
"?": ["A", "C", "G", "T"],
"-": ["A", "C", "G", "T", "-"],
"R": ["A", "G"],
"Y": ["C", "T"],
"S": ["G", "C"],
"W": ["A", "T"],
"K": ["G", "T"],
"M": ["A", "C"],
"B": ["C", "G", "T"],
"D": ["A", "G", "T"],
"H": ["A", "C", "T"],
"V": ["A", "C", "G"]
}
codes = iupac[char]
if lower:
codes = [c.lower() for c in codes]
return codes
def phase_snp(snp: str) -> str:
"""
Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code
into two alleles.
Args:
snp (str): The IUPAC code for the SNP.
Returns:
phase (str): The SNP alleles separated by a "/" character.
"""
nucs = get_iupac_caseless(snp)
if len(nucs) > 2 or len(nucs) < 1:
return "n/n"
elif len(nucs) == 1:
phase = nucs[0] + '/' + nucs[0]
return phase
else:
return "/".join(nucs)
Functions
def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False, as_tuple: bool = False, as_list: bool = False) -> str
-
Decode a genotype from a VCF file and return the corresponding DNA sequence
Args
gt
:tuple
- The genotype as a tuple of integers representing the indices of the reference and alternate alleles.
ref
:str
- The reference allele.
alts
:list
- A list of alternate alleles.
as_iupac
:bool
- Whether to return the consensus IUPAC symbol if the genotype is heterozygous. Default is False.
as_tuple
:bool
- Whether to return the result as a tuple of two strings Default is False.
as_list
:bool
- Whether to return the result as a list of two strings. Default is False.
Returns
ret (str): The decoded genotype as a string. By default, this is a
- string in the format "ref/alt", but the format can be customized using the optional arguments.
Expand source code
def decode(gt: tuple, ref: str, alts: list, as_iupac: bool = False, as_tuple: bool = False, as_list: bool = False) -> str: """ Decode a genotype from a VCF file and return the corresponding DNA sequence Args: gt (tuple): The genotype as a tuple of integers representing the indices of the reference and alternate alleles. ref (str): The reference allele. alts (list): A list of alternate alleles. as_iupac (bool): Whether to return the consensus IUPAC symbol if the genotype is heterozygous. Default is False. as_tuple (bool): Whether to return the result as a tuple of two strings Default is False. as_list (bool): Whether to return the result as a list of two strings. Default is False. Returns: ret (str): The decoded genotype as a string. By default, this is a string in the format "ref/alt", but the format can be customized using the optional arguments. """ # initialize return value ret = [None, None] # if genotype is missing, set to "N" if gt[0] is None or gt[1] is None: ret = ["N", "N"] else: # decode each allele if gt[0] == 0: ret[0] = ref else: ret[0] = alts[gt[0] - 1] if gt[1] == 0: ret[1] = ref else: ret[1] = alts[gt[1] - 1] # apply optional formatting if as_iupac: return dna_consensus("/".join(ret)) elif as_tuple: return tuple(ret) elif as_list: return ret else: return "/".join(ret)
def dna_consensus(seq: str) -> str
-
Make a consensus DNA sequence from alleles separated by a "/" character.
Args
seq
:str
- A string of DNA sequences separated by a "/" character.
Returns
consens (str): A consensus DNA sequence derived from the input
- sequences.
Expand source code
def dna_consensus(seq: str) -> str: """ Make a consensus DNA sequence from alleles separated by a "/" character. Args: seq (str): A string of DNA sequences separated by a "/" character. Returns: consens (str): A consensus DNA sequence derived from the input sequences. """ if not seq: return None if len(seq) <= 0: return None alleles = seq.split("/") consens = "" if len(alleles) < 1: return None elif not all(len(x) == len(alleles[0]) for x in alleles): raise ValueError( "Not all alleles are the same length: " + str(alleles) ) elif len(alleles) == 1: return alleles[0] else: for i in range(len(alleles[0])): nucs = "" for a in alleles: nucs += a[i] temp = list_to_sort_unique_string(nucs.upper()) consens += reverse_iupac_case(temp) return consens
def get_iupac_caseless(char: str) -> List[str]
-
Split a character to IUPAC codes, assuming diploidy.
Args
char
:str
- A character to be split.
Returns
codes (list): A list
ofIUPAC codes corresponding to the input
- character.
Expand source code
def get_iupac_caseless(char: str) -> List[str]: """ Split a character to IUPAC codes, assuming diploidy. Args: char (str): A character to be split. Returns: codes (list): A list of IUPAC codes corresponding to the input character. """ lower = False if char.islower(): lower = True char = char.upper() iupac = { "A": ["A"], "G": ["G"], "C": ["C"], "T": ["T"], "N": ["A", "C", "G", "T"], "?": ["A", "C", "G", "T"], "-": ["A", "C", "G", "T", "-"], "R": ["A", "G"], "Y": ["C", "T"], "S": ["G", "C"], "W": ["A", "T"], "K": ["G", "T"], "M": ["A", "C"], "B": ["C", "G", "T"], "D": ["A", "G", "T"], "H": ["A", "C", "T"], "V": ["A", "C", "G"] } codes = iupac[char] if lower: codes = [c.lower() for c in codes] return codes
def get_nuc_freqs(seqs: dict, ploidy: int) -> list
-
Compute the nucleotide frequencies of a set of DNA sequences.
Args
seqs
:dict
- A dictionary of DNA sequences. The keys are sample names and the values are strings of nucleotides.
ploidy
:int
- The ploidy of the sequences. If ploidy=1, ambiguities will be skipped. If ploidy=2, ambiguities will be resolved.
Returns
freqs (list): A list
ofdictionaries, where each dictionary contains
- the nucleotide frequencies for a single position in the sequences.
Raises
ValueError
- If any sequence is empty or contains only invalid characters, or if sequences are not all the same length.
Expand source code
def get_nuc_freqs(seqs: dict, ploidy: int) -> list: """ Compute the nucleotide frequencies of a set of DNA sequences. Args: seqs (dict): A dictionary of DNA sequences. The keys are sample names and the values are strings of nucleotides. ploidy (int): The ploidy of the sequences. If ploidy=1, ambiguities will be skipped. If ploidy=2, ambiguities will be resolved. Returns: freqs (list): A list of dictionaries, where each dictionary contains the nucleotide frequencies for a single position in the sequences. Raises: ValueError: If any sequence is empty or contains only invalid characters, or if sequences are not all the same length. """ if not all(seqs): raise ValueError("One or more sequences are empty.") sequence_length = len(seqs[list(seqs.keys())[0]]) if not all(len(seq) == sequence_length for seq in seqs.values()): raise ValueError("Sequences are not all the same length.") freqs = [] for loc in range(sequence_length): allnucs = "" for samp in seqs.keys(): allnucs += dna_consensus(seqs[samp][loc]).lower() badchars = ["?", "-", "n"] if ploidy == 1: badchars += ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"] allnucs = ''.join([nuc for nuc in allnucs if nuc not in badchars]) if ploidy == 2: iupacs = ["r", "y", "s", "w", "k", "m", "b", "d", "h", "v"] for ambig in iupacs: allnucs = allnucs.replace( ambig, "".join(get_iupac_caseless(ambig)) ) for nuc in ["a", "c", "t", "g"]: allnucs = allnucs.replace(nuc, nuc + nuc) total = len(allnucs) counts = {"a": 0, "g": 0, "c": 0, "t": 0} for c in allnucs: if c in counts: counts[c] += 1 if total <= 0: freqs.append({"a": 0, "g": 0, "c": 0, "t": 0}) else: for nuc in counts.keys(): counts[nuc] = float(counts[nuc] / total) freqs.append(counts) return freqs
def list_to_sort_unique_string(input: list) -> str
-
Convert a list of characters to a sorted, unique string.
Args
l
:list
- A list of characters.
Returns
s (str): A string containing the characters from the input list, sorted
- and with duplicates removed.
Expand source code
def list_to_sort_unique_string(input: list) -> str: """ Convert a list of characters to a sorted, unique string. Args: l (list): A list of characters. Returns: s (str): A string containing the characters from the input list, sorted and with duplicates removed. """ s = ''.join(sorted(set(input))) return s
def phase_snp(snp: str) -> str
-
Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code into two alleles.
Args
snp
:str
- The IUPAC code for the SNP.
Returns
phase (str): The SNP alleles separated by a "/" character.
Expand source code
def phase_snp(snp: str) -> str: """ Phase a single nucleotide polymorphism (SNP) by splitting its IUPAC code into two alleles. Args: snp (str): The IUPAC code for the SNP. Returns: phase (str): The SNP alleles separated by a "/" character. """ nucs = get_iupac_caseless(snp) if len(nucs) > 2 or len(nucs) < 1: return "n/n" elif len(nucs) == 1: phase = nucs[0] + '/' + nucs[0] return phase else: return "/".join(nucs)
def reverse_iupac_case(char: str) -> str
-
Translate a string of DNA bases to an IUPAC ambiguity code, retaining case.
Args
char
:str
- A string of DNA bases.
Returns
iupac (str): An IUPAC ambiguity code.
Expand source code
def reverse_iupac_case(char: str) -> str: """ Translate a string of DNA bases to an IUPAC ambiguity code, retaining case. Args: char (str): A string of DNA bases. Returns: iupac (str): An IUPAC ambiguity code. """ iupac = { 'A': 'A', 'N': 'N', '-': '-', 'C': 'C', 'G': 'G', 'T': 'T', 'AG': 'R', 'CT': 'Y', 'AC': 'M', 'GT': 'K', 'AT': 'W', 'CG': 'S', 'CGT': 'B', 'AGT': 'D', 'ACT': 'H', 'ACG': 'V', 'ACGT': 'N', 'a': 'a', 'n': 'n', 'c': 'c', 'g': 'g', 't': 't', 'ag': 'r', 'ct': 'y', 'ac': 'm', 'gt': 'k', 'at': 'w', 'cg': 's', 'cgt': 'b', 'agt': 'd', 'act': 'h', 'acg': 'v', 'acgt': 'n' } return iupac[char]