learn-bio/sequence-motifs.py

124 lines
3.1 KiB
Python

# find instances of a promoter consensus sequence in e-coli genome
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/
# https://en.wikipedia.org/wiki/Sequence_motif
import textwrap
from Bio import SeqIO, SeqUtils, motifs
from Bio.Seq import Seq
import math
print("""
-----------------------------------------------------------------
Demo: Calculate sequence complexity
-----------------------------------------------------------------
""")
def sequence_complexity(sequence: Seq) -> float:
"""
Complexity is defined as:
1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) )
Where:
D = 4 (alphabet size),
N = Total length of sequence,
n_X = number of nucleotide X in sequence
AAAAAAAA -> less complex
ATCGATCG -> more complex
"""
n_A = sequence.count("A")
n_T = sequence.count("T")
n_C = sequence.count("C")
n_G = sequence.count("G")
return 1 / len(sequence) * math.log(
(
math.factorial(len(sequence)) / (
math.factorial(n_A) *
math.factorial(n_T) *
math.factorial(n_C) *
math.factorial(n_G)
)
),
4 # base
)
# print(textwrap.dedent(sequence_complexity.__doc__))
print(f'Sequence\tComplexity')
print(f'--------\t----------')
for i in [
'AAAAAAAA',
'AAAAAAAAAAAA',
'ATCGATCG',
'ATCGATCGATCG',
'ATATATAT',
'ACTACTAA',
'AAAAAATA',
]:
print(f'{i}\t{sequence_complexity(i)}')
print("""
-----------------------------------------------------------------
Demo: Generate a consensus sequence based on a collection
of variations
-----------------------------------------------------------------
""")
variations = [
Seq("CAGTT"),
Seq("CATTT"),
Seq("CAGTA"),
Seq("CAGTT"),
Seq("CAGTA")
]
motif = motifs.create(variations)
print("value counts:\n", motif.counts)
print("consensus: ", motif.degenerate_consensus)
print("""Get more common consensus sequences from JASPAR, e.g.:
https://jaspar.genereg.net/matrix/MA0447.1/""")
print("""
-----------------------------------------------------------------
Demo: find a common promoter motif in the E. Coli Genome
-----------------------------------------------------------------
""")
# Using promoters from:
# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial
promoter_consensus = Seq('AAAAAARNR')
# define genome sequence
genome: Seq = None
# load e-coli genome
with open('./data/e-coli.gb', "r") as file:
gb_file = SeqIO.parse(file, 'genbank')
# genome we want is first item in the genbank file
data = next(gb_file)
print('Genome Data')
print('-----------')
print(f'id: {data.id}')
print(f'len: {len(data.seq)} bp\n')
genome = data.seq
# search for promoter seq
result = SeqUtils.nt_search(str(genome), promoter_consensus)
print('Search Results')
print('----------------')
print(f'''original search sequence: {str(promoter_consensus)}
translated sequence: {result[0]}
indices of first 4 instances: {result[1:5]}
contents of first 4 instances:
{[str(genome[result[i]:result[i]+len(promoter_consensus)]) for i in range(1,5)]}
''')