124 lines
3.1 KiB
Python
124 lines
3.1 KiB
Python
|
# find instances of a promoter consensus sequence in e-coli genome
|
||
|
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/
|
||
|
# https://en.wikipedia.org/wiki/Sequence_motif
|
||
|
|
||
|
import textwrap
|
||
|
from Bio import SeqIO, SeqUtils, motifs
|
||
|
from Bio.Seq import Seq
|
||
|
import math
|
||
|
|
||
|
|
||
|
print("""
|
||
|
-----------------------------------------------------------------
|
||
|
Demo: Calculate sequence complexity
|
||
|
-----------------------------------------------------------------
|
||
|
""")
|
||
|
|
||
|
def sequence_complexity(sequence: Seq) -> float:
|
||
|
"""
|
||
|
Complexity is defined as:
|
||
|
|
||
|
1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) )
|
||
|
|
||
|
Where:
|
||
|
D = 4 (alphabet size),
|
||
|
N = Total length of sequence,
|
||
|
n_X = number of nucleotide X in sequence
|
||
|
|
||
|
AAAAAAAA -> less complex
|
||
|
|
||
|
ATCGATCG -> more complex
|
||
|
"""
|
||
|
n_A = sequence.count("A")
|
||
|
n_T = sequence.count("T")
|
||
|
n_C = sequence.count("C")
|
||
|
n_G = sequence.count("G")
|
||
|
return 1 / len(sequence) * math.log(
|
||
|
(
|
||
|
math.factorial(len(sequence)) / (
|
||
|
math.factorial(n_A) *
|
||
|
math.factorial(n_T) *
|
||
|
math.factorial(n_C) *
|
||
|
math.factorial(n_G)
|
||
|
)
|
||
|
),
|
||
|
4 # base
|
||
|
)
|
||
|
|
||
|
# print(textwrap.dedent(sequence_complexity.__doc__))
|
||
|
|
||
|
print(f'Sequence\tComplexity')
|
||
|
print(f'--------\t----------')
|
||
|
for i in [
|
||
|
'AAAAAAAA',
|
||
|
'AAAAAAAAAAAA',
|
||
|
'ATCGATCG',
|
||
|
'ATCGATCGATCG',
|
||
|
'ATATATAT',
|
||
|
'ACTACTAA',
|
||
|
'AAAAAATA',
|
||
|
]:
|
||
|
print(f'{i}\t{sequence_complexity(i)}')
|
||
|
|
||
|
|
||
|
print("""
|
||
|
-----------------------------------------------------------------
|
||
|
Demo: Generate a consensus sequence based on a collection
|
||
|
of variations
|
||
|
-----------------------------------------------------------------
|
||
|
""")
|
||
|
|
||
|
variations = [
|
||
|
Seq("CAGTT"),
|
||
|
Seq("CATTT"),
|
||
|
Seq("CAGTA"),
|
||
|
Seq("CAGTT"),
|
||
|
Seq("CAGTA")
|
||
|
]
|
||
|
motif = motifs.create(variations)
|
||
|
|
||
|
print("value counts:\n", motif.counts)
|
||
|
|
||
|
print("consensus: ", motif.degenerate_consensus)
|
||
|
|
||
|
print("""Get more common consensus sequences from JASPAR, e.g.:
|
||
|
https://jaspar.genereg.net/matrix/MA0447.1/""")
|
||
|
|
||
|
|
||
|
print("""
|
||
|
-----------------------------------------------------------------
|
||
|
Demo: find a common promoter motif in the E. Coli Genome
|
||
|
-----------------------------------------------------------------
|
||
|
""")
|
||
|
|
||
|
# Using promoters from:
|
||
|
# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial
|
||
|
promoter_consensus = Seq('AAAAAARNR')
|
||
|
|
||
|
# define genome sequence
|
||
|
genome: Seq = None
|
||
|
|
||
|
# load e-coli genome
|
||
|
with open('./data/e-coli.gb', "r") as file:
|
||
|
gb_file = SeqIO.parse(file, 'genbank')
|
||
|
|
||
|
# genome we want is first item in the genbank file
|
||
|
data = next(gb_file)
|
||
|
print('Genome Data')
|
||
|
print('-----------')
|
||
|
print(f'id: {data.id}')
|
||
|
print(f'len: {len(data.seq)} bp\n')
|
||
|
|
||
|
genome = data.seq
|
||
|
|
||
|
# search for promoter seq
|
||
|
result = SeqUtils.nt_search(str(genome), promoter_consensus)
|
||
|
print('Search Results')
|
||
|
print('----------------')
|
||
|
print(f'''original search sequence: {str(promoter_consensus)}
|
||
|
translated sequence: {result[0]}
|
||
|
indices of first 4 instances: {result[1:5]}
|
||
|
contents of first 4 instances:
|
||
|
{[str(genome[result[i]:result[i]+len(promoter_consensus)]) for i in range(1,5)]}
|
||
|
''')
|