learn-bio/sequence-motifs.py

# find instances of a promoter consensus sequence in e-coli genome
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/
# https://en.wikipedia.org/wiki/Sequence_motif

import textwrap
from Bio import SeqIO, SeqUtils, motifs
from Bio.Seq import Seq
import math


print("""
-----------------------------------------------------------------
Demo: Calculate sequence complexity
-----------------------------------------------------------------
""")

def sequence_complexity(sequence: Seq) -> float:
    """
    Complexity is defined as:

    1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) )

    Where:
    D = 4 (alphabet size),
    N = Total length of sequence,
    n_X = number of nucleotide X in sequence

    AAAAAAAA -> less complex

    ATCGATCG -> more complex
    """
    n_A = sequence.count("A")
    n_T = sequence.count("T")
    n_C = sequence.count("C")
    n_G = sequence.count("G")
    return 1 / len(sequence) * math.log(
        (
            math.factorial(len(sequence)) / (
                math.factorial(n_A) *
                math.factorial(n_T) *
                math.factorial(n_C) *
                math.factorial(n_G)
            )
        ),
        4 # base
    )

# print(textwrap.dedent(sequence_complexity.__doc__))

print(f'Sequence\tComplexity')
print(f'--------\t----------')
for i in [
    'AAAAAAAA',
    'AAAAAAAAAAAA',
    'ATCGATCG',
    'ATCGATCGATCG',
    'ATATATAT',
    'ACTACTAA',
    'AAAAAATA',
]:
    print(f'{i}\t{sequence_complexity(i)}')


print("""
-----------------------------------------------------------------
Demo: Generate a consensus sequence based on a collection
of variations
-----------------------------------------------------------------
""")

variations = [
    Seq("CAGTT"),
    Seq("CATTT"),
    Seq("CAGTA"),
    Seq("CAGTT"),
    Seq("CAGTA")
]
motif = motifs.create(variations)

print("value counts:\n", motif.counts)

print("consensus: ", motif.degenerate_consensus)

print("""Get more common consensus sequences from JASPAR, e.g.:
https://jaspar.genereg.net/matrix/MA0447.1/""")


print("""
-----------------------------------------------------------------
Demo: find a common promoter motif in the E. Coli Genome
-----------------------------------------------------------------
""")

# Using promoters from:
# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial
promoter_consensus = Seq('AAAAAARNR')

# define genome sequence
genome: Seq = None

# load e-coli genome
with open('./data/e-coli.gb', "r") as file:
    gb_file = SeqIO.parse(file, 'genbank')

    # genome we want is first item in the genbank file
    data = next(gb_file)
    print('Genome Data')
    print('-----------')
    print(f'id: {data.id}')
    print(f'len: {len(data.seq)} bp\n')

    genome = data.seq

# search for promoter seq
result = SeqUtils.nt_search(str(genome), promoter_consensus)
print('Search Results')
print('----------------')
print(f'''original search sequence: {str(promoter_consensus)}
translated sequence: {result[0]}
indices of first 4 instances: {result[1:5]}
contents of first 4 instances:
{[str(genome[result[i]:result[i]+len(promoter_consensus)]) for i in range(1,5)]}
''')