# learn-bio/sequence-motifs.py

# find instances of a promoter consensus sequence in e-coli genome
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/
# https://en.wikipedia.org/wiki/Sequence_motif

import textwrap
from Bio import SeqIO, SeqUtils, motifs
from Bio.Seq import Seq
import math


# Section banner for the first demo; printed as-is to stdout.
print("""
-----------------------------------------------------------------
Demo: Calculate sequence complexity
-----------------------------------------------------------------
""")

def sequence_complexity(sequence: "Seq | str") -> float:
    """
    Return the complexity of a nucleotide sequence.

    Complexity is defined as:

    1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) )

    Where:
    D = 4 (alphabet size),
    N = Total length of sequence,
    n_X = number of nucleotide X in sequence

    AAAAAAAA -> less complex (0.0)

    ATCGATCG -> more complex (approaches 1.0 for long, balanced sequences)

    Args:
        sequence: A ``Seq`` or plain ``str``; only ``len()`` and ``.count()``
            are used. Counting is case-sensitive (upper-case A/T/C/G only);
            any other symbol contributes to N but to none of the n_X.

    Returns:
        The complexity as a float. An empty sequence returns 0.0 instead of
        raising ZeroDivisionError.
    """
    length = len(sequence)
    if length == 0:
        return 0.0

    # The ratio N! / (n_A! * n_T! * n_C! * n_G!) is always a whole number, so
    # divide with exact integer arithmetic. Float division ("/") raises
    # OverflowError once the ratio exceeds the float range (a few hundred
    # bases), whereas math.log accepts arbitrarily large ints.
    arrangements = math.factorial(length)
    for nucleotide in "ATCG":
        arrangements //= math.factorial(sequence.count(nucleotide))

    return 1 / length * math.log(arrangements, 4)  # 4 = alphabet size (base)

# print(textwrap.dedent(sequence_complexity.__doc__))

# Print a two-column, tab-separated table: each example sequence next to
# the complexity score computed for it.
print('Sequence\tComplexity')
print('--------\t----------')
example_sequences = (
    'AAAAAAAA',
    'AAAAAAAAAAAA',
    'ATCGATCG',
    'ATCGATCGATCG',
    'ATATATAT',
    'ACTACTAA',
    'AAAAAATA',
)
for example in example_sequences:
    score = sequence_complexity(example)
    print(f'{example}\t{score}')


print("""
-----------------------------------------------------------------
Demo: Generate a consensus sequence based on a collection 
of variations 
-----------------------------------------------------------------
""")

# Observed variants of one site, all the same length, wrapped as Seq
# objects and handed to motifs.create() to build the motif.
variations = [
    Seq(site)
    for site in ("CAGTT", "CATTT", "CAGTA", "CAGTT", "CAGTA")
]
motif = motifs.create(variations)

# Per-position nucleotide counts across the variants.
print("value counts:\n", motif.counts)

# Consensus of the variants, as reported by the motif object.
print("consensus: ", motif.degenerate_consensus)

print(
    "Get more common consensus sequences from JASPAR, e.g.:\n"
    "https://jaspar.genereg.net/matrix/MA0447.1/"
)


print("""
-----------------------------------------------------------------
Demo: find a common promoter motif in the E. Coli Genome
-----------------------------------------------------------------
""")

# Using promoters from:
# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial
# R and N are IUPAC ambiguity codes (R = A or G, N = any base); nt_search
# expands them into a pattern before searching.
promoter_consensus = Seq('AAAAAARNR')

# define genome sequence
genome: Seq = None

# load e-coli genome
with open('./data/e-coli.gb', "r") as file:
    gb_file = SeqIO.parse(file, 'genbank')

    # genome we want is first item in the genbank file
    data = next(gb_file)
    print('Genome Data')
    print('-----------')
    print(f'id: {data.id}')
    print(f'len: {len(data.seq)} bp\n')

    genome = data.seq

# search for promoter seq
# nt_search works on plain strings, so both arguments are converted. The
# returned list holds the expanded search pattern at index 0, followed by
# the start index of every match.
result = SeqUtils.nt_search(str(genome), str(promoter_consensus))
translated_pattern = result[0]

# Slice rather than index by position: a genome with fewer than four matches
# then yields a shorter list instead of raising IndexError.
first_hits = result[1:5]
first_hit_contents = [
    str(genome[start:start + len(promoter_consensus)])
    for start in first_hits
]

print('Search Results')
print('----------------')
print(f'''original search sequence: {str(promoter_consensus)}
translated sequence: {translated_pattern}
indices of first 4 instances: {first_hits}
contents of first 4 instances:
{first_hit_contents}
''')
move from general learn repo 2022-10-15 14:06:23 -07:00			`# find instances of a promoter consensus sequence in e-coli genome`
			`# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/`
			`# https://en.wikipedia.org/wiki/Sequence_motif`

			`import textwrap`
			`from Bio import SeqIO, SeqUtils, motifs`
			`from Bio.Seq import Seq`
			`import math`


			`print("""`
			`-----------------------------------------------------------------`
			`Demo: Calculate sequence complexity`
			`-----------------------------------------------------------------`
			`""")`

			`def sequence_complexity(sequence: Seq) -> float:`
			`"""`
			`Complexity is defined as:`

			`1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) )`

			`Where:`
			`D = 4 (alphabet size),`
			`N = Total length of sequence,`
			`n_X = number of nucleotide X in sequence`

			`AAAAAAAA -> less complex`

			`ATCGATCG -> more complex`
			`"""`
			`n_A = sequence.count("A")`
			`n_T = sequence.count("T")`
			`n_C = sequence.count("C")`
			`n_G = sequence.count("G")`
			`return 1 / len(sequence) * math.log(`
			`(`
			`math.factorial(len(sequence)) / (`
			`math.factorial(n_A) *`
			`math.factorial(n_T) *`
			`math.factorial(n_C) *`
			`math.factorial(n_G)`
			`)`
			`),`
			`4 # base`
			`)`

			`# print(textwrap.dedent(sequence_complexity.__doc__))`

			`print(f'Sequence\tComplexity')`
			`print(f'--------\t----------')`
			`for i in [`
			`'AAAAAAAA',`
			`'AAAAAAAAAAAA',`
			`'ATCGATCG',`
			`'ATCGATCGATCG',`
			`'ATATATAT',`
			`'ACTACTAA',`
			`'AAAAAATA',`
			`]:`
			`print(f'{i}\t{sequence_complexity(i)}')`


			`print("""`
			`-----------------------------------------------------------------`
			`Demo: Generate a consensus sequence based on a collection`
			`of variations`
			`-----------------------------------------------------------------`
			`""")`

			`variations = [`
			`Seq("CAGTT"),`
			`Seq("CATTT"),`
			`Seq("CAGTA"),`
			`Seq("CAGTT"),`
			`Seq("CAGTA")`
			`]`
			`motif = motifs.create(variations)`

			`print("value counts:\n", motif.counts)`

			`print("consensus: ", motif.degenerate_consensus)`

			`print("""Get more common consensus sequences from JASPAR, e.g.:`
			`https://jaspar.genereg.net/matrix/MA0447.1/""")`


			`print("""`
			`-----------------------------------------------------------------`
			`Demo: find a common promoter motif in the E. Coli Genome`
			`-----------------------------------------------------------------`
			`""")`

			`# Using promoters from:`
			`# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial`
			`promoter_consensus = Seq('AAAAAARNR')`

			`# define genome sequence`
			`genome: Seq = None`

			`# load e-coli genome`
			`with open('./data/e-coli.gb', "r") as file:`
			`gb_file = SeqIO.parse(file, 'genbank')`

			`# genome we want is first item in the genbank file`
			`data = next(gb_file)`
			`print('Genome Data')`
			`print('-----------')`
			`print(f'id: {data.id}')`
			`print(f'len: {len(data.seq)} bp\n')`

			`genome = data.seq`

			`# search for promoter seq`
			`result = SeqUtils.nt_search(str(genome), promoter_consensus)`
			`print('Search Results')`
			`print('----------------')`
			`print(f'''original search sequence: {str(promoter_consensus)}`
			`translated sequence: {result[0]}`
			`indices of first 4 instances: {result[1:5]}`
			`contents of first 4 instances:`
			`{[str(genome[result[i]:result[i]+len(promoter_consensus)]) for i in range(1,5)]}`
			`''')`