# find instances of a promoter consensus sequence in e-coli genome # https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/ # https://en.wikipedia.org/wiki/Sequence_motif import textwrap from Bio import SeqIO, SeqUtils, motifs from Bio.Seq import Seq import math print(""" ----------------------------------------------------------------- Demo: Calculate sequence complexity ----------------------------------------------------------------- """) def sequence_complexity(sequence: Seq) -> float: """ Complexity is defined as: 1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G! ) ) Where: D = 4 (alphabet size), N = Total length of sequence, n_X = number of nucleotide X in sequence AAAAAAAA -> less complex ATCGATCG -> more complex """ n_A = sequence.count("A") n_T = sequence.count("T") n_C = sequence.count("C") n_G = sequence.count("G") return 1 / len(sequence) * math.log( ( math.factorial(len(sequence)) / ( math.factorial(n_A) * math.factorial(n_T) * math.factorial(n_C) * math.factorial(n_G) ) ), 4 # base ) # print(textwrap.dedent(sequence_complexity.__doc__)) print(f'Sequence\tComplexity') print(f'--------\t----------') for i in [ 'AAAAAAAA', 'AAAAAAAAAAAA', 'ATCGATCG', 'ATCGATCGATCG', 'ATATATAT', 'ACTACTAA', 'AAAAAATA', ]: print(f'{i}\t{sequence_complexity(i)}') print(""" ----------------------------------------------------------------- Demo: Generate a consensus sequence based on a collection of variations ----------------------------------------------------------------- """) variations = [ Seq("CAGTT"), Seq("CATTT"), Seq("CAGTA"), Seq("CAGTT"), Seq("CAGTA") ] motif = motifs.create(variations) print("value counts:\n", motif.counts) print("consensus: ", motif.degenerate_consensus) print("""Get more common consensus sequences from JASPAR, e.g.: https://jaspar.genereg.net/matrix/MA0447.1/""") print(""" ----------------------------------------------------------------- Demo: find a common promoter motif in the E. Coli Genome ----------------------------------------------------------------- """) # Using promoters from: # https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial promoter_consensus = Seq('AAAAAARNR') # define genome sequence genome: Seq = None # load e-coli genome with open('./data/e-coli.gb', "r") as file: gb_file = SeqIO.parse(file, 'genbank') # genome we want is first item in the genbank file data = next(gb_file) print('Genome Data') print('-----------') print(f'id: {data.id}') print(f'len: {len(data.seq)} bp\n') genome = data.seq # search for promoter seq result = SeqUtils.nt_search(str(genome), promoter_consensus) print('Search Results') print('----------------') print(f'''original search sequence: {str(promoter_consensus)} translated sequence: {result[0]} indices of first 4 instances: {result[1:5]} contents of first 4 instances: {[str(genome[result[i]:result[i]+len(promoter_consensus)]) for i in range(1,5)]} ''')