# Resources: # https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-3/ from doctest import OutputChecker import subprocess from Bio.Blast import NCBIXML, Record from Bio.Seq import Seq import numpy as np print(""" ----------------------------------------------------------------- Demo: Needleman-Wunsch global alignment algorithm (implementation from scratch) ----------------------------------------------------------------- """) def nw_align(a: Seq, b: Seq): """ Globally matches sequences A & B based on a scoring matrix & gap penalty system. Textbook section 3.1.1 """ def score(a, b): """Simple score matrix that only checks equality""" if a == b: return 1 elif a != b: return -1 def gap_penalty(d): G = -1 # penalty scaling parameter. must be # negative, since it's added to the score return G * d # generate a comparison table of size (A + 1) x (B + 1) # note: the +1 in length is to allow for gaps to be accounted for matrix = np.zeros((len(a) + 1, len(b) + 1)) # shape: (a, b) a_seq = ['-']; a_seq.extend([b for b in a]) b_seq = ['-']; b_seq.extend([b for b in b]) # initialize gap row & gap column with calculated penalties for i in range(len(matrix)): matrix[i][0] = gap_penalty(i) for j in range(len(matrix[0])): matrix[0][j] = gap_penalty(j) # generate score + gap penalty for remaining cells by running through each sequence for i in range(1,len(matrix)): for j in range(1, len(matrix[i])): matrix[i][j] = max( score(a_seq[i], b_seq[j]) + matrix[i-1][j-1], matrix[i][j-1] + gap_penalty(1), # adjacent -> gap of 1 matrix[i-1][j] + gap_penalty(1), ) return matrix # example from textbook print(nw_align('CAGCTAGCG', 'CCATACGA')) # print(""" # ----------------------------------------------------------------- # Demo: Smith-Waterman local alignment algorithm # (implementation from scratch) # ----------------------------------------------------------------- # """) # def sw_align(a: Seq, b: Seq): # """ # Locally matches sequences A & B # """ print(""" ----------------------------------------------------------------- Task: Run BLAST tasks (./scripts/blast-tasks.sh) - Download protein database (swissprot) - Extract database - Run search task for BRCA1 (tumor suppression protein) ----------------------------------------------------------------- """) subprocess.run(["bash", "./scripts/blast-tasks.sh"]) print(""" ----------------------------------------------------------------- Demo: Parse BLAST XML output & read matches ----------------------------------------------------------------- """) out_path = './output/brca1_swissprot_out.xml' print(f'read {out_path}...\n') blast_out: Record = None with open(out_path) as file: blast_out = NCBIXML.read(file) for alignment in blast_out.alignments: print('Alignment') print('---------') print(f'title: {alignment.title}') print(f'length: {alignment.length}') print(f'high-scoring pair (HSP) matches: {[hsp.match for hsp in alignment.hsps]}\n')