learn-bio/sequence-alignment.py

103 lines
3.2 KiB
Python
Raw Normal View History

2022-10-15 14:06:23 -07:00
# Resources:
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-3/
from doctest import OutputChecker
import subprocess
from Bio.Blast import NCBIXML, Record
from Bio.Seq import Seq
import numpy as np
print("""
-----------------------------------------------------------------
Demo: Needleman-Wunsch global alignment algorithm
(implementation from scratch)
-----------------------------------------------------------------
""")
def nw_align(a: Seq, b: Seq):
"""
Globally matches sequences A & B based on a scoring matrix &
gap penalty system.
Textbook section 3.1.1
"""
def score(a, b):
"""Simple score matrix that only checks equality"""
if a == b:
return 1
elif a != b:
return -1
def gap_penalty(d):
G = -1 # penalty scaling parameter. must be
# negative, since it's added to the score
return G * d
# generate a comparison table of size (A + 1) x (B + 1)
# note: the +1 in length is to allow for gaps to be accounted for
matrix = np.zeros((len(a) + 1, len(b) + 1)) # shape: (a, b)
a_seq = ['-']; a_seq.extend([b for b in a])
b_seq = ['-']; b_seq.extend([b for b in b])
# initialize gap row & gap column with calculated penalties
for i in range(len(matrix)):
matrix[i][0] = gap_penalty(i)
for j in range(len(matrix[0])):
matrix[0][j] = gap_penalty(j)
# generate score + gap penalty for remaining cells by running through each sequence
for i in range(1,len(matrix)):
for j in range(1, len(matrix[i])):
matrix[i][j] = max(
score(a_seq[i], b_seq[j]) + matrix[i-1][j-1],
matrix[i][j-1] + gap_penalty(1), # adjacent -> gap of 1
matrix[i-1][j] + gap_penalty(1),
)
return matrix
# example from textbook
print(nw_align('CAGCTAGCG', 'CCATACGA'))
# print("""
# -----------------------------------------------------------------
# Demo: Smith-Waterman local alignment algorithm
# (implementation from scratch)
# -----------------------------------------------------------------
# """)
# def sw_align(a: Seq, b: Seq):
# """
# Locally matches sequences A & B
# """
print("""
-----------------------------------------------------------------
Task: Run BLAST tasks (./scripts/blast-tasks.sh)
- Download protein database (swissprot)
- Extract database
- Run search task for BRCA1 (tumor suppression protein)
-----------------------------------------------------------------
""")
subprocess.run(["bash", "./scripts/blast-tasks.sh"])
print("""
-----------------------------------------------------------------
Demo: Parse BLAST XML output & read matches
-----------------------------------------------------------------
""")
out_path = './output/brca1_swissprot_out.xml'
print(f'read {out_path}...\n')
blast_out: Record = None
with open(out_path) as file:
blast_out = NCBIXML.read(file)
for alignment in blast_out.alignments:
print('Alignment')
print('---------')
print(f'title: {alignment.title}')
print(f'length: {alignment.length}')
print(f'high-scoring pair (HSP) matches: {[hsp.match for hsp in alignment.hsps]}\n')