103 lines
3.2 KiB
Python
103 lines
3.2 KiB
Python
# Resources:
|
|
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-3/
|
|
|
|
from doctest import OutputChecker
|
|
import subprocess
|
|
from Bio.Blast import NCBIXML, Record
|
|
from Bio.Seq import Seq
|
|
import numpy as np
|
|
|
|
print("""
|
|
-----------------------------------------------------------------
|
|
Demo: Needleman-Wunsch global alignment algorithm
|
|
(implementation from scratch)
|
|
-----------------------------------------------------------------
|
|
""")
|
|
|
|
def nw_align(a: Seq, b: Seq):
|
|
"""
|
|
Globally matches sequences A & B based on a scoring matrix &
|
|
gap penalty system.
|
|
|
|
Textbook section 3.1.1
|
|
"""
|
|
|
|
def score(a, b):
|
|
"""Simple score matrix that only checks equality"""
|
|
if a == b:
|
|
return 1
|
|
elif a != b:
|
|
return -1
|
|
|
|
def gap_penalty(d):
|
|
G = -1 # penalty scaling parameter. must be
|
|
# negative, since it's added to the score
|
|
return G * d
|
|
|
|
# generate a comparison table of size (A + 1) x (B + 1)
|
|
# note: the +1 in length is to allow for gaps to be accounted for
|
|
matrix = np.zeros((len(a) + 1, len(b) + 1)) # shape: (a, b)
|
|
a_seq = ['-']; a_seq.extend([b for b in a])
|
|
b_seq = ['-']; b_seq.extend([b for b in b])
|
|
|
|
# initialize gap row & gap column with calculated penalties
|
|
for i in range(len(matrix)):
|
|
matrix[i][0] = gap_penalty(i)
|
|
for j in range(len(matrix[0])):
|
|
matrix[0][j] = gap_penalty(j)
|
|
|
|
# generate score + gap penalty for remaining cells by running through each sequence
|
|
for i in range(1,len(matrix)):
|
|
for j in range(1, len(matrix[i])):
|
|
matrix[i][j] = max(
|
|
score(a_seq[i], b_seq[j]) + matrix[i-1][j-1],
|
|
matrix[i][j-1] + gap_penalty(1), # adjacent -> gap of 1
|
|
matrix[i-1][j] + gap_penalty(1),
|
|
)
|
|
|
|
return matrix
|
|
|
|
# example from textbook
|
|
print(nw_align('CAGCTAGCG', 'CCATACGA'))
|
|
|
|
# print("""
|
|
# -----------------------------------------------------------------
|
|
# Demo: Smith-Waterman local alignment algorithm
|
|
# (implementation from scratch)
|
|
# -----------------------------------------------------------------
|
|
# """)
|
|
|
|
# def sw_align(a: Seq, b: Seq):
|
|
# """
|
|
# Locally matches sequences A & B
|
|
# """
|
|
|
|
print("""
|
|
-----------------------------------------------------------------
|
|
Task: Run BLAST tasks (./scripts/blast-tasks.sh)
|
|
- Download protein database (swissprot)
|
|
- Extract database
|
|
- Run search task for BRCA1 (tumor suppression protein)
|
|
-----------------------------------------------------------------
|
|
""")
|
|
|
|
subprocess.run(["bash", "./scripts/blast-tasks.sh"])
|
|
|
|
print("""
|
|
-----------------------------------------------------------------
|
|
Demo: Parse BLAST XML output & read matches
|
|
-----------------------------------------------------------------
|
|
""")
|
|
|
|
out_path = './output/brca1_swissprot_out.xml'
|
|
print(f'read {out_path}...\n')
|
|
blast_out: Record = None
|
|
with open(out_path) as file:
|
|
blast_out = NCBIXML.read(file)
|
|
for alignment in blast_out.alignments:
|
|
print('Alignment')
|
|
print('---------')
|
|
print(f'title: {alignment.title}')
|
|
print(f'length: {alignment.length}')
|
|
print(f'high-scoring pair (HSP) matches: {[hsp.match for hsp in alignment.hsps]}\n')
|