learn-bio/sequence-alignment.py

# Resources:
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-3/

from doctest import OutputChecker
import subprocess
from Bio.Blast import NCBIXML, Record
from Bio.Seq import Seq
import numpy as np

print("""
-----------------------------------------------------------------
Demo: Needleman-Wunsch global alignment algorithm
(implementation from scratch)
-----------------------------------------------------------------
""")

def nw_align(a: Seq, b: Seq):
    """
    Globally matches sequences A & B based on a scoring matrix &
    gap penalty system.

    Textbook section 3.1.1
    """

    def score(a, b):
        """Simple score matrix that only checks equality"""
        if a == b:
            return 1
        elif a != b:
            return -1

    def gap_penalty(d):
        G = -1  # penalty scaling parameter. must be
                # negative, since it's added to the score
        return G * d

    # generate a comparison table of size (A + 1) x (B + 1)
    # note: the +1 in length is to allow for gaps to be accounted for
    matrix = np.zeros((len(a) + 1, len(b) + 1)) # shape: (a, b)
    a_seq = ['-']; a_seq.extend([b for b in a])
    b_seq = ['-']; b_seq.extend([b for b in b])

    # initialize gap row & gap column with calculated penalties
    for i in range(len(matrix)):
        matrix[i][0] = gap_penalty(i)
    for j in range(len(matrix[0])):
        matrix[0][j] = gap_penalty(j)

    # generate score + gap penalty for remaining cells by running through each sequence
    for i in range(1,len(matrix)):
        for j in range(1, len(matrix[i])):
            matrix[i][j] = max(
                score(a_seq[i], b_seq[j]) + matrix[i-1][j-1],
                matrix[i][j-1] + gap_penalty(1), # adjacent -> gap of 1
                matrix[i-1][j] + gap_penalty(1),
            )

    return matrix

# example from textbook
print(nw_align('CAGCTAGCG', 'CCATACGA'))

# print("""
# -----------------------------------------------------------------
# Demo: Smith-Waterman local alignment algorithm
# (implementation from scratch)
# -----------------------------------------------------------------
# """)

# def sw_align(a: Seq, b: Seq):
#     """
#     Locally matches sequences A & B
#     """

print("""
-----------------------------------------------------------------
Task: Run BLAST tasks (./scripts/blast-tasks.sh)
    - Download protein database (swissprot)
    - Extract database
    - Run search task for BRCA1 (tumor suppression protein)
-----------------------------------------------------------------
""")

subprocess.run(["bash", "./scripts/blast-tasks.sh"])

print("""
-----------------------------------------------------------------
Demo: Parse BLAST XML output & read matches
-----------------------------------------------------------------
""")

out_path = './output/brca1_swissprot_out.xml'
print(f'read {out_path}...\n')
blast_out: Record = None
with open(out_path) as file:
    blast_out = NCBIXML.read(file)
for alignment in blast_out.alignments:
    print('Alignment')
    print('---------')
    print(f'title: {alignment.title}')
    print(f'length: {alignment.length}')
    print(f'high-scoring pair (HSP) matches: {[hsp.match for hsp in alignment.hsps]}\n')