move from general learn repo
commit
f5a65867e3
|
@ -0,0 +1,181 @@
|
||||||
|
# Command output
|
||||||
|
output/*
|
||||||
|
!output/README.md
|
||||||
|
|
||||||
|
# Data files that are too big
|
||||||
|
data/swissprot.fasta
|
||||||
|
|
||||||
|
# BLAST database files (generated by makeblastdb)
|
||||||
|
data/*.pdb
|
||||||
|
data/*.phr
|
||||||
|
data/*.pin
|
||||||
|
data/*.pjs
|
||||||
|
data/*.psq
|
||||||
|
data/*.ptf
|
||||||
|
data/*.pto
|
||||||
|
|
||||||
|
# Any archives that didn't get deleted
|
||||||
|
*.gz
|
||||||
|
*.zip
|
||||||
|
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
|
@ -0,0 +1,13 @@
|
||||||
|
[[source]]
|
||||||
|
url = "https://pypi.org/simple"
|
||||||
|
verify_ssl = true
|
||||||
|
name = "pypi"
|
||||||
|
|
||||||
|
[packages]
|
||||||
|
biopython = "*"
|
||||||
|
numpy = "*"
|
||||||
|
|
||||||
|
[dev-packages]
|
||||||
|
|
||||||
|
[requires]
|
||||||
|
python_version = "3.10"
|
|
@ -0,0 +1,96 @@
|
||||||
|
{
|
||||||
|
"_meta": {
|
||||||
|
"hash": {
|
||||||
|
"sha256": "70d6aad776fce292fb028e0a430745eb654e21c2d780efc912e29ec1bf9b9c39"
|
||||||
|
},
|
||||||
|
"pipfile-spec": 6,
|
||||||
|
"requires": {
|
||||||
|
"python_version": "3.10"
|
||||||
|
},
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"name": "pypi",
|
||||||
|
"url": "https://pypi.org/simple",
|
||||||
|
"verify_ssl": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"biopython": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:03ee5c72b3cc3f0675a8c22ce1c45fe99a32a60db18df059df479ae6cf619708",
|
||||||
|
"sha256:131093d8a0b8075b692fe73d9a4684d4fc98ff5990f6dce1e1b9f929c58207f1",
|
||||||
|
"sha256:155c5b95857bca7ebd607210cb9d8ea459bb0b86b3ca37ea44ec47c26ede7e9a",
|
||||||
|
"sha256:1af4348c17e43f3c79a16af87424d8e3a32e2168ab9246106a085bbb2b8d3450",
|
||||||
|
"sha256:26c8b935a08efc044bbdd8882b86800c6bd7aa2a22832cee9470aba708cd23b1",
|
||||||
|
"sha256:2dbb4388c75b5dfca8ce729e791f465c9c878dbd7ba2ab9a1f9854609d2b5426",
|
||||||
|
"sha256:365569543ea58dd07ef205ec351c23b6c1a3200d5d321eb28ceaecd55eb5955e",
|
||||||
|
"sha256:3786ed9304f2de9f27a9eaa7d19b6b167eff2be0d15c99000a99785308b7dabe",
|
||||||
|
"sha256:465429ca6fc1a98d25cc7a15708f1d238caa3ada66c3cd47d27405c816c80808",
|
||||||
|
"sha256:4b3d4eec2e348c3d97a7fde80ee0f2b8ebeed849d2bd64a616833a9be03b93c8",
|
||||||
|
"sha256:4be31815226052d86d4c2f6a103c40504e34bba3e25cc1b1d687a3203c42fb6e",
|
||||||
|
"sha256:51eb467a60c38820ad1e6c3a7d4cb10535606f559646e824cc65c96091d91ff7",
|
||||||
|
"sha256:5ae69c5e09769390643aa0f8064517665df6fb99c37433821d6664584d0ecb8c",
|
||||||
|
"sha256:72a1477cf1701964c7224e506a54fd65d1cc5228da200b634a17992230aa1cbd",
|
||||||
|
"sha256:76988ed3d7383d566db1d7fc69c9cf136c6275813fb749fc6753c340f81f1a8f",
|
||||||
|
"sha256:83bfea8a19f9352c47b13965c4b73853e7aeef3c5aed8489895b0679e32c621b",
|
||||||
|
"sha256:884a2b99ac7820cb84f70089769a512e3238ee60438b8c934ed519613dc570ce",
|
||||||
|
"sha256:8f33dafd3c7254fff5e1684b965e45a7c08d9b8e1bf51562b0a521ff9a6f5ea0",
|
||||||
|
"sha256:947b793e804c59ea45ae46945a57612ad1789ca87af4af0d6a62dcecf3a6246a",
|
||||||
|
"sha256:9580978803b582e0612b71673cab289e6bf261a865009cfb9501d65bc726a76e",
|
||||||
|
"sha256:98deacc30b8654cfcdcf707d93fa4e3c8717bbda07c3f9f828cf84753d4a1e4d",
|
||||||
|
"sha256:9eadfd4300f534cd4fa39613eeee786d2c3d6b981d373c5c46616fa1a97cad10",
|
||||||
|
"sha256:aa23a83a220486af6193760d079b36543fe00afcfbd18280ca2fd0b2c1c8dd6d",
|
||||||
|
"sha256:ab93d5749b375be3682866b3a606aa2ebd3e6d868079793925bf4fbb0987cf1f",
|
||||||
|
"sha256:b385ab3eb8921bdf952213bb94c52662696905e5e5b8b81b024156eec3249012",
|
||||||
|
"sha256:b3ab26f26a1956ef26303386510d84e917e31fcbbc94918c336da0163ef628df",
|
||||||
|
"sha256:b5c371b54f9ebb9ec420d535748d40c6945faf829420c1c5b254b1b77f70b153",
|
||||||
|
"sha256:bb3c25ac6688ceac074e8d09951d29d1ef49c0645f677550d7cbe5b950da5ccf",
|
||||||
|
"sha256:bf634a56f449a4123e48e538d661948e5ac29fb452acd2962b8cb834b472a9d7",
|
||||||
|
"sha256:ceab668be9cbdcddef55ad459f87acd0316ae4a00d32251fea4cf665f5062fda",
|
||||||
|
"sha256:d759ccb6e7539130f0b272bc246715cad2a2fb91520d62db183d62d65f80a215",
|
||||||
|
"sha256:d9f6ce961e0c380e2a5435f64c96421dbcebeab6a1b41506bd81251feb733c08",
|
||||||
|
"sha256:e921571b51514a6d35944242d6fef6427c3998acf58940fe1f209ac8a92a6e87",
|
||||||
|
"sha256:edb07eac99d3b8abd7ba56ff4bedec9263f76dfc3c3f450e7d2e2bcdecf8559b",
|
||||||
|
"sha256:f0a7e1d94a318f74974345fd0987ec389b16988ec484e67218e900b116b932a8"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==1.79"
|
||||||
|
},
|
||||||
|
"numpy": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:0fe563fc8ed9dc4474cbf70742673fc4391d70f4363f917599a7fa99f042d5a8",
|
||||||
|
"sha256:12ac457b63ec8ded85d85c1e17d85efd3c2b0967ca39560b307a35a6703a4735",
|
||||||
|
"sha256:2341f4ab6dba0834b685cce16dad5f9b6606ea8a00e6da154f5dbded70fdc4dd",
|
||||||
|
"sha256:296d17aed51161dbad3c67ed6d164e51fcd18dbcd5dd4f9d0a9c6055dce30810",
|
||||||
|
"sha256:488a66cb667359534bc70028d653ba1cf307bae88eab5929cd707c761ff037db",
|
||||||
|
"sha256:4d52914c88b4930dafb6c48ba5115a96cbab40f45740239d9f4159c4ba779962",
|
||||||
|
"sha256:5e13030f8793e9ee42f9c7d5777465a560eb78fa7e11b1c053427f2ccab90c79",
|
||||||
|
"sha256:61be02e3bf810b60ab74e81d6d0d36246dbfb644a462458bb53b595791251911",
|
||||||
|
"sha256:7607b598217745cc40f751da38ffd03512d33ec06f3523fb0b5f82e09f6f676d",
|
||||||
|
"sha256:7a70a7d3ce4c0e9284e92285cba91a4a3f5214d87ee0e95928f3614a256a1488",
|
||||||
|
"sha256:7ab46e4e7ec63c8a5e6dbf5c1b9e1c92ba23a7ebecc86c336cb7bf3bd2fb10e5",
|
||||||
|
"sha256:8981d9b5619569899666170c7c9748920f4a5005bf79c72c07d08c8a035757b0",
|
||||||
|
"sha256:8c053d7557a8f022ec823196d242464b6955a7e7e5015b719e76003f63f82d0f",
|
||||||
|
"sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f",
|
||||||
|
"sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2",
|
||||||
|
"sha256:95de7dc7dc47a312f6feddd3da2500826defdccbc41608d0031276a24181a2c0",
|
||||||
|
"sha256:a0882323e0ca4245eb0a3d0a74f88ce581cc33aedcfa396e415e5bba7bf05f68",
|
||||||
|
"sha256:a8365b942f9c1a7d0f0dc974747d99dd0a0cdfc5949a33119caf05cb314682d3",
|
||||||
|
"sha256:a8aae2fb3180940011b4862b2dd3756616841c53db9734b27bb93813cd79fce6",
|
||||||
|
"sha256:c237129f0e732885c9a6076a537e974160482eab8f10db6292e92154d4c67d71",
|
||||||
|
"sha256:c67b833dbccefe97cdd3f52798d430b9d3430396af7cdb2a0c32954c3ef73894",
|
||||||
|
"sha256:ce03305dd694c4873b9429274fd41fc7eb4e0e4dea07e0af97a933b079a5814f",
|
||||||
|
"sha256:d331afac87c92373826af83d2b2b435f57b17a5c74e6268b79355b970626e329",
|
||||||
|
"sha256:dada341ebb79619fe00a291185bba370c9803b1e1d7051610e01ed809ef3a4ba",
|
||||||
|
"sha256:ed2cc92af0efad20198638c69bb0fc2870a58dabfba6eb722c933b48556c686c",
|
||||||
|
"sha256:f260da502d7441a45695199b4e7fd8ca87db659ba1c78f2bbf31f934fe76ae0e",
|
||||||
|
"sha256:f2f390aa4da44454db40a1f0201401f9036e8d578a25f01a6e237cea238337ef",
|
||||||
|
"sha256:f76025acc8e2114bb664294a07ede0727aa75d63a06d2fae96bf29a81747e4a7"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==1.23.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"develop": {}
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
## Bioinformatics
|
||||||
|
|
||||||
|
### Genomes Used
|
||||||
|
|
||||||
|
- [E. coli](https://www.ncbi.nlm.nih.gov/nuccore/JANTOA000000000.1?report=genbank)
|
||||||
|
|
||||||
|
### Proteins Used
|
||||||
|
|
||||||
|
- [BRCA1](https://www.ncbi.nlm.nih.gov/protein/1698399?report=fasta)
|
||||||
|
|
||||||
|
### References
|
||||||
|
|
||||||
|
- [Oregon State University: Applied Bioinformatics](https://open.oregonstate.education/appliedbioinformatics/front-matter/applied-bioinformatics/#front-matter-490-section-1)
|
||||||
|
|
||||||
|
|
||||||
|
## Topics
|
||||||
|
|
||||||
|
1. [Sequence Motifs](sequence-motifs.py)
|
||||||
|
2. [Sequence Alignment](sequence-alignment.py)
|
Binary file not shown.
|
@ -0,0 +1,29 @@
|
||||||
|
>AAC37594.1 BRCA1 [Homo sapiens]
|
||||||
|
MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQCPLCKNDITK
|
||||||
|
RSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKDEVSIIQSMGYRNRAKRLLQS
|
||||||
|
EPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELGSDSSEDTVNKATYCSVGDQELLQITPQ
|
||||||
|
GTRDEISLDSAKKAACEFSETDVTNTEHHQPSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHA
|
||||||
|
SSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCER
|
||||||
|
KEWNKQKLPCSENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVD
|
||||||
|
EYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTENLIIGAFVTEP
|
||||||
|
QIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGD
|
||||||
|
SIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRN
|
||||||
|
LSPPNCTELQIDSCSSSEEIKKKKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPEL
|
||||||
|
KLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSIS
|
||||||
|
LVPGTDYGTQESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHS
|
||||||
|
RETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVTFECEQKEENQ
|
||||||
|
GKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRGNETGLITPNKHGLLQNPYRI
|
||||||
|
PPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIPSTVSTISRNNIRENVFKEASSSNINEVGSS
|
||||||
|
TNEVGSSINEIGSSDENIQAELGRNRGPKLNAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTV
|
||||||
|
NTDFSPYLISDNLEQPMGSSHASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSP
|
||||||
|
SPFTHTHLAQGYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENL
|
||||||
|
LSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGSSKQMRHQSES
|
||||||
|
QGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSEDCSGLSSQSDILTTQQRDTM
|
||||||
|
QHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALEDLRNPEQSTSEKAVLTSQKSSEYPISQNPE
|
||||||
|
GLSADKFEVSADSSTSKNKEPGVERSSPSKCPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLE
|
||||||
|
ESGPHDLTETSYLPRQDLEGTPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAES
|
||||||
|
AQSPAAAHTTDTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLI
|
||||||
|
TEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDVVNGRNHQGPK
|
||||||
|
RARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTLGTGVHPIVVVQPDAWTEDNG
|
||||||
|
FHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPHSHY
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1 @@
|
||||||
|
Command output results go here.
|
|
@ -0,0 +1,21 @@
|
||||||
|
#!/bin/bash
# Download the SwissProt protein FASTA (if it is not already present) and run
# a blastp search for BRCA1 against it, writing the XML report to
# output/brca1_swissprot_out.xml.

# Stop at the first failing command so that a failed `cd` or download cannot
# cascade into later steps running in the wrong directory or on missing files.
set -euo pipefail

# Resolve the directory holding this script, then work from its parent.
# Quoted so that paths containing spaces are handled correctly.
DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd -- "$DIR/.."

# Download
if [ ! -f data/swissprot.fasta ]; then
    wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/swissprot.gz
    gunzip swissprot.gz
    mv swissprot data/swissprot.fasta
    # gunzip normally removes the archive itself; this is a best-effort cleanup.
    rm -f swissprot.gz
else
    echo skipped: data/swissprot.fasta already exists
fi

# NOTE(review): the blastp call below passes `-db swissprot.fasta`, which
# presumably needs the database files this disabled step would create.
# Confirm whether it is meant to be run manually.
# if [ ! -f data/swissprot.fasta.pdb ]; then
#     makeblastdb -in data/swissprot.fasta -input_type fasta -title swissprot -dbtype prot
# else
#     echo skipped: swissprot DB already exists
# fi

# Run from data/ so the query and database are addressed by bare file name.
(cd data && blastp -query brca1.fasta -db swissprot.fasta -outfmt 5 > ../output/brca1_swissprot_out.xml)
|
|
@ -0,0 +1,102 @@
|
||||||
|
# Resources:
|
||||||
|
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-3/
|
||||||
|
|
||||||
|
from doctest import OutputChecker
|
||||||
|
import subprocess
|
||||||
|
from Bio.Blast import NCBIXML, Record
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Demo: Needleman-Wunsch global alignment algorithm
|
||||||
|
(implementation from scratch)
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
def nw_align(a: "Seq | str", b: "Seq | str", *, match: int = 1,
             mismatch: int = -1, gap: int = -1):
    """
    Build the Needleman-Wunsch global alignment score matrix for A & B.

    Textbook section 3.1.1

    Only the dynamic-programming score table is computed; no traceback is
    performed, so the aligned strings themselves are not returned.

    Args:
        a: First sequence; indexes the rows of the matrix.
        b: Second sequence; indexes the columns of the matrix.
        match: Score added when two residues are equal.
        mismatch: Score added when two residues differ.
        gap: Score added per gap position. It is added to the running
            score, so it must be negative to act as a penalty.

    Returns:
        A float numpy array of shape (len(a) + 1, len(b) + 1). Row 0 and
        column 0 correspond to aligning against leading gaps, and the
        bottom-right cell holds the optimal global alignment score.
    """
    # the +1 in each dimension leaves room for the leading gap row/column
    rows, cols = len(a) + 1, len(b) + 1
    matrix = np.zeros((rows, cols))

    # a prefix of length k aligned purely against gaps costs k gap penalties
    matrix[:, 0] = gap * np.arange(rows)
    matrix[0, :] = gap * np.arange(cols)

    # fill the remaining cells; each one depends on its upper-left, left and
    # upper neighbours, so row-major order guarantees they are ready
    for i, residue_a in enumerate(a, start=1):
        for j, residue_b in enumerate(b, start=1):
            substitution = match if residue_a == residue_b else mismatch
            matrix[i, j] = max(
                matrix[i - 1, j - 1] + substitution,  # align the two residues
                matrix[i, j - 1] + gap,               # adjacent -> gap of 1
                matrix[i - 1, j] + gap,
            )

    return matrix
|
||||||
|
|
||||||
|
# example from textbook
|
||||||
|
print(nw_align('CAGCTAGCG', 'CCATACGA'))
|
||||||
|
|
||||||
|
# print("""
|
||||||
|
# -----------------------------------------------------------------
|
||||||
|
# Demo: Smith-Waterman local alignment algorithm
|
||||||
|
# (implementation from scratch)
|
||||||
|
# -----------------------------------------------------------------
|
||||||
|
# """)
|
||||||
|
|
||||||
|
# def sw_align(a: Seq, b: Seq):
|
||||||
|
# """
|
||||||
|
# Locally matches sequences A & B
|
||||||
|
# """
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Task: Run BLAST tasks (./scripts/blast-tasks.sh)
|
||||||
|
- Download protein database (swissprot)
|
||||||
|
- Extract database
|
||||||
|
- Run search task for BRCA1 (tumor suppression protein)
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
# check=True raises CalledProcessError when the script exits non-zero, so a
# failed download or search stops here instead of surfacing later as a
# confusing error while reading the XML output.
subprocess.run(["bash", "./scripts/blast-tasks.sh"], check=True)
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Demo: Parse BLAST XML output & read matches
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Read the BLAST XML report written by the search step and summarise every
# alignment it contains.
out_path = './output/brca1_swissprot_out.xml'
print(f'read {out_path}...\n')

# NCBIXML.read expects the file to hold exactly one BLAST record.
with open(out_path) as file:
    blast_out = NCBIXML.read(file)

for alignment in blast_out.alignments:
    print('Alignment')
    print('---------')
    print(f'title: {alignment.title}')
    print(f'length: {alignment.length}')
    # one match line per high-scoring pair found within this alignment
    print(f'high-scoring pair (HSP) matches: {[hsp.match for hsp in alignment.hsps]}\n')
|
|
@ -0,0 +1,123 @@
|
||||||
|
# find instances of a promoter consensus sequence in e-coli genome
|
||||||
|
# https://open.oregonstate.education/appliedbioinformatics/chapter/chapter-2-sequence-motifs/
|
||||||
|
# https://en.wikipedia.org/wiki/Sequence_motif
|
||||||
|
|
||||||
|
import textwrap
|
||||||
|
from Bio import SeqIO, SeqUtils, motifs
|
||||||
|
from Bio.Seq import Seq
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Demo: Calculate sequence complexity
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
def sequence_complexity(sequence: "Seq | str") -> float:
    """
    Return the complexity of a nucleotide sequence.

    Complexity is defined as:

        1 / N * log_D( N! / (n_A! * n_C! * n_T! * n_G!) )

    Where:
        D = 4 (alphabet size),
        N = Total length of sequence,
        n_X = number of nucleotide X in sequence

    AAAAAAAA -> less complex (0.0)

    ATCGATCG -> more complex

    Only the characters "A", "T", "C" and "G" are counted; any other
    character contributes to N but not to the denominator.

    Args:
        sequence: The sequence to score.

    Returns:
        The complexity as a float. An empty sequence scores 0.0.
    """
    total = len(sequence)
    if total == 0:
        # nothing to measure; also avoids dividing by zero below
        return 0.0

    # The multinomial coefficient is always a whole number, so compute it
    # with exact integer division. Each step divides evenly. True division
    # would overflow a float for long sequences, whereas math.log accepts
    # arbitrarily large integers.
    arrangements = math.factorial(total)
    for nucleotide in "ATCG":
        arrangements //= math.factorial(sequence.count(nucleotide))

    return 1 / total * math.log(arrangements, 4)  # base 4 = alphabet size
|
||||||
|
|
||||||
|
# print(textwrap.dedent(sequence_complexity.__doc__))
|
||||||
|
|
||||||
|
# Print a two-column, tab-separated table with the complexity score of a few
# sample sequences (uniform, repetitive and mixed).
print('Sequence\tComplexity')
print('--------\t----------')
for sample in (
    'AAAAAAAA',
    'AAAAAAAAAAAA',
    'ATCGATCG',
    'ATCGATCGATCG',
    'ATATATAT',
    'ACTACTAA',
    'AAAAAATA',
):
    complexity = sequence_complexity(sample)
    print(f'{sample}\t{complexity}')
|
||||||
|
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Demo: Generate a consensus sequence based on a collection
|
||||||
|
of variations
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Five observed variants of the same short site; the motif built from them
# is reported as per-position counts and as a degenerate consensus.
variations = [
    Seq(variant)
    for variant in ("CAGTT", "CATTT", "CAGTA", "CAGTT", "CAGTA")
]
motif = motifs.create(variations)

print("value counts:\n", motif.counts)

print("consensus: ", motif.degenerate_consensus)
|
||||||
|
|
||||||
|
print("""Get more common consensus sequences from JASPAR, e.g.:
|
||||||
|
https://jaspar.genereg.net/matrix/MA0447.1/""")
|
||||||
|
|
||||||
|
|
||||||
|
print("""
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
Demo: find a common promoter motif in the E. Coli Genome
|
||||||
|
-----------------------------------------------------------------
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Using promoters from:
# https://en.wikipedia.org/wiki/Promoter_(genetics)#Bacterial
promoter_consensus = Seq('AAAAAARNR')

# load e-coli genome
with open('./data/e-coli.gb', "r") as file:
    gb_file = SeqIO.parse(file, 'genbank')

    # genome we want is first item in the genbank file
    data = next(gb_file)

print('Genome Data')
print('-----------')
print(f'id: {data.id}')
print(f'len: {len(data.seq)} bp\n')

genome: Seq = data.seq

# search for promoter seq; nt_search returns the translated pattern followed
# by the start index of every match
result = SeqUtils.nt_search(str(genome), promoter_consensus)

# Slice rather than index positions 1..4 so that a genome with fewer than
# four matches reports what was found instead of raising IndexError.
hits = result[1:5]

print('Search Results')
print('----------------')
print(f'''original search sequence: {str(promoter_consensus)}
translated sequence: {result[0]}
indices of first 4 instances: {hits}
contents of first 4 instances:
{[str(genome[start:start + len(promoter_consensus)]) for start in hits]}
''')
|
Loading…
Reference in New Issue