GO Annotations
Steven Roberts 2025-03-21
- Variables
- Aerobic respiration (GO:0009060)
- Oxidative phosphorylation (GO:0006119)
- Canonical glycolysis (GO:0061621)
- Tricarboxylic Acid Cycle (GO:0006099)
- Summary of blast hits
The goal is to start by retrieving proteins annotated with specific GO terms;
see also https://www.ebi.ac.uk/QuickGO/annotations
Variables
# Global R options
# Echo the source of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# Define key paths and tool directories
# OUT_DIR keeps its trailing slash: the bash chunks append file names to it
# directly, without inserting a path separator.
OUT_DIR <- "../output/27-Apul-pheno-annot/"
# E-value cutoff, stored as a string so it can be exported unchanged.
evalue <- "1E-20"
# Protein FASTA file for the genome under study.
fasta <- "../data/Apulchra-genome.pep.faa"

# Export these as environment variables for bash chunks.
Sys.setenv(
  OUT_DIR = OUT_DIR,
  evalue = evalue,
  fasta = fasta
)
Aerobic respiration (GO:0009060)
# Numeric part of the GO accession handled by this chunk.
GO="0009060"

# Download the reviewed UniProtKB entries matching this GO term as FASTA.
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Preview the download.
head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Count FASTA records (one ">" header line per sequence).
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"

# Search the genome's proteins ($fasta) against that database. Results are
# written in tabular format (-outfmt 6); anything blastp prints on stderr is
# captured in a separate warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

# Preview the hits and report how many rows were written.
head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"
wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
100 673k 0 673k 0 0 378k 0 --:--:-- 0:00:01 --:--:-- 377k
100 2157k 0 2157k 0 0 768k 0 --:--:-- 0:00:02 --:--:-- 768k
100 3305k 0 3305k 0 0 868k 0 --:--:-- 0:00:03 --:--:-- 868k
100 4953k 0 4953k 0 0 1015k 0 --:--:-- 0:00:04 --:--:-- 1015k
100 5219k 0 5219k 0 0 1064k 0 --:--:-- 0:00:04 --:--:-- 1346k
>sp|A0A096P8D3|IDH_OSTTA Isocitrate dehydrogenase (NAD(+)), mitochondrial OS=Ostreococcus tauri OX=70448 GN=IDH PE=1 SV=1
MTRVERGRVLARAIERAVAHRASARRWTTTTRTPAWMVTGWMGGRGVDRSTAMTRFERCG
STASSKITAAPMVYVRGEEMTAYVMDLIRSRWIEPRVDVGGWETFDLRAKNRDDTEDRVL
RDVIEAGKRIKAIFKEPTVTPTADQVKRLGLRKSWGSPNGAMRRGWNGITISRDTIHIDG
VELGYKKPVLFERHAVGGEYSAGYKNVGKGKLTTTFTPSEGPDAGKTVVVDEREIVDEEA
AVVTYHNPYDNVHDLARFFFGRCLEAKVTPYVVTKKTVFKWQEPFWQIMRTVFDEEFKAQ
FVAAGVMKEGEELVHLLSDAATMKLVQWRQGGFGMAAHNYDGDVLTDELAQVHKSPGFIT
SNLVGVHEDGTLIKEFEASHGTVADMDEARLRGEETSLNPLGMVEGLIGAMNHAADVHNI
DRDRTHAFTTKMRTVIHQLFREGKGTRDLCGPSGLTTEQFIDAVAERLDA
>sp|A0A0D2Y5A7|ODP2_FUSO4 Dihydrolipoyllysine-residue acetyltransferase component of pyruvate dehydrogenase complex, mitochondrial OS=Fusarium oxysporum f. sp. lycopersici (strain 4287 / CBS 123668 / FGSC 9935 / NRRL 34936) OX=426428 GN=LAT1 PE=1 SV=1
Number of Proteins
10978
Building a new DB, current time: 03/21/2025 12:49:44
New DB name: /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0009060
New DB title: ../output/27-Apul-pheno-annot/SwissProt-GO:0009060.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0009060
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 10978 sequences in 0.179029 seconds.
FUN_000047-T1 sp|Q757N1|H3_EREGS 88.971 136 15 0 4 139 1 136 2.14e-86 246
FUN_000049-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000051-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000053-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000055-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000057-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000059-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000061-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000063-T1 sp|Q757N1|H3_EREGS 92.079 101 8 0 1 101 1 101 1.61e-63 190
FUN_000065-T1 sp|Q757N1|H3_EREGS 88.971 136 15 0 4 139 1 136 2.14e-86 246
Number of hits
616 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0009060_out.tab
Oxidative phosphorylation (GO:0006119)
# Numeric part of the GO accession handled by this chunk.
GO="0006119"

# Download the reviewed UniProtKB entries matching this GO term as FASTA.
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Preview the download.
head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Count FASTA records (one ">" header line per sequence).
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"

# Search the genome's proteins ($fasta) against that database. Results are
# written in tabular format (-outfmt 6); anything blastp prints on stderr is
# captured in a separate warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

# Preview the hits and report how many rows were written.
head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"
wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 479k 0 479k 0 0 285k 0 --:--:-- 0:00:01 --:--:-- 285k
100 2632k 0 2632k 0 0 969k 0 --:--:-- 0:00:02 --:--:-- 969k
100 2675k 0 2675k 0 0 984k 0 --:--:-- 0:00:02 --:--:-- 983k
>sp|A0A1D8PHA3|CYT1_CANAL Cytochrome b-c1 complex catalytic subunit, mitochondrial OS=Candida albicans (strain SC5314 / ATCC MYA-2876) OX=237561 GN=CYT1 PE=1 SV=1
MFRTAYKTMNQSMVQKFIAGGVGVTGLTASYLLYQDSMTADAMTAAEHGLHPPAYNWPHN
GMFETFDHASIRRGFQVYREVCAACHSLDRIAWRNLVGVSHTTSEAKAMAEELEYDDEPD
DEGKPRKRPGKLADYIPGPYENEQAARAANQGAYPPDLSLIVKARHGGSDYIFSLLTGYP
DEPPAGVVLPEGSNYNPYFPGGAIAMGRVLFDDLVEYEDGTPATTSQMAKDVSTFLNWAS
EPEHDDRKKWGLKALVVLSSLYLLSIWVKRFKWTPIKNRKFRFDPPKK
>sp|A0A1D8PJX3|RIP1_CANAL Cytochrome b-c1 complex subunit Rieske, mitochondrial OS=Candida albicans (strain SC5314 / ATCC MYA-2876) OX=237561 GN=RIP1 PE=1 SV=1
MSSLAFRTLRNGLGLKSSVRALSTTTTTLSNYQQPDYSSYLNNKSGQGSRNFTYFMVGSM
GLLSAAGAKSTVEAFLSSFAASADVLAMAKVEVKLGAIPEGKNVIIKWQGKPVFIRHRTA
DEIEEANQVDIKTLRDPQNDADRVKKPEWLIMLGICTHLGCVPIGEAGDFGGWFCPCHGS
Number of Proteins
6140
Building a new DB, current time: 03/21/2025 12:50:54
New DB name: /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006119
New DB title: ../output/27-Apul-pheno-annot/SwissProt-GO:0006119.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006119
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 6140 sequences in 0.103551 seconds.
FUN_000943-T1 sp|Q791V5|MTCH2_MOUSE 41.549 284 162 2 24 305 8 289 2.00e-80 243
FUN_000977-T1 sp|Q6XHB2|ROCO4_DICDI 30.035 283 163 10 589 848 1023 1293 6.93e-28 119
FUN_001181-T1 sp|O52683|HYDA_THEMA 29.902 408 237 17 115 515 184 549 5.65e-40 151
FUN_001463-T1 sp|Q6XHB2|ROCO4_DICDI 27.059 340 219 14 1552 1876 954 1279 1.37e-23 107
FUN_001483-T1 sp|Q6CFT7|ATPB_YARLI 25.301 415 280 10 45 448 52 447 3.17e-29 117
FUN_001648-T1 sp|P24539|AT5F1_HUMAN 39.691 194 108 3 15 207 62 247 2.52e-38 130
FUN_001648-T2 sp|P24539|AT5F1_HUMAN 40.314 191 105 3 47 236 62 244 2.48e-38 131
FUN_003901-T1 sp|P29410|KAD2_RAT 34.314 204 105 5 30 204 20 223 9.03e-29 105
FUN_004027-T1 sp|Q05752|NDUA7_BOVIN 53.947 76 34 1 1 75 1 76 6.67e-23 83.2
FUN_004095-T1 sp|Q6XHB2|ROCO4_DICDI 28.975 283 175 10 37 301 1015 1289 2.80e-21 96.3
Number of hits
130 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006119_out.tab
Canonical glycolysis (GO:0061621)
# Numeric part of the GO accession handled by this chunk.
GO="0061621"

# Download the reviewed UniProtKB entries matching this GO term as FASTA.
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Preview the download.
head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Count FASTA records (one ">" header line per sequence).
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"

# Search the genome's proteins ($fasta) against that database. Results are
# written in tabular format (-outfmt 6); anything blastp prints on stderr is
# captured in a separate warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

# Preview the hits and report how many rows were written.
head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"
wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- 0:00:01 --:--:-- 0
100 208k 0 208k 0 0 117k 0 --:--:-- 0:00:01 --:--:-- 117k
>sp|A1A4J1|PFKAL_BOVIN ATP-dependent 6-phosphofructokinase, liver type OS=Bos taurus OX=9913 GN=PFKL PE=2 SV=1
MASVDLEKLRTTGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGAKVFLIYEGYEGLVE
GGENIKQANWLSVSNIIQLGGTVIGSARCKAFTTREGRRAAAYNLVQRGITNLCVIGGDG
SLTGANIFRSEWGSLLEELVSEGKISEGTAQTYSHLNIAGLVGSIDNDFCGTDMTIGTDS
ALHRIMEVIDAITTTAQSHQRTFVLEVMGRHCGYLALVSALASGADWLFIPEAPPEDGWE
NFMCERLGETRSRGSRLNIIIIAEGAIDRNGKPISSRYVKDLVVQRLGFDTRVTVLGHVQ
RGGTPSAFDRILSSKMGMEAVMALLEATPDTPACVVSLSGNQSVRLPLMECVQMTKEVQK
AMDEKRFDEAIQLRGGSFENNWNIYKLLSHQKISKEKTNFSLAILNVGAPAAGMNAAVRS
AVRSGISQGHTVYVVHDGFEGLAKNQVQEVSWHDVAGWLGRGGSMLGTKRTLPKGFMEKI
VENIRLHNIHALLVIGGFEAYEGVLQLVEARGRYEELCIVMCVIPATISNNVPGTDFSLG
Number of Proteins
394
Building a new DB, current time: 03/21/2025 12:51:31
New DB name: /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0061621
New DB title: ../output/27-Apul-pheno-annot/SwissProt-GO:0061621.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0061621
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 394 sequences in 0.0159671 seconds.
FUN_005030-T1 sp|P85037|FOXK1_HUMAN 51.000 100 48 1 95 194 302 400 4.38e-26 103
FUN_005036-T1 sp|P42128|FOXK1_MOUSE 48.454 97 50 0 30 126 290 386 1.70e-27 105
FUN_005159-T1 sp|Q3UCQ1|FOXK2_MOUSE 52.041 98 45 1 95 190 247 344 7.57e-28 108
FUN_008233-T1 sp|Q7ZX03|FOXK2_XENLA 43.066 137 75 1 37 173 188 321 1.35e-33 125
FUN_009301-T1 sp|Q867C9|PFKAM_HORSE 58.634 776 300 4 70 833 13 779 0.0 935
FUN_010519-T1 sp|P00940|TPIS_CHICK 69.672 244 74 0 3 246 4 247 1.09e-126 353
FUN_012871-T1 sp|P00940|TPIS_CHICK 69.672 244 74 0 3 246 4 247 1.09e-126 353
FUN_013147-T1 sp|Q7ZX03|FOXK2_XENLA 46.939 98 50 1 143 238 215 312 3.77e-24 96.3
FUN_013305-T1 sp|P42128|FOXK1_MOUSE 42.202 109 61 1 47 153 287 395 2.42e-22 86.7
FUN_015434-T1 sp|P05064|ALDOA_MOUSE 72.269 357 97 1 5 359 8 364 0.0 528
Number of hits
37 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0061621_out.tab
Tricarboxylic Acid Cycle (GO:0006099)
# Numeric part of the GO accession handled by this chunk.
GO="0006099"

# Download the reviewed UniProtKB entries matching this GO term as FASTA.
curl -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Preview the download.
head "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Count FASTA records (one ">" header line per sequence).
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}"SwissProt-GO:"${GO}".fa

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}"SwissProt-GO:"${GO}".fa \
-dbtype prot \
-out "${OUT_DIR}"SwissProt-GO:"${GO}"

# Search the genome's proteins ($fasta) against that database. Results are
# written in tabular format (-outfmt 6); anything blastp prints on stderr is
# captured in a separate warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}"SwissProt-GO:"${GO}" \
-out "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}"blast_warnings"${GO}".txt

# Preview the hits and report how many rows were written.
head "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab

echo "Number of hits"
wc -l "${OUT_DIR}"Apul_blastp-GO:"${GO}"_out.tab
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 31776 0 31776 0 0 26928 0 --:--:-- 0:00:01 --:--:-- 26906
100 1726k 0 1726k 0 0 889k 0 --:--:-- 0:00:01 --:--:-- 889k
>sp|A0A096P8D3|IDH_OSTTA Isocitrate dehydrogenase (NAD(+)), mitochondrial OS=Ostreococcus tauri OX=70448 GN=IDH PE=1 SV=1
MTRVERGRVLARAIERAVAHRASARRWTTTTRTPAWMVTGWMGGRGVDRSTAMTRFERCG
STASSKITAAPMVYVRGEEMTAYVMDLIRSRWIEPRVDVGGWETFDLRAKNRDDTEDRVL
RDVIEAGKRIKAIFKEPTVTPTADQVKRLGLRKSWGSPNGAMRRGWNGITISRDTIHIDG
VELGYKKPVLFERHAVGGEYSAGYKNVGKGKLTTTFTPSEGPDAGKTVVVDEREIVDEEA
AVVTYHNPYDNVHDLARFFFGRCLEAKVTPYVVTKKTVFKWQEPFWQIMRTVFDEEFKAQ
FVAAGVMKEGEELVHLLSDAATMKLVQWRQGGFGMAAHNYDGDVLTDELAQVHKSPGFIT
SNLVGVHEDGTLIKEFEASHGTVADMDEARLRGEETSLNPLGMVEGLIGAMNHAADVHNI
DRDRTHAFTTKMRTVIHQLFREGKGTRDLCGPSGLTTEQFIDAVAERLDA
>sp|A0A3Q0KQY7|FUMC_SCHMA Fumarate hydratase OS=Schistosoma mansoni OX=6183 GN=Smp_158240 PE=1 SV=2
Number of Proteins
2837
Building a new DB, current time: 03/21/2025 12:51:40
New DB name: /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006099
New DB title: ../output/27-Apul-pheno-annot/SwissProt-GO:0006099.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006099
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 2837 sequences in 0.0586021 seconds.
FUN_000960-T1 sp|P21399|ACOHC_HUMAN 70.588 867 255 0 24 890 21 887 0.0 1312
FUN_001356-T1 sp|A0QMB9|GABD1_MYCA1 30.787 445 291 4 79 508 7 449 5.73e-62 214
FUN_003898-T1 sp|Q9V0D5|MDH_PYRAB 41.694 307 173 3 6 310 7 309 2.54e-71 224
FUN_003899-T1 sp|Q9V0D5|MDH_PYRAB 37.110 353 215 5 28 377 9 357 2.79e-73 229
FUN_004736-T1 sp|Q9VWH4|IDH3A_DROME 72.892 332 89 1 40 371 47 377 0.0 514
FUN_004752-T1 sp|A0QMB9|GABD1_MYCA1 34.848 462 284 6 426 879 3 455 5.01e-74 248
FUN_006329-T1 sp|P09624|DLDH_YEAST 56.557 488 199 7 30 508 15 498 0.0 540
FUN_006649-T1 sp|A4SFT4|MDH_CHLPM 36.093 302 182 4 24 322 2 295 4.65e-64 202
FUN_006864-T1 sp|Q73TP5|GABD1_MYCPA 35.141 461 285 7 65 519 5 457 6.75e-76 244
FUN_006864-T2 sp|Q73TP5|GABD1_MYCPA 35.141 461 285 7 65 519 5 457 2.18e-76 245
Number of hits
82 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006099_out.tab
Summary of blast hits
# Line counts for every file in OUT_DIR whose name ends in "tab".
wc -l "${OUT_DIR}"*tab
82 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006099_out.tab
130 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006119_out.tab
616 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0009060_out.tab
37 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0061621_out.tab
865 total