Getting to GO with Phenotype

A short list for machine learning
e5
GO
Author
Affiliation

Steven Roberts

Published

March 21, 2025

GO Annotations

Steven Roberts 2025-03-21

The goal is to start by retrieving proteins annotated with specific GO terms.

See also the QuickGO annotation browser: https://www.ebi.ac.uk/QuickGO/annotations

Variables

# Global R options: echo the source of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

# Key paths and search parameters shared by the chunks below.
# OUT_DIR keeps its trailing slash because the bash chunks append file names
# to it directly (e.g. "${OUT_DIR}SwissProt-GO:...").
OUT_DIR <- "../output/27-Apul-pheno-annot/"
# E-value cutoff handed to `blastp -evalue`.
evalue <- "1E-20"
# Protein FASTA handed to `blastp -query`.
fasta <- "../data/Apulchra-genome.pep.faa"

# Create the output directory if it is missing so the first download in a
# fresh checkout does not fail; this is a no-op when it already exists.
dir.create(OUT_DIR, recursive = TRUE, showWarnings = FALSE)

# Export these as environment variables for bash chunks.
Sys.setenv(
  OUT_DIR = OUT_DIR,
  evalue = evalue,
  fasta = fasta
)

Aerobic respiration (GO:0009060)

# Numeric part of the GO accession handled by this chunk (GO:0009060).
# OUT_DIR, evalue and fasta are read from the environment.
GO="0009060"

# Download all reviewed UniProtKB entries annotated with this GO term as FASTA.
# --fail makes curl exit non-zero on an HTTP error instead of writing the
# server's error body into the .fa file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Preview the download.
head "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Count lines containing ">", i.e. FASTA header lines.
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}SwissProt-GO:${GO}.fa" \
-dbtype prot \
-out "${OUT_DIR}SwissProt-GO:${GO}"


# Search the query proteins against that database. Results are written in
# tabular format (-outfmt 6), limited to one target and one HSP per query;
# anything blastp prints on stderr goes to a per-GO warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}SwissProt-GO:${GO}" \
-out "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab" \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}blast_warnings${GO}.txt"

# Preview the hit table.
head "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"

echo "Number of hits"

# One line per reported hit.
wc -l "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  673k    0  673k    0     0   378k      0 --:--:--  0:00:01 --:--:--  377k
100 2157k    0 2157k    0     0   768k      0 --:--:--  0:00:02 --:--:--  768k
100 3305k    0 3305k    0     0   868k      0 --:--:--  0:00:03 --:--:--  868k
100 4953k    0 4953k    0     0  1015k      0 --:--:--  0:00:04 --:--:-- 1015k
100 5219k    0 5219k    0     0  1064k      0 --:--:--  0:00:04 --:--:-- 1346k
>sp|A0A096P8D3|IDH_OSTTA Isocitrate dehydrogenase (NAD(+)), mitochondrial OS=Ostreococcus tauri OX=70448 GN=IDH PE=1 SV=1
MTRVERGRVLARAIERAVAHRASARRWTTTTRTPAWMVTGWMGGRGVDRSTAMTRFERCG
STASSKITAAPMVYVRGEEMTAYVMDLIRSRWIEPRVDVGGWETFDLRAKNRDDTEDRVL
RDVIEAGKRIKAIFKEPTVTPTADQVKRLGLRKSWGSPNGAMRRGWNGITISRDTIHIDG
VELGYKKPVLFERHAVGGEYSAGYKNVGKGKLTTTFTPSEGPDAGKTVVVDEREIVDEEA
AVVTYHNPYDNVHDLARFFFGRCLEAKVTPYVVTKKTVFKWQEPFWQIMRTVFDEEFKAQ
FVAAGVMKEGEELVHLLSDAATMKLVQWRQGGFGMAAHNYDGDVLTDELAQVHKSPGFIT
SNLVGVHEDGTLIKEFEASHGTVADMDEARLRGEETSLNPLGMVEGLIGAMNHAADVHNI
DRDRTHAFTTKMRTVIHQLFREGKGTRDLCGPSGLTTEQFIDAVAERLDA
>sp|A0A0D2Y5A7|ODP2_FUSO4 Dihydrolipoyllysine-residue acetyltransferase component of pyruvate dehydrogenase complex, mitochondrial OS=Fusarium oxysporum f. sp. lycopersici (strain 4287 / CBS 123668 / FGSC 9935 / NRRL 34936) OX=426428 GN=LAT1 PE=1 SV=1
Number of Proteins
10978


Building a new DB, current time: 03/21/2025 12:49:44
New DB name:   /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0009060
New DB title:  ../output/27-Apul-pheno-annot/SwissProt-GO:0009060.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0009060
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 10978 sequences in 0.179029 seconds.


FUN_000047-T1   sp|Q757N1|H3_EREGS  88.971  136 15  0   4   139 1   136 2.14e-86    246
FUN_000049-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000051-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000053-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000055-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000057-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000059-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000061-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000063-T1   sp|Q757N1|H3_EREGS  92.079  101 8   0   1   101 1   101 1.61e-63    190
FUN_000065-T1   sp|Q757N1|H3_EREGS  88.971  136 15  0   4   139 1   136 2.14e-86    246
Number of hits
616 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0009060_out.tab

Oxidative phosphorylation (GO:0006119)

# Numeric part of the GO accession handled by this chunk (GO:0006119).
# OUT_DIR, evalue and fasta are read from the environment.
GO="0006119"

# Download all reviewed UniProtKB entries annotated with this GO term as FASTA.
# --fail makes curl exit non-zero on an HTTP error instead of writing the
# server's error body into the .fa file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Preview the download.
head "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Count lines containing ">", i.e. FASTA header lines.
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}SwissProt-GO:${GO}.fa" \
-dbtype prot \
-out "${OUT_DIR}SwissProt-GO:${GO}"


# Search the query proteins against that database. Results are written in
# tabular format (-outfmt 6), limited to one target and one HSP per query;
# anything blastp prints on stderr goes to a per-GO warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}SwissProt-GO:${GO}" \
-out "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab" \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}blast_warnings${GO}.txt"

# Preview the hit table.
head "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"

echo "Number of hits"

# One line per reported hit.
wc -l "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  479k    0  479k    0     0   285k      0 --:--:--  0:00:01 --:--:--  285k
100 2632k    0 2632k    0     0   969k      0 --:--:--  0:00:02 --:--:--  969k
100 2675k    0 2675k    0     0   984k      0 --:--:--  0:00:02 --:--:--  983k
>sp|A0A1D8PHA3|CYT1_CANAL Cytochrome b-c1 complex catalytic subunit, mitochondrial OS=Candida albicans (strain SC5314 / ATCC MYA-2876) OX=237561 GN=CYT1 PE=1 SV=1
MFRTAYKTMNQSMVQKFIAGGVGVTGLTASYLLYQDSMTADAMTAAEHGLHPPAYNWPHN
GMFETFDHASIRRGFQVYREVCAACHSLDRIAWRNLVGVSHTTSEAKAMAEELEYDDEPD
DEGKPRKRPGKLADYIPGPYENEQAARAANQGAYPPDLSLIVKARHGGSDYIFSLLTGYP
DEPPAGVVLPEGSNYNPYFPGGAIAMGRVLFDDLVEYEDGTPATTSQMAKDVSTFLNWAS
EPEHDDRKKWGLKALVVLSSLYLLSIWVKRFKWTPIKNRKFRFDPPKK
>sp|A0A1D8PJX3|RIP1_CANAL Cytochrome b-c1 complex subunit Rieske, mitochondrial OS=Candida albicans (strain SC5314 / ATCC MYA-2876) OX=237561 GN=RIP1 PE=1 SV=1
MSSLAFRTLRNGLGLKSSVRALSTTTTTLSNYQQPDYSSYLNNKSGQGSRNFTYFMVGSM
GLLSAAGAKSTVEAFLSSFAASADVLAMAKVEVKLGAIPEGKNVIIKWQGKPVFIRHRTA
DEIEEANQVDIKTLRDPQNDADRVKKPEWLIMLGICTHLGCVPIGEAGDFGGWFCPCHGS
Number of Proteins
6140


Building a new DB, current time: 03/21/2025 12:50:54
New DB name:   /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006119
New DB title:  ../output/27-Apul-pheno-annot/SwissProt-GO:0006119.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006119
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 6140 sequences in 0.103551 seconds.


FUN_000943-T1   sp|Q791V5|MTCH2_MOUSE   41.549  284 162 2   24  305 8   289 2.00e-80    243
FUN_000977-T1   sp|Q6XHB2|ROCO4_DICDI   30.035  283 163 10  589 848 1023    1293    6.93e-28    119
FUN_001181-T1   sp|O52683|HYDA_THEMA    29.902  408 237 17  115 515 184 549 5.65e-40    151
FUN_001463-T1   sp|Q6XHB2|ROCO4_DICDI   27.059  340 219 14  1552    1876    954 1279    1.37e-23    107
FUN_001483-T1   sp|Q6CFT7|ATPB_YARLI    25.301  415 280 10  45  448 52  447 3.17e-29    117
FUN_001648-T1   sp|P24539|AT5F1_HUMAN   39.691  194 108 3   15  207 62  247 2.52e-38    130
FUN_001648-T2   sp|P24539|AT5F1_HUMAN   40.314  191 105 3   47  236 62  244 2.48e-38    131
FUN_003901-T1   sp|P29410|KAD2_RAT  34.314  204 105 5   30  204 20  223 9.03e-29    105
FUN_004027-T1   sp|Q05752|NDUA7_BOVIN   53.947  76  34  1   1   75  1   76  6.67e-23    83.2
FUN_004095-T1   sp|Q6XHB2|ROCO4_DICDI   28.975  283 175 10  37  301 1015    1289    2.80e-21    96.3
Number of hits
130 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006119_out.tab

Canonical glycolysis (GO:0061621)

# Numeric part of the GO accession handled by this chunk (GO:0061621).
# OUT_DIR, evalue and fasta are read from the environment.
GO="0061621"

# Download all reviewed UniProtKB entries annotated with this GO term as FASTA.
# --fail makes curl exit non-zero on an HTTP error instead of writing the
# server's error body into the .fa file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Preview the download.
head "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Count lines containing ">", i.e. FASTA header lines.
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}SwissProt-GO:${GO}.fa" \
-dbtype prot \
-out "${OUT_DIR}SwissProt-GO:${GO}"


# Search the query proteins against that database. Results are written in
# tabular format (-outfmt 6), limited to one target and one HSP per query;
# anything blastp prints on stderr goes to a per-GO warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}SwissProt-GO:${GO}" \
-out "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab" \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}blast_warnings${GO}.txt"

# Preview the hit table.
head "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"

echo "Number of hits"

# One line per reported hit.
wc -l "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100  208k    0  208k    0     0   117k      0 --:--:--  0:00:01 --:--:--  117k
>sp|A1A4J1|PFKAL_BOVIN ATP-dependent 6-phosphofructokinase, liver type OS=Bos taurus OX=9913 GN=PFKL PE=2 SV=1
MASVDLEKLRTTGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGAKVFLIYEGYEGLVE
GGENIKQANWLSVSNIIQLGGTVIGSARCKAFTTREGRRAAAYNLVQRGITNLCVIGGDG
SLTGANIFRSEWGSLLEELVSEGKISEGTAQTYSHLNIAGLVGSIDNDFCGTDMTIGTDS
ALHRIMEVIDAITTTAQSHQRTFVLEVMGRHCGYLALVSALASGADWLFIPEAPPEDGWE
NFMCERLGETRSRGSRLNIIIIAEGAIDRNGKPISSRYVKDLVVQRLGFDTRVTVLGHVQ
RGGTPSAFDRILSSKMGMEAVMALLEATPDTPACVVSLSGNQSVRLPLMECVQMTKEVQK
AMDEKRFDEAIQLRGGSFENNWNIYKLLSHQKISKEKTNFSLAILNVGAPAAGMNAAVRS
AVRSGISQGHTVYVVHDGFEGLAKNQVQEVSWHDVAGWLGRGGSMLGTKRTLPKGFMEKI
VENIRLHNIHALLVIGGFEAYEGVLQLVEARGRYEELCIVMCVIPATISNNVPGTDFSLG
Number of Proteins
394


Building a new DB, current time: 03/21/2025 12:51:31
New DB name:   /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0061621
New DB title:  ../output/27-Apul-pheno-annot/SwissProt-GO:0061621.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0061621
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 394 sequences in 0.0159671 seconds.


FUN_005030-T1   sp|P85037|FOXK1_HUMAN   51.000  100 48  1   95  194 302 400 4.38e-26    103
FUN_005036-T1   sp|P42128|FOXK1_MOUSE   48.454  97  50  0   30  126 290 386 1.70e-27    105
FUN_005159-T1   sp|Q3UCQ1|FOXK2_MOUSE   52.041  98  45  1   95  190 247 344 7.57e-28    108
FUN_008233-T1   sp|Q7ZX03|FOXK2_XENLA   43.066  137 75  1   37  173 188 321 1.35e-33    125
FUN_009301-T1   sp|Q867C9|PFKAM_HORSE   58.634  776 300 4   70  833 13  779 0.0 935
FUN_010519-T1   sp|P00940|TPIS_CHICK    69.672  244 74  0   3   246 4   247 1.09e-126   353
FUN_012871-T1   sp|P00940|TPIS_CHICK    69.672  244 74  0   3   246 4   247 1.09e-126   353
FUN_013147-T1   sp|Q7ZX03|FOXK2_XENLA   46.939  98  50  1   143 238 215 312 3.77e-24    96.3
FUN_013305-T1   sp|P42128|FOXK1_MOUSE   42.202  109 61  1   47  153 287 395 2.42e-22    86.7
FUN_015434-T1   sp|P05064|ALDOA_MOUSE   72.269  357 97  1   5   359 8   364 0.0 528
Number of hits
37 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0061621_out.tab

Tricarboxylic acid cycle (GO:0006099)

# Numeric part of the GO accession handled by this chunk (GO:0006099).
# OUT_DIR, evalue and fasta are read from the environment.
GO="0006099"

# Download all reviewed UniProtKB entries annotated with this GO term as FASTA.
# --fail makes curl exit non-zero on an HTTP error instead of writing the
# server's error body into the .fa file.
curl --fail -H "Accept: text/plain" "https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28go%3A${GO}%29%29+AND+%28reviewed%3Atrue%29" -o "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Preview the download.
head "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Count lines containing ">", i.e. FASTA header lines.
echo "Number of Proteins"
grep -c ">" "${OUT_DIR}SwissProt-GO:${GO}.fa"

# Build a protein BLAST database from the downloaded sequences.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in "${OUT_DIR}SwissProt-GO:${GO}.fa" \
-dbtype prot \
-out "${OUT_DIR}SwissProt-GO:${GO}"


# Search the query proteins against that database. Results are written in
# tabular format (-outfmt 6), limited to one target and one HSP per query;
# anything blastp prints on stderr goes to a per-GO warnings file.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query "${fasta}" \
-db "${OUT_DIR}SwissProt-GO:${GO}" \
-out "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab" \
-evalue "${evalue}" \
-num_threads 42 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
2> "${OUT_DIR}blast_warnings${GO}.txt"

# Preview the hit table.
head "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"

echo "Number of hits"

# One line per reported hit.
wc -l "${OUT_DIR}Apul_blastp-GO:${GO}_out.tab"
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 31776    0 31776    0     0  26928      0 --:--:--  0:00:01 --:--:-- 26906
100 1726k    0 1726k    0     0   889k      0 --:--:--  0:00:01 --:--:--  889k
>sp|A0A096P8D3|IDH_OSTTA Isocitrate dehydrogenase (NAD(+)), mitochondrial OS=Ostreococcus tauri OX=70448 GN=IDH PE=1 SV=1
MTRVERGRVLARAIERAVAHRASARRWTTTTRTPAWMVTGWMGGRGVDRSTAMTRFERCG
STASSKITAAPMVYVRGEEMTAYVMDLIRSRWIEPRVDVGGWETFDLRAKNRDDTEDRVL
RDVIEAGKRIKAIFKEPTVTPTADQVKRLGLRKSWGSPNGAMRRGWNGITISRDTIHIDG
VELGYKKPVLFERHAVGGEYSAGYKNVGKGKLTTTFTPSEGPDAGKTVVVDEREIVDEEA
AVVTYHNPYDNVHDLARFFFGRCLEAKVTPYVVTKKTVFKWQEPFWQIMRTVFDEEFKAQ
FVAAGVMKEGEELVHLLSDAATMKLVQWRQGGFGMAAHNYDGDVLTDELAQVHKSPGFIT
SNLVGVHEDGTLIKEFEASHGTVADMDEARLRGEETSLNPLGMVEGLIGAMNHAADVHNI
DRDRTHAFTTKMRTVIHQLFREGKGTRDLCGPSGLTTEQFIDAVAERLDA
>sp|A0A3Q0KQY7|FUMC_SCHMA Fumarate hydratase OS=Schistosoma mansoni OX=6183 GN=Smp_158240 PE=1 SV=2
Number of Proteins
2837


Building a new DB, current time: 03/21/2025 12:51:40
New DB name:   /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006099
New DB title:  ../output/27-Apul-pheno-annot/SwissProt-GO:0006099.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/shared/8TB_HDD_03/sr320/github/deep-dive-expression/D-Apul/output/27-Apul-pheno-annot/SwissProt-GO:0006099
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 2837 sequences in 0.0586021 seconds.


FUN_000960-T1   sp|P21399|ACOHC_HUMAN   70.588  867 255 0   24  890 21  887 0.0 1312
FUN_001356-T1   sp|A0QMB9|GABD1_MYCA1   30.787  445 291 4   79  508 7   449 5.73e-62    214
FUN_003898-T1   sp|Q9V0D5|MDH_PYRAB 41.694  307 173 3   6   310 7   309 2.54e-71    224
FUN_003899-T1   sp|Q9V0D5|MDH_PYRAB 37.110  353 215 5   28  377 9   357 2.79e-73    229
FUN_004736-T1   sp|Q9VWH4|IDH3A_DROME   72.892  332 89  1   40  371 47  377 0.0 514
FUN_004752-T1   sp|A0QMB9|GABD1_MYCA1   34.848  462 284 6   426 879 3   455 5.01e-74    248
FUN_006329-T1   sp|P09624|DLDH_YEAST    56.557  488 199 7   30  508 15  498 0.0 540
FUN_006649-T1   sp|A4SFT4|MDH_CHLPM 36.093  302 182 4   24  322 2   295 4.65e-64    202
FUN_006864-T1   sp|Q73TP5|GABD1_MYCPA   35.141  461 285 7   65  519 5   457 6.75e-76    244
FUN_006864-T2   sp|Q73TP5|GABD1_MYCPA   35.141  461 285 7   65  519 5   457 2.18e-76    245
Number of hits
82 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006099_out.tab

Summary of BLAST hits

# Line count for every per-GO BLAST hit table, plus a grand total. The glob
# names the hit tables explicitly so that any other file in OUT_DIR whose name
# happens to end in "tab" is not counted.
wc -l "${OUT_DIR}"Apul_blastp-GO:*_out.tab
   82 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006099_out.tab
  130 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0006119_out.tab
  616 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0009060_out.tab
   37 ../output/27-Apul-pheno-annot/Apul_blastp-GO:0061621_out.tab
  865 total