Pipeline for Reproduction Annotation
gene ontology

Steven Roberts


June 15, 2024

Using the Swiss-Prot reproduction ("Repro") subset.

Based on the following GO term: GO:0022414 (reproductive process)


Make BlastDB

(Would only be needed once)

# Fetch the Swiss-Prot FASTA subset for GO:0022414 and save it under ../data/.
# NOTE(review): the request URL is empty ("") as rendered here; it was
# presumably dropped during rendering — restore the query URL before re-running.
curl -H "Accept: text/plain" "" -o ../data/SwissProt-GO:0022414.fa
# Preview the first lines of the downloaded file.
head ../data/SwissProt-GO:0022414.fa
echo "Number of Sequences"
# Every FASTA record header starts with ">", so counting those lines counts sequences.
grep -c ">" ../data/SwissProt-GO:0022414.fa 
>sp|A0A060A682|HAP2_TETTH Hapless 2 OS=Tetrahymena thermophila OX=5911 GN=HAP2 PE=1 SV=1
Number of Sequences
# Build a protein BLAST database from the Swiss-Prot subset FASTA.
#   -in     : FASTA file to index
#   -dbtype : prot, i.e. the input sequences are proteins
#   -out    : database path prefix; the blastp step passes the same prefix to -db
# NOTE(review): this step calls BLAST+ 2.11.0 while the blastp search uses
# 2.15.0 — confirm the version mix is intentional.
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/SwissProt-GO:0022414.fa \
-dbtype prot \
-out ../blastdb/SwissProt-GO:0022414

Set Query

Set the path to the query FASTA file as the shell variable `fasta`.


# Sanity-check the query FASTA: preview its first lines and count its sequences.
# NOTE(review): $fasta must already be set; its assignment is not visible here.
head $fasta
echo "Number of Sequences"
# Count header lines (those containing ">"), i.e. the number of query sequences.
grep ">" -c $fasta
Number of Sequences



# Search the query proteins against the GO:0022414 Swiss-Prot BLAST database.
#   -evalue 1E-20      : report only hits with an E-value of 1e-20 or better
#   -num_threads 48    : number of CPU threads to use
#   -max_target_seqs 1 : keep a single subject sequence per query
#   -max_hsps 1        : keep a single alignment (HSP) per query-subject pair
#   -outfmt 6          : tab-separated tabular output
# NOTE(review): -out (and the head command after it) end at a directory as
# rendered; the output file name appears to have been dropped — confirm.
# NOTE(review): check the BLAST+ manual on -max_target_seqs 1 before treating
# the single reported hit as the best possible match.
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db ../blastdb/SwissProt-GO:0022414 \
-out ../output/04-repro-annot/ \
-evalue 1E-20 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
# Preview the first hits.
head ../output/04-repro-annot/
ANN15859-RA sp|P07207|NOTCH_DROME   27.146  1072    523 60  96  1012    162 1130    5.83e-40    162
ANN15865-RA sp|Q9VCA8|ANKHM_DROME   27.049  366 193 9   907 1269    2316    2610    1.97e-24    110
ANN15898-RA sp|Q9Z0E8|S22A5_MOUSE   27.604  576 327 13  236 784 12  524 4.19e-57    204
ANN15903-RA sp|Q9Z0E8|S22A5_MOUSE   29.242  554 337 11  16  545 2   524 1.16e-63    217
ANN15914-RA sp|P49891|ST1E1_MOUSE   31.250  256 153 8   28  269 49  295 2.65e-26    103
ANN04321-RA sp|O15455|TLR3_HUMAN    22.802  671 421 18  29  655 238 855 1.93e-26    114
ANN04322-RA sp|P15620|ZN271_MOUSE   24.554  505 321 18  1022    1523    80  527 2.78e-26    114
ANN04330-RA sp|O88572|LRP6_MOUSE    27.494  902 546 38  66  930 65  895 9.40e-60    226
ANN04334-RA sp|P20241|NRG_DROME 25.049  507 314 23  299 765 26  506 2.24e-24    109
ANN04339-RA sp|Q969R2|OSBP2_HUMAN   50.591  761 322 13  11  757 184 904 0.0 749
# Count the lines in the BLAST result (one line per reported hit in tabular output).
wc -l ../output/04-repro-annot/
4641 ../output/04-repro-annot/
# Replace every "|" with a tab so the subject ID (sp|ACCESSION|NAME) splits into
# separate columns; the accession then lands in column 3.
# NOTE(review): input and output paths are identical as rendered. Redirecting
# onto the input file would truncate it before tr reads it; the file names were
# presumably dropped during rendering — confirm that they differ.
tr '|' '\t' < ../output/04-repro-annot/ \
> ../output/04-repro-annot/

# Show the first reformatted line.
head -1 ../output/04-repro-annot/
ANN15859-RA sp  P07207  NOTCH_DROME 27.146  1072    523 60  96  1012    162 1130    5.83e-40    162

Download Swiss-Prot Information

# Download the tab-separated annotation table for the same Swiss-Prot subset.
# NOTE(review): the request URL is empty ("") as rendered here; it was
# presumably dropped during rendering — restore the query URL before re-running.
curl -H "Accept: text/plain; format=tsv" "" -o ../data/SwissProt-GO:0022414.tsv

Join blast with GO info

# Load the tab-delimited BLAST hits. The file has no header row, so the
# columns receive the default names V1, V2, ...
bltabl <- read.delim("../output/04-repro-annot/", header = FALSE)

# Load the Swiss-Prot annotation table; its header row supplies column names.
spgo <- read.delim("../data/SwissProt-GO:0022414.tsv", header = TRUE)

# Attach the Swiss-Prot annotation to every BLAST hit by matching the
# accession (V3) to `Entry`, then keep the query ID, accession, E-value (V13)
# and the descriptive annotation columns.
annot_tab <- bltabl %>%
  left_join(spgo, by = c("V3" = "Entry")) %>%
  select(
    V1, V3, V13,
    Protein.names, Organism,
    Gene.Ontology..biological.process., Gene.Ontology.IDs
  )
           V1     V3      V13
1 ANN15859-RA P07207 5.83e-40
2 ANN15865-RA Q9VCA8 1.97e-24
3 ANN15898-RA Q9Z0E8 4.19e-57
4 ANN15903-RA Q9Z0E8 1.16e-63
5 ANN15914-RA P49891 2.65e-26
6 ANN04321-RA O15455 1.93e-26
1                                             Neurogenic locus Notch protein [Cleaved into: Processed neurogenic locus Notch protein]
2                  Ankyrin repeat and KH domain-containing protein mask (Multiple ankyrin repeat single KH domain-containing protein)
3 Organic cation/carnitine transporter 2 (High-affinity sodium-dependent carnitine cotransporter) (Solute carrier family 22 member 5)
4 Organic cation/carnitine transporter 2 (High-affinity sodium-dependent carnitine cotransporter) (Solute carrier family 22 member 5)
5       Sulfotransferase 1E1 (ST1E1) (EC (Estrogen sulfotransferase, testis isoform) (Sulfotransferase, estrogen-preferring)
6                                                                                             Toll-like receptor 3 (CD antigen CD283)
1 Drosophila melanogaster (Fruit fly)
2 Drosophila melanogaster (Fruit fly)
3                Mus musculus (Mouse)
4                Mus musculus (Mouse)
5                Mus musculus (Mouse)
6                Homo sapiens (Human)
