Git that Pro GO

Deep dive Protein Annotation
blast
e5
Author
Affiliation

Steven Roberts

Published

August 25, 2024

tldr:

Got GO annotations for Deep Dive Protein sets:

https://raw.githubusercontent.com/urol-e5/deep-dive/main/D-Apul/output/20-Apul-gene-annotation/Apul-protein-GO.tsv

https://raw.githubusercontent.com/urol-e5/deep-dive/main/E-Peve/output/20-Peve-gene-annotation/Peve-protein-GO.tsv

https://github.com/urol-e5/deep-dive/raw/main/F-Pmea/output/20-Pmea-gene-annotation/Pmea-protein-GO.tsv


Here is example code

20-Apul Annotation

Steven Roberts 25 August, 2024

Apul

Protein

cd ../data

curl -O https://gannet.fish.washington.edu/seashell/snaps/GCF_013753865.1_Amil_v2.1.protein.faa
head ../data/GCF_013753865.1_Amil_v2.1.protein.faa
ls ../../data/blast_dbs

Swiss Prot download

cd ../../data/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_04.fasta.gz
gunzip -k uniprot_sprot_r2024_04.fasta.gz
head ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
echo "Number of Sequences"
grep -c ">" ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta \
-dbtype prot \
-out ../../data/blast_dbs/uniprot_sprot_r2024_04

Blastp

fasta="../data/GCF_013753865.1_Amil_v2.1.protein.faa"

/home/shared/ncbi-blast-2.15.0+/bin/blastp \
-query $fasta \
-db ../../data/blast_dbs/uniprot_sprot_r2024_04 \
-out ../output/20-Apul-gene-annotation/blastp_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6
wc -l ../output/20-Apul-gene-annotation/blastp_out.tab
tr '|' '\t' < ../output/20-Apul-gene-annotation/blastp_out.tab \
> ../output/20-Apul-gene-annotation/blastp_out_sep.tab

head -1 ../output/20-Apul-gene-annotation/blastp_out_sep.tab

Download GO info

sr320@raven:/home/shared/8TB_HDD_01/sr320/github/deep-dive/data$ curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_082324.tsv
wc -l ../../data/SwissProt-Annot-GO_082324.tsv 

Join blast with GO info

bltabl <- read.csv("../output/20-Apul-gene-annotation/blastp_out_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("../../data/SwissProt-Annot-GO_082324.tsv", sep = '\t', header = TRUE)
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs
  )
head(annot_tab)
write.table(annot_tab, 
            file = "../output/20-Apul-gene-annotation/Apul-protein-GO.tsv", 
            sep = "\t", 
            row.names = FALSE, 
            quote = FALSE)
head ../output/20-Apul-gene-annotation/Apul-protein-GO.tsv

Actual code at

https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd

https://github.com/urol-e5/deep-dive/blob/main/E-Peve/code/20-Peve-gene-annotation.Rmd

https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd