tldr:
Got GO annotations for Deep Dive Protein sets:
https://raw.githubusercontent.com/urol-e5/deep-dive/main/D-Apul/output/20-Apul-gene-annotation/Apul-protein-GO.tsv
https://raw.githubusercontent.com/urol-e5/deep-dive/main/E-Peve/output/20-Peve-gene-annotation/Peve-protein-GO.tsv
https://github.com/urol-e5/deep-dive/raw/main/F-Pmea/output/20-Pmea-gene-annotation/Pmea-protein-GO.tsv
Here is example code
20-Apul Annotation
Steven Roberts 25 August, 2024
Apul
Protein
cd ../data
curl -O https://gannet.fish.washington.edu/seashell/snaps/GCF_013753865.1_Amil_v2.1.protein.faa
head ../data/GCF_013753865.1_Amil_v2.1.protein.faa
ls ../../data/blast_dbs
Swiss Prot download
cd ../../data/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_04.fasta.gz
gunzip -k uniprot_sprot_r2024_04.fasta.gz
head ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
echo "Number of Sequences"
grep -c ">" ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
\
-in ../../data/blast_dbs/uniprot_sprot_r2024_04.fasta \
-dbtype prot -out ../../data/blast_dbs/uniprot_sprot_r2024_04
Blastp
fasta="../data/GCF_013753865.1_Amil_v2.1.protein.faa"
/home/shared/ncbi-blast-2.15.0+/bin/blastp \
$fasta \
-query \
-db ../../data/blast_dbs/uniprot_sprot_r2024_04 \
-out ../output/20-Apul-gene-annotation/blastp_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 -outfmt 6
wc -l ../output/20-Apul-gene-annotation/blastp_out.tab
tr '|' '\t' < ../output/20-Apul-gene-annotation/blastp_out.tab \
> ../output/20-Apul-gene-annotation/blastp_out_sep.tab
head -1 ../output/20-Apul-gene-annotation/blastp_out_sep.tab
Download GO info
sr320@raven:/home/shared/8TB_HDD_01/sr320/github/deep-dive/data$ curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_082324.tsv
wc -l ../../data/SwissProt-Annot-GO_082324.tsv
Join blast with GO info
<- read.csv("../output/20-Apul-gene-annotation/blastp_out_sep.tab", sep = '\t', header = FALSE)
bltabl
<- read.csv("../../data/SwissProt-Annot-GO_082324.tsv", sep = '\t', header = TRUE) spgo
<- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
annot_tab select(
query = V1,
blast_hit = V3,
evalue = V13,
ProteinNames = Protein.names,
BiologicalProcess = Gene.Ontology..biological.process.,
GeneOntologyIDs = Gene.Ontology.IDs
)
head(annot_tab)
write.table(annot_tab,
file = "../output/20-Apul-gene-annotation/Apul-protein-GO.tsv",
sep = "\t",
row.names = FALSE,
quote = FALSE)
head ../output/20-Apul-gene-annotation/Apul-protein-GO.tsv
Actual code at
https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd
https://github.com/urol-e5/deep-dive/blob/main/E-Peve/code/20-Peve-gene-annotation.Rmd
https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd