Pulling out calcification / biomin gene counts.
TLDR
- https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/apul_biomin_counts.csv
- https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/peve_biomin_counts.csv
- https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/ptua_biomin_counts.csv
Have list of biomineralization genes from coral literature (e.g., Takeuchi et al. 2016, Ramos-Silva et al. 2013)
makeblastdb \
-in ../output/31-biomin-pathway/Stylo-biomin.fasta \
-dbtype prot \
-out ../output/31-biomin-pathway/Stylo-biomin-dbblastp \
-query ../output/11.3-ortholog-annotation/representative_sequences.faa \
-db ../output/31-biomin-pathway/Stylo-biomin-db \
-out ../output/31-biomin-pathway/OG-repseq_Stylo-biomin-results.tsv \
-outfmt "6 qseqid sseqid evalue bitscore" \
-evalue 1e-20 \
-max_target_seqs 1 \
-max_hsps 1 \
-num_threads 10library(dplyr)
library(readr)
# Paths
blast_file <- "../output/31-biomin-pathway/OG-repseq_Stylo-biomin-results.tsv"
mapping_file <- "../output/11.3-ortholog-annotation/gene_to_ortholog_mapping.csv"
# Read BLAST results
blast <- read_tsv(
blast_file,
col_names = c("gene_id", "hit", "evalue", "score")
)
# Read ortholog mapping
mapping <- read_csv(mapping_file)
# Join to add group_id to each gene in blast results
annotated <- blast %>%
left_join(mapping %>% select(gene_id, group_id), by = "gene_id")
# Write out CSV
write_csv(annotated, "../output/31-biomin-pathway/OG-repseq_Stylo-biomin.csv")
# View result
annotatedgene_id,hit,evalue,score,group_id
FUN_000236-T1,XP_022780303.1,6.1399999999999995e-24,102,OG_00030
FUN_000857-T1,XP_022795459.1,1.3899999999999998e-159,447,OG_00171
FUN_000867-T1,XP_022780694.1,1.6e-34,125,OG_00176
FUN_001009-T1,XP_022789591.1,7.039999999999999e-107,338,OG_00246
FUN_001088-T1,XP_022783044.1,3.9e-35,130,OG_00297
FUN_001089-T1,XP_022783044.1,2.1699999999999998e-23,93.2,OG_00298
FUN_001101-T1,XP_022783044.1,1.22e-31,118,OG_00302
FUN_001107-T1,XP_022795459.1,9.45e-59,190,OG_00305
FUN_001118-T1,XP_022801359.1,1.33e-35,124,OG_00314
library(tidyverse)
ogs <- read_tsv("../output/33-biomin-pathway-counts/biomin-OG.txt",
col_names = "group_id")
map <- read_csv("../output/11.3-ortholog-annotation/gene_to_ortholog_mapping.csv")
filtered <- map %>%
semi_join(ogs, by = "group_id") %>%
mutate(gene_id = str_replace(gene_id, "-T\\d+$", "")) # <<<<<< KEY FIXlibrary(tidyverse)
base_url <- "https://raw.githubusercontent.com/urol-e5/MOSAiC/refs/heads/main/data/orthologs"
apul <- read_csv(file.path(base_url, "apul-gene_count_matrix.csv"), show_col_types = FALSE)
peve <- read_csv(file.path(base_url, "peve-gene_count_matrix.csv"), show_col_types = FALSE)
ptua <- read_csv(file.path(base_url, "ptua-gene_count_matrix.csv"), show_col_types = FALSE)library(tidyverse)
# ensure gene_id format matches count matrices
filtered <- filtered %>%
mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))
# filter apul & add group_id as first column
apul_filt <- apul %>%
inner_join(
filtered %>% filter(species == "Apul") %>% select(gene_id, group_id),
by = "gene_id"
) %>%
relocate(group_id, .before = gene_id) # <<<<<< KEY LINEter(species == "Apul")
# write to CSV
write_csv(
apul_filt,
"../output/33-biomin-pathway-counts/apul_biomin_counts.csv")library(tidyverse)
# ensure gene_id in mapping file matches (remove transcript suffix)
filtered <- filtered %>%
mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))
# filter Peve matrix, normalize gene_id, join, move group_id to column 1
peve_filt <- peve %>%
mutate(gene_id = str_replace(gene_id, "^gene-", "")) %>% # <<<<< FIX HERE
inner_join(
filtered %>% filter(species == "Peve") %>% select(gene_id, group_id),
by = "gene_id"
) %>%
relocate(group_id, .before = gene_id)
# write to CSV
write_csv(
peve_filt,
"../output/33-biomin-pathway-counts/peve_biomin_counts.csv"
)library(tidyverse)
# ensure gene_id in mapping file matches count matrix format
filtered <- filtered %>%
mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))
# filter Ptua matrix, normalize gene_id, join, move group_id to column 1
ptua_filt <- ptua %>%
mutate(gene_id = str_replace(gene_id, "^gene-", "")) %>% # <<<<< KEY FIX
inner_join(
filtered %>%
filter(species == "Ptua") %>%
select(gene_id, group_id),
by = "gene_id"
) %>%
relocate(group_id, .before = gene_id)
# write to CSV
write_csv(
ptua_filt,
"../output/33-biomin-pathway-counts/ptua_biomin_counts.csv"
)