Calcification Count Matrices

filtering timeseries
e5
coral
Author
Affiliation

Steven Roberts

Published

November 26, 2025

Pulling out calcification / biomin gene counts.

TLDR
  • https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/apul_biomin_counts.csv
  • https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/peve_biomin_counts.csv
  • https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/ptua_biomin_counts.csv

Have list of biomineralization genes from coral literature (e.g., Takeuchi et al. 2016, Ramos-Silva et al. 2013)

makeblastdb \
-in ../output/31-biomin-pathway/Stylo-biomin.fasta \
-dbtype prot \
-out ../output/31-biomin-pathway/Stylo-biomin-db
blastp \
  -query ../output/11.3-ortholog-annotation/representative_sequences.faa \
  -db ../output/31-biomin-pathway/Stylo-biomin-db \
  -out ../output/31-biomin-pathway/OG-repseq_Stylo-biomin-results.tsv \
  -outfmt "6 qseqid sseqid evalue bitscore" \
  -evalue 1e-20 \
  -max_target_seqs 1 \
  -max_hsps 1 \
  -num_threads 10
library(dplyr)
library(readr)

# Paths
blast_file <- "../output/31-biomin-pathway/OG-repseq_Stylo-biomin-results.tsv"
mapping_file <- "../output/11.3-ortholog-annotation/gene_to_ortholog_mapping.csv"

# Read BLAST results
blast <- read_tsv(
  blast_file,
  col_names = c("gene_id", "hit", "evalue", "score")
)

# Read ortholog mapping
mapping <- read_csv(mapping_file)

# Join to add group_id to each gene in blast results
annotated <- blast %>%
  left_join(mapping %>% select(gene_id, group_id), by = "gene_id")


# Write out CSV
write_csv(annotated, "../output/31-biomin-pathway/OG-repseq_Stylo-biomin.csv")


# View result
annotated
gene_id,hit,evalue,score,group_id
FUN_000236-T1,XP_022780303.1,6.1399999999999995e-24,102,OG_00030
FUN_000857-T1,XP_022795459.1,1.3899999999999998e-159,447,OG_00171
FUN_000867-T1,XP_022780694.1,1.6e-34,125,OG_00176
FUN_001009-T1,XP_022789591.1,7.039999999999999e-107,338,OG_00246
FUN_001088-T1,XP_022783044.1,3.9e-35,130,OG_00297
FUN_001089-T1,XP_022783044.1,2.1699999999999998e-23,93.2,OG_00298
FUN_001101-T1,XP_022783044.1,1.22e-31,118,OG_00302
FUN_001107-T1,XP_022795459.1,9.45e-59,190,OG_00305
FUN_001118-T1,XP_022801359.1,1.33e-35,124,OG_00314
library(tidyverse)

ogs <- read_tsv("../output/33-biomin-pathway-counts/biomin-OG.txt",
                col_names = "group_id")

map <- read_csv("../output/11.3-ortholog-annotation/gene_to_ortholog_mapping.csv")

filtered <- map %>%
  semi_join(ogs, by = "group_id") %>%
  mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))  # <<<<<< KEY FIX
library(tidyverse)

base_url <- "https://raw.githubusercontent.com/urol-e5/MOSAiC/refs/heads/main/data/orthologs"

apul <- read_csv(file.path(base_url, "apul-gene_count_matrix.csv"), show_col_types = FALSE)
peve <- read_csv(file.path(base_url, "peve-gene_count_matrix.csv"), show_col_types = FALSE)
ptua <- read_csv(file.path(base_url, "ptua-gene_count_matrix.csv"), show_col_types = FALSE)
library(tidyverse)

# ensure gene_id format matches count matrices
filtered <- filtered %>%
  mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))

# filter apul & add group_id as first column
apul_filt <- apul %>%
  inner_join(
    filtered %>% filter(species == "Apul") %>% select(gene_id, group_id),
    by = "gene_id"
  ) %>%
  relocate(group_id, .before = gene_id)   # <<<<<< KEY LINEter(species == "Apul") 


# write to CSV
write_csv(
  apul_filt,
  "../output/33-biomin-pathway-counts/apul_biomin_counts.csv")
library(tidyverse)

# ensure gene_id in mapping file matches (remove transcript suffix)
filtered <- filtered %>%
  mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))

# filter Peve matrix, normalize gene_id, join, move group_id to column 1
peve_filt <- peve %>%
  mutate(gene_id = str_replace(gene_id, "^gene-", "")) %>%   # <<<<< FIX HERE
  inner_join(
    filtered %>% filter(species == "Peve") %>% select(gene_id, group_id),
    by = "gene_id"
  ) %>%
  relocate(group_id, .before = gene_id)

# write to CSV
write_csv(
  peve_filt,
  "../output/33-biomin-pathway-counts/peve_biomin_counts.csv"
)
library(tidyverse)

# ensure gene_id in mapping file matches count matrix format
filtered <- filtered %>%
  mutate(gene_id = str_replace(gene_id, "-T\\d+$", ""))

# filter Ptua matrix, normalize gene_id, join, move group_id to column 1
ptua_filt <- ptua %>%
  mutate(gene_id = str_replace(gene_id, "^gene-", "")) %>%   # <<<<< KEY FIX
  inner_join(
    filtered %>%
      filter(species == "Ptua") %>%
      select(gene_id, group_id),
    by = "gene_id"
  ) %>%
  relocate(group_id, .before = gene_id)


# write to CSV
write_csv(
  ptua_filt,
  "../output/33-biomin-pathway-counts/ptua_biomin_counts.csv"
)