More Sparsity?

Optimizing Rank 35
e5
barnacle
coral
Author
Affiliation

Steven Roberts

Published

November 5, 2025

Based on overlap of genes across components wanted to increase sparsity be increasing lambda-gene values..

Ran with multiple values

LAMBDAS_GENE="0.00 0.01 0.05 0.1 0.2 0.4 0.8"
LAMBDA_SAMPLE=0.1
LAMBDA_TIME=0.05
RANK=35


OUTDIR_BASE=../output/26-rank35-optimization

for LG in $LAMBDAS_GENE; do
  OUTDIR=${OUTDIR_BASE}/lambda_gene_${LG}
  mkdir -p "$OUTDIR"

  /Users/sr320/.local/bin/uv run python ../scripts/14.1-barnacle/build_tensor_and_run.py \
    --input-file ../output/14-pca-orthologs/vst_counts_matrix.csv \
    --output-dir "$OUTDIR" \
    --rank $RANK \
    --lambda-gene $LG \
    --lambda-sample $LAMBDA_SAMPLE \
    --lambda-time $LAMBDA_TIME \
    --max-iter 2000 \
    --tol 1e-5 \
    --seed 81
done

Move forward with lamda_gene of 0.2…

Sample

Time

library(tidyverse)

# File path
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# Read data
df <- read_csv(file_path)

# Compute variance of each component across timepoints
var_components <- df %>%
  pivot_longer(cols = starts_with("Component_"),
               names_to = "Component",
               values_to = "Value") %>%
  group_by(Component) %>%
  summarise(Variance = var(Value, na.rm = TRUE)) %>%
  arrange(desc(Variance))

# Select top 10 most variable components
top10_components <- var_components %>% slice_max(Variance, n = 10) %>% pull(Component)

# Prepare data for plotting
df_long <- df %>%
  pivot_longer(cols = starts_with("Component_"),
               names_to = "Component",
               values_to = "Value") %>%
  filter(Component %in% top10_components)

# Line plot
ggplot(df_long, aes(x = OG_ID, y = Value, group = Component, color = Component)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Most Variable Components Across Timepoints",
    x = "Time Point",
    y = "Component Value"
  )

Pick your component

library(tidyverse)

# File path
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# ---- USER INPUT ----
# Pick the component you want to plot (e.g., "Component_5")
selected_component <- "Component_24"
# ---------------------

# Read data
df <- read_csv(file_path)

# Convert to long format
df_long <- df %>%
  pivot_longer(
    cols = starts_with("Component_"),
    names_to = "Component",
    values_to = "Value"
  )

# Check that the chosen component exists
if (!(selected_component %in% unique(df_long$Component))) {
  stop(paste("Component", selected_component, "not found in data!"))
}

# Filter for the selected component
df_sel <- df_long %>% filter(Component == selected_component)

# Plot single component across timepoints
ggplot(df_sel, aes(x = OG_ID, y = Value, group = 1)) +
  geom_line(linewidth = 1.2, color = "steelblue") +
  geom_point(size = 3, color = "firebrick") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = paste0(selected_component, " Across Timepoints"),
    x = "Time Point",
    y = "Loading Value"
  )

Gene Factor Loading

Annotation

An Annotation - U pick!

library(tidyverse)
library(pheatmap)

# ---- File path ----
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/goslim_term_counts.csv"  # change to your actual CSV path

# ---- User input ----
# Enter one or more GO terms (exact or partial match)
selected_terms <- c("protein catabolic process", "lipid", "immune system process", "carbohydrate", "antioxidant activity", "mitochondrion", "inflammatory response", "reproductive")  # example

# ---- Options ----
row_normalize <- TRUE   # TRUE for row-normalized (z-score), FALSE for raw
log_transform <- TRUE   # TRUE to apply log10(x+1) before plotting

# ---- Load and prepare data ----
df <- read_csv(file_path)

# Remove total column if present
df <- df %>% select(-any_of("total"))

# Filter by GO term (partial match supported)
df_sel <- df %>%
  filter(if_any(term, ~ str_detect(.x, paste(selected_terms, collapse = "|"))))

if (nrow(df_sel) == 0) {
  stop("⚠️ No matching GO terms found. Check spelling or capitalization.")
}

# Convert to matrix
mat <- df_sel %>%
  column_to_rownames(var = "term") %>%
  as.matrix()

# Optional log-transform
if (log_transform) {
  mat <- log10(mat + 1)
}

# Optional row normalization
if (row_normalize) {
  mat <- t(scale(t(mat)))  # z-score per row
}

# ---- Plot heatmap ----
pheatmap(
  mat,
  scale = "none",
  color = colorRampPalette(c("navy", "white", "firebrick3"))(200),
  clustering_method = "ward.D2",
  show_rownames = TRUE,
  main = paste(
    "GO Term Heatmap —",
    if (row_normalize) "Row-normalized" else "Raw values",
    if (log_transform) "(log10)" else ""
  )
)