Based on the overlap of genes across components, I wanted to increase sparsity by increasing the lambda-gene values.

Ran with multiple values

# Sweep of --lambda-gene values for the rank-35 run; --lambda-sample and
# --lambda-time are held fixed. Each value gets its own output directory
# under OUTDIR_BASE.
LAMBDAS_GENE="0.00 0.01 0.05 0.1 0.2 0.4 0.8"
LAMBDA_SAMPLE=0.1
LAMBDA_TIME=0.05
RANK=35

# Path to the uv launcher. Defaults to the original location; set UV_BIN in
# the environment to run the sweep on a machine where uv lives elsewhere.
UV_BIN="${UV_BIN:-/Users/sr320/.local/bin/uv}"

OUTDIR_BASE=../output/26-rank35-optimization

# LAMBDAS_GENE is deliberately left unquoted so that it word-splits into one
# value per loop iteration.
for LG in $LAMBDAS_GENE; do
  OUTDIR="${OUTDIR_BASE}/lambda_gene_${LG}"
  mkdir -p "$OUTDIR"

  "$UV_BIN" run python ../scripts/14.1-barnacle/build_tensor_and_run.py \
    --input-file ../output/14-pca-orthologs/vst_counts_matrix.csv \
    --output-dir "$OUTDIR" \
    --rank "$RANK" \
    --lambda-gene "$LG" \
    --lambda-sample "$LAMBDA_SAMPLE" \
    --lambda-time "$LAMBDA_TIME" \
    --max-iter 2000 \
    --tol 1e-5 \
    --seed 81
done

Move forward with lambda_gene of 0.2…

Sample

Time

library(tidyverse)

# Plot the ten components whose values vary most across the rows of the
# time-factor table from the lambda_gene = 0.2 run.

# File path
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# Read data
df <- read_csv(file_path)

# Reshape once to long format (one row per table row x component) and reuse
# it for both the variance ranking and the plot.
df_all_long <- df %>%
  pivot_longer(
    cols = starts_with("Component_"),
    names_to = "Component",
    values_to = "Value"
  )

# Variance of each component across timepoints, most variable first
var_components <- df_all_long %>%
  group_by(Component) %>%
  summarise(Variance = var(Value, na.rm = TRUE)) %>%
  arrange(desc(Variance))

# Select the top 10 most variable components.
# with_ties = FALSE guarantees at most 10 names; the default (TRUE) returns
# extra rows whenever several components tie at the cut-off.
top10_components <- var_components %>%
  slice_max(Variance, n = 10, with_ties = FALSE) %>%
  pull(Component)

# Keep only the selected components for plotting
df_long <- df_all_long %>%
  filter(Component %in% top10_components)

# Line plot: one line per component.
# NOTE(review): the x aesthetic uses the column named OG_ID although the axis
# is labelled "Time Point" -- presumably the first column of
# time_factors.csv holds the time-point labels under that name; confirm.
ggplot(df_long, aes(x = OG_ID, y = Value, group = Component, color = Component)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Most Variable Components Across Timepoints",
    x = "Time Point",
    y = "Component Value"
  )

Pick your component

library(tidyverse)

# ---- USER INPUT ----
# Name of the component column to draw (e.g., "Component_5")
selected_component <- "Component_24"
# ---------------------

# Location of the time-factor table
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# Load the table
df <- read_csv(file_path)

# Stack every Component_* column into Component / Value pairs
df_long <- pivot_longer(
  df,
  cols = starts_with("Component_"),
  names_to = "Component",
  values_to = "Value"
)

# Abort early when the requested component is not among the stacked columns
component_missing <- !(selected_component %in% df_long$Component)
if (component_missing) {
  stop(paste("Component", selected_component, "not found in data!"))
}

# Rows belonging to the requested component only
df_sel <- filter(df_long, Component == selected_component)

# Single line (group = 1) with a point at every x value
ggplot(df_sel, aes(x = OG_ID, y = Value, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1.2) +
  geom_point(color = "firebrick", size = 3) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = paste0(selected_component, " Across Timepoints"),
    x = "Time Point",
    y = "Loading Value"
  )

Gene Factor Loading

Annotation

An Annotation - U pick!

library(tidyverse)
library(pheatmap)

# Heatmap of GO-slim term counts for user-selected terms. Rows are the
# matching terms; columns are the remaining columns of the counts table.

# ---- File path ----
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/goslim_term_counts.csv"  # change to your actual CSV path

# ---- User input ----
# Enter one or more GO terms (exact or partial match).
# Terms are joined into a single regular expression, so matching is
# case-sensitive and regex metacharacters in a term are interpreted as such.
selected_terms <- c("protein catabolic process", "lipid", "immune system process", "carbohydrate", "antioxidant activity", "mitochondrion", "inflammatory response", "reproductive")  # example

# ---- Options ----
row_normalize <- TRUE   # TRUE for row-normalized (z-score), FALSE for raw
log_transform <- TRUE   # TRUE to apply log10(x+1) before plotting

# ---- Load and prepare data ----
df <- read_csv(file_path)

# Remove total column if present
df <- df %>% select(-any_of("total"))

# Keep rows whose term matches any of the selected terms
term_pattern <- paste(selected_terms, collapse = "|")
df_sel <- df %>%
  filter(str_detect(term, term_pattern))

if (nrow(df_sel) == 0) {
  stop("⚠️ No matching GO terms found. Check spelling or capitalization.")
}

# Convert to matrix with terms as row names
mat <- df_sel %>%
  column_to_rownames(var = "term") %>%
  as.matrix()

# Optional log-transform
if (log_transform) {
  mat <- log10(mat + 1)
}

# Optional row normalization
if (row_normalize) {
  mat <- t(scale(t(mat)))  # z-score per row

  # A row with no variation has sd = 0 (or NA for a single column), so its
  # z-scores come out entirely NaN/NA and the distance computation used for
  # clustering would fail. Such rows do not deviate from their own mean, so
  # show them as 0.
  flat_rows <- rowSums(is.finite(mat)) == 0
  if (any(flat_rows)) {
    message(
      "Rows without variation set to 0 after z-scoring: ",
      paste(rownames(mat)[flat_rows], collapse = ", ")
    )
    mat[flat_rows, ] <- 0
  }
}

# ---- Plot heatmap ----
# Hierarchical clustering needs at least two items, so it is skipped on an
# axis that has only one row/column (e.g. a single matching term).
pheatmap(
  mat,
  scale = "none",
  color = colorRampPalette(c("navy", "white", "firebrick3"))(200),
  clustering_method = "ward.D2",
  cluster_rows = nrow(mat) > 1,
  cluster_cols = ncol(mat) > 1,
  show_rownames = TRUE,
  main = paste(
    "GO Term Heatmap —",
    if (row_normalize) "Row-normalized" else "Raw values",
    if (log_transform) "(log10)" else ""
  )
)

---
title: "More Sparsity?"
description: "Optimizing Rank 35"
categories: [e5, barnacle, coral]
#citation:
date: 11-05-2025
image: http://gannet.fish.washington.edu/seashell/snaps/2025-11-05_13-57-12.png # finding a good image
author:
  - name: Steven Roberts
    url:
    orcid: 0000-0001-8302-1138
    affiliation: Professor, UW - School of Aquatic and Fishery Sciences
    affiliation-url: https://robertslab.info
    #url: # self-defined
draft: false # setting this to `true` will prevent your post from appearing on your listing page until you're ready!
format:
  html:
    code-fold: FALSE
    code-tools: true
    code-copy: true
    highlight-style: github
    code-overflow: wrap
#runtime: shiny
---

Based on the overlap of genes across components, I wanted to increase sparsity by increasing the lambda-gene values.

Ran with multiple values

``` bash
LAMBDAS_GENE="0.00 0.01 0.05 0.1 0.2 0.4 0.8"
LAMBDA_SAMPLE=0.1
LAMBDA_TIME=0.05
RANK=35

OUTDIR_BASE=../output/26-rank35-optimization

for LG in $LAMBDAS_GENE; do
  OUTDIR=${OUTDIR_BASE}/lambda_gene_${LG}
  mkdir -p "$OUTDIR"

  /Users/sr320/.local/bin/uv run python ../scripts/14.1-barnacle/build_tensor_and_run.py \
    --input-file ../output/14-pca-orthologs/vst_counts_matrix.csv \
    --output-dir "$OUTDIR" \
    --rank $RANK \
    --lambda-gene $LG \
    --lambda-sample $LAMBDA_SAMPLE \
    --lambda-time $LAMBDA_TIME \
    --max-iter 2000 \
    --tol 1e-5 \
    --seed 81
done
```

![](http://gannet.fish.washington.edu/seashell/snaps/2025-11-05_14-01-07.png)

Move forward with lambda_gene of 0.2...

# Sample

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-06_07-16-18.png)

# Time

``` r
library(tidyverse)

# File path
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# Read data
df <- read_csv(file_path)

# Compute variance of each component across timepoints
var_components <- df %>%
  pivot_longer(cols = starts_with("Component_"),
               names_to = "Component",
               values_to = "Value") %>%
  group_by(Component) %>%
  summarise(Variance = var(Value, na.rm = TRUE)) %>%
  arrange(desc(Variance))

# Select top 10 most variable components
top10_components <- var_components %>% slice_max(Variance, n = 10) %>% pull(Component)

# Prepare data for plotting
df_long <- df %>%
  pivot_longer(cols = starts_with("Component_"),
               names_to = "Component",
               values_to = "Value") %>%
  filter(Component %in% top10_components)

# Line plot
ggplot(df_long, aes(x = OG_ID, y = Value, group = Component, color = Component)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 10 Most Variable Components Across Timepoints",
    x = "Time Point",
    y = "Component Value"
  )
```

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-06_07-18-17.png)

# Pick your component

``` r
library(tidyverse)

# File path
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/barnacle_factors/time_factors.csv"

# ---- USER INPUT ----
# Pick the component you want to plot (e.g., "Component_5")
selected_component <- "Component_24"
# ---------------------

# Read data
df <- read_csv(file_path)

# Convert to long format
df_long <- df %>%
  pivot_longer(
    cols = starts_with("Component_"),
    names_to = "Component",
    values_to = "Value"
  )

# Check that the chosen component exists
if (!(selected_component %in% unique(df_long$Component))) {
  stop(paste("Component", selected_component, "not found in data!"))
}

# Filter for the selected component
df_sel <- df_long %>% filter(Component == selected_component)

# Plot single component across timepoints
ggplot(df_sel, aes(x = OG_ID, y = Value, group = 1)) +
  geom_line(linewidth = 1.2, color = "steelblue") +
  geom_point(size = 3, color = "firebrick") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = paste0(selected_component, " Across Timepoints"),
    x = "Time Point",
    y = "Loading Value"
  )
```

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-06_07-20-36.png)

# Gene Factor Loading

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-06_07-21-34.png)

# Annotation

![](http://gannet.fish.washington.edu/seashell/snaps/testddd.png)

# An Annotation - U pick!

``` r
library(tidyverse)
library(pheatmap)

# ---- File path ----
file_path <- "../output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/goslim_term_counts.csv"  # change to your actual CSV path

# ---- User input ----
# Enter one or more GO terms (exact or partial match)
selected_terms <- c("protein catabolic process", "lipid", "immune system process", "carbohydrate", "antioxidant activity", "mitochondrion", "inflammatory response", "reproductive")  # example

# ---- Options ----
row_normalize <- TRUE   # TRUE for row-normalized (z-score), FALSE for raw
log_transform <- TRUE   # TRUE to apply log10(x+1) before plotting

# ---- Load and prepare data ----
df <- read_csv(file_path)

# Remove total column if present
df <- df %>% select(-any_of("total"))

# Filter by GO term (partial match supported)
df_sel <- df %>%
  filter(if_any(term, ~ str_detect(.x, paste(selected_terms, collapse = "|"))))

if (nrow(df_sel) == 0) {
  stop("⚠️ No matching GO terms found. Check spelling or capitalization.")
}

# Convert to matrix
mat <- df_sel %>%
  column_to_rownames(var = "term") %>%
  as.matrix()

# Optional log-transform
if (log_transform) {
  mat <- log10(mat + 1)
}

# Optional row normalization
if (row_normalize) {
  mat <- t(scale(t(mat)))  # z-score per row
}

# ---- Plot heatmap ----
pheatmap(
  mat,
  scale = "none",
  color = colorRampPalette(c("navy", "white", "firebrick3"))(200),
  clustering_method = "ward.D2",
  show_rownames = TRUE,
  main = paste(
    "GO Term Heatmap —",
    if (row_normalize) "Row-normalized" else "Raw values",
    if (log_transform) "(log10)" else ""
  )
)
```

![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-06_07-25-48.png)