More lamda testing

LAMBDAS_GENE="1 2 4 6 8 10"
LAMBDA_SAMPLE=0.1
LAMBDA_TIME=0.05
RANK=35

mkdir -p ../output/41-rank35-optimization

OUTDIR_BASE=../output/41-rank35-optimization

for LG in $LAMBDAS_GENE; do
  OUTDIR=${OUTDIR_BASE}/lambda_gene_${LG}
  mkdir -p "$OUTDIR"

  uv run python ../scripts/14.1-barnacle/build_tensor_and_run.py \
    --input-file ../output/14-pca-orthologs/vst_counts_matrix.csv \
    --output-dir "$OUTDIR" \
    --rank $RANK \
    --lambda-gene $LG \
    --lambda-sample $LAMBDA_SAMPLE \
    --lambda-time $LAMBDA_TIME \
    --max-iter 1000 \
    --tol 1e-5 \
    --seed 92
done

library(tidyverse)

# ---------------------------------------------------------
# SETTINGS
# ---------------------------------------------------------

base_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization"

lambda_vals <- c("1", "2", "4", "6", "8", "10")

# ---------------------------------------------------------
# FUNCTION: read gene_factors.csv AND FIX HEADER
# ---------------------------------------------------------
read_gene_factors <- function(lambda) {
  url <- sprintf("%s/lambda_gene_%s/barnacle_factors/gene_factors.csv",
                 base_url, lambda)

  message("Reading: ", url)
  df <- read_csv(url, show_col_types = FALSE)

  # Fix the shifted header here
  # First column should be "gene", others "comp1..compN"
  colnames(df) <- c("gene", paste0("comp", 1:(ncol(df)-1)))

  df$lambda_gene <- lambda
  df
}

# ---------------------------------------------------------
# READ ALL RUNS
# ---------------------------------------------------------
all_factors <- map_df(lambda_vals, read_gene_factors)

# ---------------------------------------------------------
# LONG FORMAT FOR ANALYSIS
# ---------------------------------------------------------
factor_long <- all_factors %>%
  pivot_longer(
    cols = starts_with("comp"),
    names_to = "component",
    values_to = "loading"
  )

sparsity_tbl <- factor_long %>%
  group_by(lambda_gene) %>%
  summarize(
    n_values = n(),
    n_zero = sum(loading == 0 | abs(loading) < 1e-12),
    sparsity = n_zero / n_values
  )

sparsity_tbl

# A tibble: 6 × 4
  lambda_gene n_values n_zero sparsity
  <chr>          <int>  <int>    <dbl>
1 1             343000   3079  0.00898
2 10            343000  25244  0.0736 
3 2             343000   3798  0.0111 
4 4             343000   7023  0.0205 
5 6             343000  16289  0.0475 
6 8             343000  21282  0.0620

gene_component_counts <- factor_long %>%
  filter(loading > 0) %>%
  group_by(lambda_gene, gene) %>%
  summarize(
    n_components = n_distinct(component),
    .groups = "drop"
  )

multi_component_summary <- gene_component_counts %>%
  group_by(lambda_gene, n_components) %>%
  summarize(n_genes = n(), .groups = "drop")

multi_component_summary

# A tibble: 59 × 3
   lambda_gene n_components n_genes
   <chr>              <int>   <int>
 1 1                     27       2
 2 1                     28       3
 3 1                     29       5
 4 1                     30      17
 5 1                     31      54
 6 1                     32     127
 7 1                     33     402
 8 1                     34    1526
 9 1                     35    7664
10 10                    24       3
# ℹ 49 more rows

summary_tbl <- sparsity_tbl %>%
  left_join(
    gene_component_counts %>%
      group_by(lambda_gene) %>%
      summarize(
        mean_components = mean(n_components),
        frac_multi = mean(n_components > 1),
        .groups = "drop"
      ),
    by = "lambda_gene"
  )

summary_tbl

# A tibble: 6 × 6
  lambda_gene n_values n_zero sparsity mean_components frac_multi
  <chr>          <int>  <int>    <dbl>           <dbl>      <dbl>
1 1             343000   3079  0.00898            34.7          1
2 10            343000  25244  0.0736             32.4          1
3 2             343000   3798  0.0111             34.6          1
4 4             343000   7023  0.0205             34.3          1
5 6             343000  16289  0.0475             33.3          1
6 8             343000  21282  0.0620             32.8          1

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_1/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_2/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_4/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_6/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_8/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

  # Load data
  gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_10/barnacle_factors/gene_factors.csv")


#edit so columns names are shifted 1 to the left
  colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1)))

#code to create histogram of gene_factors:
  
  library(ggplot2)
  library(tidyr)
  
  # Reshape data to long format 
  gene_factors_long <- gene_factors %>%
    pivot_longer(cols = -1, names_to = "component", values_to = "loading")
  
  # Plot histogram
  ggplot(gene_factors_long, aes(x = loading)) +
    geom_histogram(bins = 50, fill = "blue", alpha = 0.7) +
    facet_wrap(~ component, scales = "free") +
    theme_minimal() +
    labs(title = "Distribution of Gene Loadings for Rank 35 Components",
         x = "Gene Loading",
         y = "Count")

--- title: "Barnacle" description: "more lamda values" categories: [e5, coral] #citation: date: 12-04-2025 image: http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Image_2025-11-30_15-34-25.png # finding a good image author: - name: Steven Roberts url: orcid: 0000-0001-8302-1138 affiliation: Professor, UW - School of Aquatic and Fishery Sciences affiliation-url: https://robertslab.info #url: # self-defined draft: false # setting this to `true` will prevent your post from appearing on your listing page until you're ready! format: html: toc: true # ← enable TOC toc-location: left # or: right, body toc-depth: 3 # how many heading levels to show code-fold: FALSE code-tools: true code-copy: true highlight-style: github code-overflow: wrap #runtime: shiny --- # More lamda testing ``` bash LAMBDAS_GENE="1 2 4 6 8 10" LAMBDA_SAMPLE=0.1 LAMBDA_TIME=0.05 RANK=35 mkdir -p ../output/41-rank35-optimization OUTDIR_BASE=../output/41-rank35-optimization for LG in $LAMBDAS_GENE; do OUTDIR=${OUTDIR_BASE}/lambda_gene_${LG} mkdir -p "$OUTDIR" uv run python ../scripts/14.1-barnacle/build_tensor_and_run.py \ --input-file ../output/14-pca-orthologs/vst_counts_matrix.csv \ --output-dir "$OUTDIR" \ --rank $RANK \ --lambda-gene $LG \ --lambda-sample $LAMBDA_SAMPLE \ --lambda-time $LAMBDA_TIME \ --max-iter 1000 \ --tol 1e-5 \ --seed 92 done ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE} library(tidyverse) # --------------------------------------------------------- # SETTINGS # --------------------------------------------------------- base_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization" lambda_vals <- c("1", "2", "4", "6", "8", "10") # --------------------------------------------------------- # FUNCTION: read gene_factors.csv AND FIX HEADER # --------------------------------------------------------- read_gene_factors <- function(lambda) { url <- sprintf("%s/lambda_gene_%s/barnacle_factors/gene_factors.csv", base_url, lambda) message("Reading: ", url) df <- read_csv(url, show_col_types = FALSE) # Fix the shifted header here # First column should be "gene", others "comp1..compN" colnames(df) <- c("gene", paste0("comp", 1:(ncol(df)-1))) df$lambda_gene <- lambda df } # --------------------------------------------------------- # READ ALL RUNS # --------------------------------------------------------- all_factors <- map_df(lambda_vals, read_gene_factors) # --------------------------------------------------------- # LONG FORMAT FOR ANALYSIS # --------------------------------------------------------- factor_long <- all_factors %>% pivot_longer( cols = starts_with("comp"), names_to = "component", values_to = "loading" ) ``` ```{r} sparsity_tbl <- factor_long %>% group_by(lambda_gene) %>% summarize( n_values = n(), n_zero = sum(loading == 0 | abs(loading) < 1e-12), sparsity = n_zero / n_values ) sparsity_tbl ``` ```{r} gene_component_counts <- factor_long %>% filter(loading > 0) %>% group_by(lambda_gene, gene) %>% summarize( n_components = n_distinct(component), .groups = "drop" ) ``` ```{r} multi_component_summary <- gene_component_counts %>% group_by(lambda_gene, n_components) %>% summarize(n_genes = n(), .groups = "drop") multi_component_summary ``` ```{r} summary_tbl <- sparsity_tbl %>% left_join( gene_component_counts %>% group_by(lambda_gene) %>% summarize( mean_components = mean(n_components), frac_multi = mean(n_components > 1), .groups = "drop" ), by = "lambda_gene" ) summary_tbl ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_1/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_2/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_4/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_6/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_8/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ``` ```{r, echo=TRUE, warning=FALSE, message=FALSE, fig.width=10, fig.height=18} # Load data gene_factors <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/M-multi-species/output/41-rank35-optimization/lambda_gene_10/barnacle_factors/gene_factors.csv") #edit so columns names are shifted 1 to the left colnames(gene_factors) <- c("gene", paste0("comp", 1:(ncol(gene_factors)-1))) #code to create histogram of gene_factors: library(ggplot2) library(tidyr) # Reshape data to long format gene_factors_long <- gene_factors %>% pivot_longer(cols = -1, names_to = "component", values_to = "loading") # Plot histogram ggplot(gene_factors_long, aes(x = loading)) + geom_histogram(bins = 50, fill = "blue", alpha = 0.7) + facet_wrap(~ component, scales = "free") + theme_minimal() + labs(title = "Distribution of Gene Loadings for Rank 35 Components", x = "Gene Loading", y = "Count") ```