Getting gene-level methylation using 10xbedgraphs…
Here is a walk through of one species..
Gene Model Preparation
head ../data/Pocillopora_meandrina_HIv1.genes-validated.gff3##gff-version 3
##sequence-region Pocillopora_meandrina_HIv1___xfSc0000716 887 39392
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS gene 887 6811 . - . ID=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS mRNA 887 6811 . - . ID=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 887 973 . - 0 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 887 973 . - 0 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 1828 1882 . - 1 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 1828 1882 . - 1 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-2;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS CDS 2308 2371 . - 2 ID=cds-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 AUGUSTUS exon 2308 2371 . - 2 ID=exon-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1-3;Parent=mrna-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
cut -f3 ../data/Pocillopora_meandrina_HIv1.genes-validated.gff3 | \
grep -v "^#" | \
sort | uniq -c | sort -nr 208535 exon
208535 CDS
31840 mRNA
31840 gene
gawk '$3=="gene" {
split($9, a, ";");
for(i in a){
if(a[i] ~ /^ID=/){
sub("ID=", "", a[i]);
gene_id = a[i];
}
}
print $1 "\t" $4-1 "\t" $5 "\t" gene_id;
}' ../data/Pocillopora_meandrina_HIv1.genes-validated.gff3 > ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.bedhead ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.bed
wc -l ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.bedPocillopora_meandrina_HIv1___xfSc0000716 886 6811 gene-Pocillopora_meandrina_HIv1___RNAseq.g29951.t1
Pocillopora_meandrina_HIv1___xfSc0000716 7304 12239 gene-Pocillopora_meandrina_HIv1___RNAseq.g29952.t1
Pocillopora_meandrina_HIv1___xfSc0000716 21621 23112 gene-Pocillopora_meandrina_HIv1___RNAseq.g29953.t1
Pocillopora_meandrina_HIv1___xfSc0000716 25312 26016 gene-Pocillopora_meandrina_HIv1___RNAseq.g29954.t1
Pocillopora_meandrina_HIv1___xfSc0000716 34770 39392 gene-Pocillopora_meandrina_HIv1___RNAseq.g29955.t1
Pocillopora_meandrina_HIv1___Sc0000029 18377 30613 gene-Pocillopora_meandrina_HIv1___RNAseq.g10233.t1
Pocillopora_meandrina_HIv1___Sc0000029 35435 38326 gene-Pocillopora_meandrina_HIv1___RNAseq.g10234.t1
Pocillopora_meandrina_HIv1___Sc0000029 39901 40640 gene-Pocillopora_meandrina_HIv1___RNAseq.g10235.t1
Pocillopora_meandrina_HIv1___Sc0000029 56431 56839 gene-Pocillopora_meandrina_HIv1___RNAseq.g10236.t1
Pocillopora_meandrina_HIv1___Sc0000029 96446 105523 gene-Pocillopora_meandrina_HIv1___RNAseq.g10237.t1
31840 ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.bed
Prepping bedgraphs
mkdir -p ../data/10xbedgraph
cd ../data/10xbedgraph
wget -r -l1 --no-parent -nd -A '*10x.bedgraph' \
'https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250821_meth_Ptua/'
head ../data/10xbedgraph/POC-40-TP1_10x.bedgraphfor f in ../data/10xbedgraph/*_10x.bedgraph; do
sort -k1,1 -k2,2n "$f" > ${f%.bedgraph}.sorted.bedgraph
donehead ../data/10xbedgraph/POC-40-TP1_10x.sorted.bedgraphPocillopora_meandrina_HIv1___Sc0000000 2852 2854 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 2908 2910 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 3375 3377 6.666667
Pocillopora_meandrina_HIv1___Sc0000000 3384 3386 7.142857
Pocillopora_meandrina_HIv1___Sc0000000 3429 3431 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 3570 3572 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 5128 5130 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 6101 6103 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 6110 6112 0.000000
Pocillopora_meandrina_HIv1___Sc0000000 9318 9320 0.000000
mkdir -p ../output/09-Ptua-Gene-Methylation/
sort -k1,1 -k2,2n ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.bed \
> ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.sorted.bed
for f in ../data/10xbedgraph/*_10x.sorted.bedgraph; do
sample=$(basename "$f" _10x.sorted.bedgraph)
bedtools map \
-a ../data/Pocillopora_meandrina_HIv1.GENESonly-validated.sorted.bed \
-b "$f" \
-c 4 \
-o mean,count \
> ../output/09-Ptua-Gene-Methylation/gene_methylation_${sample}.tsv
donePocillopora_meandrina_HIv1___Sc0000000 10770 23652 gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1 77.1836165 18
Pocillopora_meandrina_HIv1___Sc0000000 25128 33764 gene-Pocillopora_meandrina_HIv1___RNAseq.g20903.t1 60.81863374 38
Pocillopora_meandrina_HIv1___Sc0000000 33907 39296 gene-Pocillopora_meandrina_HIv1___TS.g25664.t1 20.83649989 45
Pocillopora_meandrina_HIv1___Sc0000000 44371 50798 gene-Pocillopora_meandrina_HIv1___RNAseq.g20905.t1 0.2109149468 282
Pocillopora_meandrina_HIv1___Sc0000000 52049 53624 gene-Pocillopora_meandrina_HIv1___RNAseq.g20906.t1 0.2717391282 39
Pocillopora_meandrina_HIv1___Sc0000000 54620 71869 gene-Pocillopora_meandrina_HIv1___RNAseq.g20907.t1 9.341014667 255
Pocillopora_meandrina_HIv1___Sc0000000 75141 87120 gene-Pocillopora_meandrina_HIv1___RNAseq.g20908.t1 35.22936122 74
Pocillopora_meandrina_HIv1___Sc0000000 89333 101789 gene-Pocillopora_meandrina_HIv1___RNAseq.g20909.t1 32.38789587 97
Pocillopora_meandrina_HIv1___Sc0000000 104428 116100 gene-Pocillopora_meandrina_HIv1___RNAseq.g20910.t1 54.14505446 61
Pocillopora_meandrina_HIv1___Sc0000000 117505 121120 gene-Pocillopora_meandrina_HIv1___RNAseq.g20911.t1 27.60554107 54
for f in ../output/09-Ptua-Gene-Methylation/gene_methylation*.tsv; do
out="../output/09-Ptua-Gene-Methylation/$(basename "${f%.tsv}_gt10.tsv")"
awk '$6 > 10' "$f" > "$out"
donehead ../output/09-Ptua-Gene-Methylation/gene_methylation_POC-40-TP*_gt10.tsv==> ../output/09-Ptua-Gene-Methylation/gene_methylation_POC-40-TP1_gt10.tsv <==
Pocillopora_meandrina_HIv1___Sc0000000 10770 23652 gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1 78.86871104 24
Pocillopora_meandrina_HIv1___Sc0000000 25128 33764 gene-Pocillopora_meandrina_HIv1___RNAseq.g20903.t1 66.12523937 38
Pocillopora_meandrina_HIv1___Sc0000000 33907 39296 gene-Pocillopora_meandrina_HIv1___TS.g25664.t1 25.66347985 41
Pocillopora_meandrina_HIv1___Sc0000000 44371 50798 gene-Pocillopora_meandrina_HIv1___RNAseq.g20905.t1 0.61153482 250
Pocillopora_meandrina_HIv1___Sc0000000 52049 53624 gene-Pocillopora_meandrina_HIv1___RNAseq.g20906.t1 1.662464711 38
Pocillopora_meandrina_HIv1___Sc0000000 54620 71869 gene-Pocillopora_meandrina_HIv1___RNAseq.g20907.t1 8.299407695 256
Pocillopora_meandrina_HIv1___Sc0000000 75141 87120 gene-Pocillopora_meandrina_HIv1___RNAseq.g20908.t1 32.24347769 74
Pocillopora_meandrina_HIv1___Sc0000000 89333 101789 gene-Pocillopora_meandrina_HIv1___RNAseq.g20909.t1 29.81549905 94
Pocillopora_meandrina_HIv1___Sc0000000 104428 116100 gene-Pocillopora_meandrina_HIv1___RNAseq.g20910.t1 54.3217754 63
Pocillopora_meandrina_HIv1___Sc0000000 117505 121120 gene-Pocillopora_meandrina_HIv1___RNAseq.g20911.t1 22.62597314 65
Combined Matrix
library(tidyverse)
# Directory containing all filtered tsv files
indir <- "../output/09-Ptua-Gene-Methylation"
# List all filtered files
files <- list.files(indir, pattern = "gene_methylation.*_gt10.tsv", full.names = TRUE)
# Read and combine
df_list <- lapply(files, function(f) {
sample_name <- basename(f) |>
sub("gene_methylation_", "", x = _) |>
sub("_gt10.tsv", "", x = _)
read_tsv(f, col_names = FALSE) |>
select(X4, X5) |>
rename(gene_id = X4, !!sample_name := X5)
})
# Full join all files by gene_id
merged <- reduce(df_list, full_join, by = "gene_id")
# ---- NEW PART: keep genes present in >=75% of samples ----
merged_75 <- merged %>%
mutate(
# fraction of samples with non-NA values (ignores gene_id column)
frac_nonNA = rowMeans(!is.na(across(-gene_id)))
) %>%
filter(frac_nonNA >= 0.75) %>%
select(-frac_nonNA)
# Save 75% filtered matrix
write_tsv(merged_75, "../output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv")23370 ../output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv
gene_id POC-201-TP1 POC-201-TP2 POC-219-TP1 POC-219-TP2 POC-219-TP3 POC-219-TP4 POC-222-TP1 POC-222-TP3 POC-222-TP4 POC-255-TP1 POC-255-TP2 POC-255-TP3 POC-259-TP1 POC-259-TP2 POC-40-TP1 POC-40-TP2 POC-40-TP3 POC-40-TP4 POC-42-TP2 POC-42-TP4 POC-52-TP1 POC-52-TP2 POC-52-TP3 POC-52-TP4 POC-53-TP1 POC-53-TP2 POC-53-TP3 POC-53-TP4 POC-57-TP1 POC-57-TP2 POC-57-TP3 POC-57-TP4
gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1 55.94819507 65 NA 56.71798513 70.75064638 72.97016588 69.38569095 61.48214065 60.15602938 68.71286394 68.121314 NA 80.14404553 81.65033473 78.86871104 75.75863129 73.52938063 77.1836165 63.71908893 73.56089086 NA 63.53896107 61.79727092 NA 69.83198112 66.01362921 48.35913809 NA 71.90167729 75.73253744 74.34254614 65.64244627
gene-Pocillopora_meandrina_HIv1___RNAseq.g20903.t1 62.07071959 64.30941694 57.89994524 57.9764581 62.95167124 61.53853367 59.22841108 59.25982624 56.56210374 57.61653049 56.71230877 62.92154108 61.34633203 67.04912922 66.12523937 64.45467636 63.47604045 60.81863374 58.51124484 60.36185731 57.97446009 60.10673518 56.45465472 53.58677349 57.63890561 60.56861659 56.60354664 54.7309886 62.98476886 62.72254031 66.12983122 63.81437656
library(tidyverse)
library(pheatmap)
# ------------------------------------------------------------
# Load data
# ------------------------------------------------------------
df <- read_tsv("../output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv")
mat <- df %>%
column_to_rownames("gene_id") %>%
as.matrix()
# ------------------------------------------------------------
# Step 1 — Impute NA with row means
# ------------------------------------------------------------
row_means <- rowMeans(mat, na.rm = TRUE)
for (i in seq_len(nrow(mat))) {
if (is.na(row_means[i])) row_means[i] <- 0 # all-NA rows
mat[i, is.na(mat[i, ])] <- row_means[i]
}
# ------------------------------------------------------------
# Step 2 — Remove zero-variance rows
# ------------------------------------------------------------
keep <- apply(mat, 1, function(x) sd(x) > 0)
mat2 <- mat[keep, , drop = FALSE]
# ------------------------------------------------------------
# Step 3 — Remove rows still containing Inf/NaN
# ------------------------------------------------------------
mat2 <- mat2[apply(mat2, 1, function(x) all(is.finite(x))), , drop = FALSE]
# ------------------------------------------------------------
# Heatmap 1 — Scaled (same as before)
# ------------------------------------------------------------
pheatmap(
mat2,
scale = "row",
clustering_method = "ward.D2",
show_rownames = FALSE,
main = "Gene-level Methylation Patterns (Scaled)"
)
# ------------------------------------------------------------
# Heatmap 2 — Actual methylation values (Unscaled)
# ------------------------------------------------------------
pheatmap(
mat2,
scale = "none",
clustering_method = "ward.D2",
show_rownames = FALSE,
main = "Gene-level Methylation Values (Raw)",
color = colorRampPalette(c("navy","green", "firebrick"))(100) # optional nicer palette
)
library(tidyverse)
library(RColorBrewer)
# -------------------------
# Load data
# -------------------------
df <- read_tsv("../output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv")
# -------------------------
# Convert to matrix
# -------------------------
mat <- df %>%
column_to_rownames("gene_id") %>%
as.matrix()
# -------------------------
# Step 1: Impute NA with row means
# -------------------------
row_means <- rowMeans(mat, na.rm = TRUE)
for (i in seq_len(nrow(mat))) {
if (is.na(row_means[i])) row_means[i] <- 0 # if entire row is NA
mat[i, is.na(mat[i, ])] <- row_means[i]
}
# -------------------------
# Step 2: Remove zero-variance genes
# (these blow up PCA scaling)
# -------------------------
keep <- apply(mat, 1, function(x) sd(x) > 0)
mat2 <- mat[keep, ]
# -------------------------
# Step 3: Remove genes containing any Inf/NaN
# -------------------------
mat2 <- mat2[apply(mat2, 1, function(x) all(is.finite(x))), ]
# -------------------------
# Step 4: Transpose (samples as rows)
# -------------------------
mat2_t <- t(mat2)
# -------------------------
# Extract sample group
# -------------------------
sample_groups <- str_extract(rownames(mat2_t), "POC-[0-9]+")
ngroups <- length(unique(sample_groups))
colors <- brewer.pal(max(3, ngroups), "Dark2")[1:ngroups]
group_colors <- setNames(colors, unique(sample_groups))
# -------------------------
# Step 5: Run PCA safely
# -------------------------
pca <- prcomp(mat2_t, scale. = TRUE)
# -------------------------
# Step 6: Build plotting DF
# -------------------------
pca_df <- data.frame(
PC1 = pca$x[,1],
PC2 = pca$x[,2],
sample = rownames(pca$x),
group = sample_groups
)
# -------------------------
# Step 7: Plot PCA
# -------------------------
ggplot(pca_df, aes(PC1, PC2, color = group, label = sample)) +
geom_point(size = 4) +
geom_text(vjust = -0.7, size = 3) +
scale_color_manual(values = group_colors) +
theme_minimal(base_size = 14) +
labs(
title = "PCA of Gene-Level Methylation",
color = "Sample Group"
)
library(tidyverse)
library(pheatmap)
# ------------------------------------------------
# Load and convert to matrix
# ------------------------------------------------
mat <- read_tsv("../output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv") %>%
column_to_rownames("gene_id") %>%
as.matrix()
# ------------------------------------------------
# Step 1 — Impute NA with row means
# ------------------------------------------------
row_means <- rowMeans(mat, na.rm = TRUE)
for (i in seq_len(nrow(mat))) {
if (is.na(row_means[i])) row_means[i] <- 0 # handles all-NA rows
mat[i, is.na(mat[i, ])] <- row_means[i]
}
# ------------------------------------------------
# Step 2 — Remove zero-variance genes
# (these make correlation produce NA columns)
# ------------------------------------------------
keep <- apply(mat, 1, function(x) sd(x) > 0)
mat2 <- mat[keep, , drop = FALSE]
# ------------------------------------------------
# Step 3 — Remove any rows still containing Inf/NaN
# ------------------------------------------------
mat2 <- mat2[apply(mat2, 1, function(x) all(is.finite(x))), , drop = FALSE]
# ------------------------------------------------
# Step 4 — Compute correlation matrix safely
# ------------------------------------------------
cor_mat <- cor(mat2, use = "pairwise.complete.obs")
# ------------------------------------------------
# Step 5 — Remove NA rows/columns in correlation matrix (if any remain)
# ------------------------------------------------
finite_cols <- apply(cor_mat, 2, function(x) all(is.finite(x)))
cor_mat2 <- cor_mat[finite_cols, finite_cols]
# ------------------------------------------------
# Step 6 — Plot sample–sample correlation heatmap
# ------------------------------------------------
pheatmap(cor_mat2,
main = "Sample–Sample Correlation",
clustering_method = "average",
fontsize = 12)
Quick Species Comparison
| Apul | Peve | Ptua |
|---|---|---|
![]() |
![]() |
![]() |

