Day one. Much of time was spent on Epigenetic Simulation paper completion.
Some coding completed.
Epigenetic Machinery Expression
For Apul joined relavant blast matches to expression data.
https://github.com/urol-e5/timeseries_molecular/blob/main/D-Apul/code/29-Apul-epimachine-exp.Rmd
Select Gene for Expression Plot
CpG Methylation
Also looked at CpG Methylation in time-series data.
Prior had generated 10x bedgraph for APul.
15.5
cd ../output/15.5-Apul-bismark/
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" _R1_001.fastp-trim_bismark_bt2.CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \
> "${STEM}"_10x.bedgraph
done
ntLink_0 25585 25587 0.000000
ntLink_0 25624 25626 0.000000
ntLink_0 25777 25779 0.000000
ntLink_0 25779 25781 0.000000
ntLink_0 25788 25790 0.000000
ntLink_0 25803 25805 0.000000
ntLink_0 25811 25813 0.000000
ntLink_0 25830 25832 0.000000
ntLink_0 90500 90502 0.000000
ntLink_0 90540 90542 0.000000
ntLink_0 90562 90564 0.000000
ntLink_0 90722 90724 0.000000
ntLink_0 94077 94079 0.000000
ntLink_0 94090 94092 0.000000
ntLink_0 94095 94097 0.000000
ntLink_0 94098 94100 0.000000
ntLink_1 2597 2599 0.000000
ntLink_1 2613 2615 0.000000
ntLink_1 2626 2628 0.000000
ntLink_1 4093 4095 0.000000
ntLink_1 4151 4153 0.000000
ntLink_1 4740 4742 10.000000
ntLink_1 4831 4833 0.000000
for file in ../output/15.5-Apul-bismark/*10x.bedgraph; do
awk '{print "CpG_"_$1"_"$2, $4}' "$file" > "${file%.bedgraph}_processed.txt"
done
CpG_ntLink_0_25692 0.000000
CpG_ntLink_0_25699 0.000000
CpG_ntLink_0_80003 58.333333
CpG_ntLink_0_90500 0.000000
CpG_ntLink_0_90540 0.000000
CpG_ntLink_0_90562 0.000000
CpG_ntLink_0_90633 0.000000
CpG_ntLink_0_90648 0.000000
CpG_ntLink_0_90653 15.384615
CpG_ntLink_0_90671 0.000000
CpG_ntLink_0_94077 0.000000
CpG_ntLink_0_94090 0.000000
CpG_ntLink_0_94095 0.000000
CpG_ntLink_0_94098 0.000000
CpG_ntLink_1_4093 0.000000
From here Kathleen filtered such that
WGBS data
#pull processed files from Gannet
# Note: Unfortunately we can't use the `cache` feature to make this process more time efficient, as it doesn't support long vectors
# Define the base URL
<- "https://gannet.fish.washington.edu/seashell/bu-github/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/"
base_url
# Read the HTML page
<- read_html(base_url)
page
# Extract links to files
<- page %>%
file_links html_nodes("a") %>%
html_attr("href")
# Filter for files ending in "processed.txt"
<- file_links[grepl("processed\\.txt$", file_links)]
processed_files
# Create full URLs
<- paste0(base_url, processed_files)
file_urls
# Function to read a file from URL
<- function(url) {
read_processed_file read_table(url, col_types = cols(.default = "c")) # Read as character to avoid parsing issues
}
# Import all processed files into a list
<- lapply(file_urls, read_processed_file)
processed_data
# Name the list elements by file name
names(processed_data) <- processed_files
# Print structure of imported data
str(processed_data)
# add a header row that has "CpG" for the first column and "sample" for the second column, which will be populated by the file name
<- Map(function(df, filename) {
processed_data colnames(df) <- c("CpG", filename) # Rename columns
return(df)
names(processed_data)) # Use stored file names
}, processed_data,
#merge files together by "CpG"
<- purrr::reduce(processed_data, full_join, by = "CpG")
merged_data
# Print structure of final merged data
str(merged_data)
Replace any NA with 0.
# Convert all columns (except "CpG") to numeric and replace NAs with 0
<- merged_data %>%
merged_data mutate(across(-CpG, as.numeric)) %>% # Convert all except CpG to numeric
mutate(across(-CpG, ~ replace_na(.x, 0))) # Replace NA with 0 in numeric columns
Filter data sets
Only keep CpGs that have a non-zero value in all samples.
<- merged_data %>% filter(if_all(-CpG, ~ .x > 0))
filtered_wgbs
# Ensure it's formatted as a data frame
<- as.data.frame(filtered_wgbs)
filtered_wgbs # Only keep the sample information in the column name.
colnames(filtered_wgbs) <- gsub("^(.*?)_.*$", "\\1", colnames(filtered_wgbs))
# Set CpG IDs to rownames
rownames(filtered_wgbs) <- filtered_wgbs$CpG
<- filtered_wgbs %>% select(-CpG)
filtered_wgbs
nrow(merged_data)
nrow(filtered_wgbs)
We had 12,093,025 CpGs before filtering and have only 507 after filtering. This makes sense because most CpGs were not methylated in all samples.
Save filtered set to make code reruns/knitting quicker
write.csv(filtered_wgbs, "../output/22.3-Apul-multiomic-machine-learning-byTP/filtered-WGBS-CpG-counts.csv")
Need to revisit filtering
Ptua CpG Methylation
Shelly did alignment
# run pipeline
nextflow run nf-core/methylseq \
-c /gscratch/srlab/strigg/bin/uw_hyak_srlab.config \
--input /gscratch/scrubbed/strigg/analyses/20250422_methylseq/samplesheet.csv \
--outdir /gscratch/scrubbed/strigg/analyses/20250422_methylseq \
--fasta /gscratch/srlab/strigg/GENOMES/Pocillopora_meandrina_HIv1.assembly.fasta \
--em_seq \
-resume \
-with-report nf_report.html \
-with-trace \
-with-timeline nf_timeline.html \
--skip_trimming \
--nomeseq
### Results
- Multiqc report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/multiqc/bismark/multiqc_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/multiqc/bismark/multiqc_report.html)
- Bismark summary report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/summary/bismark_summary_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/summary/bismark_summary_report.html)
- Pipeline report: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_report.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_report.html)
- Pipeline timeline: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_timeline.html](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/nf_timeline.html)
- Counts matrices: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/methylation_calls/methylation_coverage/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/methylation_calls/methylation_coverage/)
- <sample_name>.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz
- Deduplicated sorted bam files: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/deduplicated/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/deduplicated/)
- <sample_name>.deduplicated.sorted.bam
- Other bismark output: [https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/](https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250422_methylseq/bismark/)
find ../data/dedup-cov/methylation_calls/methylation_coverage/*deduplicated.bismark.cov.gz | \
xargs -n 1 basename -s R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz | \
parallel -j 24 /home/shared/Bismark-0.24.0/coverage2cytosine \
\
--genome_folder ../data/bs \
-o ../output/05-Ptua-bismark-CG/{} \
--merge_CpG \
--zero_based ../data/dedup-cov/methylation_calls/methylation_coverage/{}R1_001.fastp-trim_bismark_bt2_pe.deduplicated.bismark.cov.gz
cd ../output/05-Ptua-bismark-CG/
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" _.CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \
> "${STEM}"_10x.bedgraph
done
for file in ../output/05-Ptua-bismark-CG/*10x.bedgraph; do
awk '{print "CpG_"_$1"_"$2, $4}' "$file" > "${file%.bedgraph}_processed.txt"
done
#pull processed files from Gannet
# Note: Unfortunately we can't use the `cache` feature to make this process more time efficient, as it doesn't support long vectors
# Define the base URL
<- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/F-Ptua/output/05-Ptua-bismark-CG/"
base_url
# Read the HTML page
<- read_html(base_url)
page
# Extract links to files
<- page %>%
file_links html_nodes("a") %>%
html_attr("href")
# Filter for files ending in "processed.txt"
<- file_links[grepl("processed\\.txt$", file_links)]
processed_files
# Create full URLs
<- paste0(base_url, processed_files)
file_urls
# Function to read a file from URL
<- function(url) {
read_processed_file read_table(url, col_types = cols(.default = "c")) # Read as character to avoid parsing issues
}
# Import all processed files into a list
<- lapply(file_urls, read_processed_file)
processed_data
# Name the list elements by file name
names(processed_data) <- processed_files
# Print structure of imported data
str(processed_data)
# add a header row that has "CpG" for the first column and "sample" for the second column, which will be populated by the file name
<- Map(function(df, filename) {
processed_data colnames(df) <- c("CpG", filename) # Rename columns
return(df)
names(processed_data)) # Use stored file names
}, processed_data,
#merge files together by "CpG"
<- purrr::reduce(processed_data, full_join, by = "CpG")
merged_data
# Print structure of final merged data
str(merged_data)
Replace any NA with 0.
# Convert all columns (except "CpG") to numeric and replace NAs with 0
<- merged_data %>%
merged_data mutate(across(-CpG, as.numeric)) %>% # Convert all except CpG to numeric
mutate(across(-CpG, ~ replace_na(.x, 0))) # Replace NA with 0 in numeric columns
Filter data sets
Only keep CpGs that have a non-zero value in all samples.
<- merged_data %>% filter(if_all(-CpG, ~ .x > 0))
filtered_wgbs
# Ensure it's formatted as a data frame
<- as.data.frame(filtered_wgbs)
filtered_wgbs # Only keep the sample information in the column name.
colnames(filtered_wgbs) <- gsub("^(.*?)_.*$", "\\1", colnames(filtered_wgbs))
# Set CpG IDs to rownames
rownames(filtered_wgbs) <- filtered_wgbs$CpG
<- filtered_wgbs %>% select(-CpG)
filtered_wgbs
nrow(merged_data)
nrow(filtered_wgbs)
[1] 9111633
[1] 137292
We had 9111633 CpGs before filtering and have only 137292 after filtering.
write.csv(filtered_wgbs, "../output/05-Ptua-bismark-CG/filtered-WGBS-CpG-counts.csv")