OA OA OA

Bringing diploid - triploid to an end?
oyster
methylation
Author
Affiliation

Steven Roberts

Published

March 12, 2025

# Global knitr chunk defaults for this notebook.
knitr::opts_chunk$set(
  # Code handling: show each chunk's source, but do not run it on render.
  echo = TRUE,
  eval = FALSE,
  # Keep warnings and messages out of the rendered document.
  warning = FALSE,
  message = FALSE,
  # Figure defaults: width 6, height 4, horizontally centered.
  fig.width = 6,
  fig.height = 4,
  fig.align = "center",
  # Empty prefix so printed output lines are not prefixed with '##'.
  comment = ""
)

Let's redo the Bismark alignment with the new genome.

Download Reads

# Recursively fetch every file matching *fq.gz from the trimmed-data-2
# listing into ../../data/Haws-10, without recreating the remote directory
# tree and without ascending to the parent directory.
wget --recursive \
  --no-directories \
  --no-parent \
  --directory-prefix=../../data/Haws-10 \
  --accept="*fq.gz" \
  https://gannet.fish.washington.edu/spartina/project-oyster-oa/Haws/trimmed-data-2/

Download new genome

# Download the GCF_963853765.1 genome package from NCBI and unpack it.
# Work inside the data directory so the archive lands next to the reads;
# stop here if the directory is missing rather than downloading elsewhere.
cd ../../data/Haws-10 || exit 1
/home/shared/datasets download genome accession GCF_963853765.1 --include gff3,rna,cds,protein,genome,seq-report
# After the cd above the archive is in the current directory, so refer to it
# by name. The previous ../../data/Haws-10/ prefix was relative to the notebook
# directory and no longer pointed at the archive from here.
unzip ncbi_dataset.zip
# NOTE(review): the archive presumably unpacks into a ncbi_dataset/ subfolder,
# while the genome preparation step points Bismark at ../../data/Haws-10/
# itself -- confirm the genome FASTA was copied/renamed into that folder.

Genome Prep

# Prepare the genome folder for Bismark, using Bowtie 2 as the aligner.
# Tool locations and the folder holding the genome:
bismark_dir="/home/shared/Bismark-0.24.0/"
bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/"
genome_folder="../../data/Haws-10/"

# Run the preparation step verbosely with --parallel 12.
"${bismark_dir}/bismark_genome_preparation" \
  --verbose --parallel 12 \
  --path_to_aligner "${bowtie2_dir}" \
  "${genome_folder}"

Checking alignment min scores


ls ../../data/Haws-10/

# Compare several --score_min settings by aligning the first 25,000 read
# pairs (-u 25000) of every sample once per setting. Each sample/setting
# combination gets its own output directory; samples whose runs all succeed
# are recorded in a checkpoint file so a re-run skips them.

# Set directories and files
reads_dir="../../data/Haws-10/"
genome_folder="../../data/Haws-10/"
output_dir="../../analyses/Haws-10/"
checkpoint_file="../../analyses/Haws-10/completed_samples.log"
bismark_dir="/home/shared/Bismark-0.24.0/"
bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/"

# score_min settings to test (the same for every sample, so defined once)
score_min_params=(
    "L,0,-0.4"
    "L,0,-0.6"
    "L,0,-0.8"
    "L,0,-1.0"
    "L,-1,-0.6"
)

# The checkpoint file lives in the output directory, so create that first
mkdir -p "${output_dir}"
touch "${checkpoint_file}"

# Derive sample names from the R2 files
for file in "${reads_dir}"*_R2_val_2_val_2_val_2.fq.gz; do
    # If nothing matches, the shell hands back the literal pattern; skip it
    [ -e "${file}" ] || continue

    sample_name=$(basename "$file" "_R2_val_2_val_2_val_2.fq.gz")

    # Skip samples already recorded as complete (exact, whole-line match)
    if grep -qxF -- "${sample_name}" "${checkpoint_file}"; then
        echo "Sample ${sample_name} already processed. Skipping..."
        continue
    fi

    # Becomes false as soon as any score_min run for this sample fails
    sample_ok=true

    # Loop through each score_min parameter
    for score_min in "${score_min_params[@]}"; do
        echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"

        # One subdirectory per parameter; commas are stripped from the name
        param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
        mkdir -p "${param_output_dir}"

        # Bismark's stderr is captured here; this is the file to read on failure
        bismark_log="${param_output_dir}/${sample_name}-bismark_summary.txt"

        # Run Bismark alignment and branch directly on its exit status
        if "${bismark_dir}bismark" \
            --genome "${genome_folder}" \
            -p 16 \
            -u 25000 \
            --score_min "${score_min}" \
            --non_directional \
            --path_to_bowtie2 "${bowtie2_dir}" \
            -1 "${reads_dir}${sample_name}_R1_val_1_val_1_val_1.fq.gz" \
            -2 "${reads_dir}${sample_name}_R2_val_2_val_2_val_2.fq.gz" \
            -o "${param_output_dir}" \
            --basename "${sample_name}_${score_min//,/}" \
            2> "${bismark_log}"; then
            echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
        else
            echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${bismark_log} for details."
            sample_ok=false
        fi
    done

    # Only checkpoint the sample when every parameter run succeeded. Testing
    # $? here would only see the status of the last echo, which is always 0.
    if [ "${sample_ok}" = true ]; then
        echo "${sample_name}" >> "${checkpoint_file}"
        echo "All tests for sample ${sample_name} completed."
    else
        echo "Sample ${sample_name} encountered errors. Check logs for details."
    fi
done

# Collect the mapping efficiency of every score_min test run into one CSV
# with the columns Sample,Score_Min,Alignment_Rate.

# Use output_dir from the current shell if set; otherwise fall back to the
# analysis directory used throughout this notebook.
output_dir="${output_dir:-../../analyses/Haws-10/}"

# Define summary file
summary_file="${output_dir}/parameter_comparison_summary.csv"

# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > "${summary_file}"

# Loop through parameter output directories
for dir in "${output_dir}"/*_score_*; do
    [ -d "${dir}" ] || continue

    dir_name=$(basename "${dir}")

    # Directories are named <sample>_score_<score_min with commas removed>.
    # Sample names themselves contain underscores (e.g. zr3644_10), so split
    # on the "_score_" marker instead of taking the first "_"-separated field.
    sample_name="${dir_name%_score_*}"
    score_min="${dir_name##*_score_}"

    # Paired-end report written by Bismark for --basename <sample>_<score_min>
    report_path="${dir}/${sample_name}_${score_min}_PE_report.txt"

    if [ -f "${report_path}" ]; then
        # Third whitespace-separated field of the "Mapping efficiency:" line;
        # only the value is stored so the column holds just the rate.
        mapping=$(awk '/Mapping efficiency:/ {print $3; exit}' "${report_path}")
        echo "${sample_name},${score_min},${mapping}" >> "${summary_file}"
    else
        # Report missing reports instead of silently producing an empty CSV
        echo "No Bismark report found at ${report_path}" >&2
    fi
done

The summary file did not populate correctly, so the reports were inspected manually.

Full Run

# Align every sample's full read set with the single chosen score_min.
# Bismark's stderr for each sample is saved next to the alignments as
# <sample>-bismark_summary.txt.

# Set variables
reads_dir="../../data/Haws-10/"
genome_folder="../../data/Haws-10/"
output_dir="../../analyses/Haws-10/"
score_min="L,0,-0.8"  # Single value for score_min

# The stderr redirect below fails if the output directory does not exist yet
mkdir -p "${output_dir}"

# Derive sample names from the R2 files
for file in "${reads_dir}"*_R2_val_2_val_2_val_2.fq.gz; do
    # If nothing matches, the shell hands back the literal pattern; skip it
    [ -e "${file}" ] || continue

    sample_name=$(basename "$file" "_R2_val_2_val_2_val_2.fq.gz")

    echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"

    # Run Bismark alignment; report a failure but carry on with other samples
    if ! /home/shared/Bismark-0.24.0/bismark \
        --path_to_bowtie2 /home/shared/bowtie2-2.4.4-linux-x86_64 \
        --genome "${genome_folder}" \
        -p 8 \
        --score_min "${score_min}" \
        -1 "${reads_dir}${sample_name}_R1_val_1_val_1_val_1.fq.gz" \
        -2 "${reads_dir}${sample_name}_R2_val_2_val_2_val_2.fq.gz" \
        --non_directional \
        -o "${output_dir}" \
        --basename "${sample_name}" \
        2> "${output_dir}/${sample_name}-bismark_summary.txt"; then
        echo "Bismark failed for sample ${sample_name}. Check ${output_dir}/${sample_name}-bismark_summary.txt for details." >&2
    fi
done
# Embed the Haws-10 MultiQC report as a full-width, 600px-tall iframe.
htmltools::tags$iframe(
  src = "https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-10/multiqc_report.html",
  width = "100%",
  height = "600px"
)

Deduplication

Notebook 14

/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-10

# Deduplicate every paired-end BAM in the Haws-10 analysis directory and
# write the results to the Haws-14 analysis directory.
bismark_home="/home/shared/Bismark-0.24.0"
align_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-10"
dedup_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14"

# List the BAMs, reduce each to its name without directory or .bam suffix,
# then hand those names to parallel, which runs up to 8 jobs at a time.
find "${align_dir}"/*.bam \
  | xargs -n 1 basename -s .bam \
  | parallel -j 8 "${bismark_home}/deduplicate_bismark" \
      --bam \
      --paired \
      --output_dir "${dedup_dir}" \
      "${align_dir}/{}.bam"

Methylation Extraction

# Run the Bismark methylation extractor on every deduplicated BAM in the
# Haws-14 analysis directory, writing its output back to the same directory.
dedup_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14"

# -I{} already runs one command per input line. GNU xargs treats -n and -I
# as mutually exclusive (it warns and ignores -n), so -n 1 is not passed.
find "${dedup_dir}"/*deduplicated.bam \
  | xargs -I{} /home/shared/Bismark-0.24.0/bismark_methylation_extractor \
      --bedGraph --counts --comprehensive --merge_non_CpG \
      --multicore 48 --buffer_size 75% \
      --output "${dedup_dir}" \
      {}

Methylation Calls

# Run coverage2cytosine on every deduplicated Bismark coverage file in the
# Haws-14 analysis directory, using the genome in data/Haws-10.
repo_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa"
cov_dir="${repo_dir}/analyses/Haws-14"
cov_suffix="_pe.deduplicated.bismark.cov.gz"

# Reduce each coverage file to its sample prefix, then let parallel start up
# to 48 jobs, substituting the prefix for {} in both the output name and the
# input path.
find "${cov_dir}"/*deduplicated.bismark.cov.gz \
  | xargs -n 1 basename -s "${cov_suffix}" \
  | parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \
      --genome_folder "${repo_dir}/data/Haws-10" \
      -o "${cov_dir}/{}" \
      --merge_CpG \
      --zero_based \
      "${cov_dir}/{}${cov_suffix}"
# Embed the Haws-14 MultiQC report as a full-width, 600px-tall iframe.
htmltools::tags$iframe(
  src = "https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-14/multiqc_report.html",
  width = "100%",
  height = "600px"
)

output

https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-14/

CpG_context_zr3644_10_pe.deduplicated.txt
CpG_context_zr3644_11_pe.deduplicated.txt
CpG_context_zr3644_12_pe.deduplicated.txt
CpG_context_zr3644_13_pe.deduplicated.txt
CpG_context_zr3644_14_pe.deduplicated.txt
CpG_context_zr3644_15_pe.deduplicated.txt
CpG_context_zr3644_16_pe.deduplicated.txt
CpG_context_zr3644_17_pe.deduplicated.txt
CpG_context_zr3644_18_pe.deduplicated.txt
CpG_context_zr3644_19_pe.deduplicated.txt
CpG_context_zr3644_1_pe.deduplicated.txt
CpG_context_zr3644_20_pe.deduplicated.txt
CpG_context_zr3644_21_pe.deduplicated.txt
CpG_context_zr3644_22_pe.deduplicated.txt
CpG_context_zr3644_23_pe.deduplicated.txt
CpG_context_zr3644_24_pe.deduplicated.txt
CpG_context_zr3644_2_pe.deduplicated.txt
CpG_context_zr3644_3_pe.deduplicated.txt
CpG_context_zr3644_4_pe.deduplicated.txt
CpG_context_zr3644_5_pe.deduplicated.txt
CpG_context_zr3644_6_pe.deduplicated.txt
CpG_context_zr3644_7_pe.deduplicated.txt
CpG_context_zr3644_8_pe.deduplicated.txt
CpG_context_zr3644_9_pe.deduplicated.txt
Non_CpG_context_zr3644_10_pe.deduplicated.txt
Non_CpG_context_zr3644_11_pe.deduplicated.txt
Non_CpG_context_zr3644_12_pe.deduplicated.txt
Non_CpG_context_zr3644_13_pe.deduplicated.txt
Non_CpG_context_zr3644_14_pe.deduplicated.txt
Non_CpG_context_zr3644_15_pe.deduplicated.txt
Non_CpG_context_zr3644_16_pe.deduplicated.txt
Non_CpG_context_zr3644_17_pe.deduplicated.txt
Non_CpG_context_zr3644_18_pe.deduplicated.txt
Non_CpG_context_zr3644_19_pe.deduplicated.txt
Non_CpG_context_zr3644_1_pe.deduplicated.txt
Non_CpG_context_zr3644_20_pe.deduplicated.txt
Non_CpG_context_zr3644_21_pe.deduplicated.txt
Non_CpG_context_zr3644_22_pe.deduplicated.txt
Non_CpG_context_zr3644_23_pe.deduplicated.txt
Non_CpG_context_zr3644_24_pe.deduplicated.txt
Non_CpG_context_zr3644_2_pe.deduplicated.txt
Non_CpG_context_zr3644_3_pe.deduplicated.txt
Non_CpG_context_zr3644_4_pe.deduplicated.txt
Non_CpG_context_zr3644_5_pe.deduplicated.txt
Non_CpG_context_zr3644_6_pe.deduplicated.txt
Non_CpG_context_zr3644_7_pe.deduplicated.txt
Non_CpG_context_zr3644_8_pe.deduplicated.txt
Non_CpG_context_zr3644_9_pe.deduplicated.txt
zr3644_10.CpG_report.merged_CpG_evidence.cov
zr3644_10.CpG_report.txt
zr3644_10.cytosine_context_summary.txt
zr3644_10_pe.deduplicated.bam
zr3644_10_pe.deduplicated.bedGraph.gz
zr3644_10_pe.deduplicated.bismark.cov.gz
zr3644_10_pe.deduplicated.M-bias.txt
zr3644_10_pe.deduplicated_splitting_report.txt
zr3644_10_pe.deduplication_report.txt
zr3644_11.CpG_report.merged_CpG_evidence.cov
zr3644_11.CpG_report.txt
zr3644_11.cytosine_context_summary.txt
zr3644_11_pe.deduplicated.bam
zr3644_11_pe.deduplicated.bedGraph.gz
zr3644_11_pe.deduplicated.bismark.cov.gz
zr3644_11_pe.deduplicated.M-bias.txt
zr3644_11_pe.deduplicated_splitting_report.txt
zr3644_11_pe.deduplication_report.txt
zr3644_12.CpG_report.merged_CpG_evidence.cov
zr3644_12.CpG_report.txt
zr3644_12.cytosine_context_summary.txt
zr3644_12_pe.deduplicated.bam
zr3644_12_pe.deduplicated.bedGraph.gz
zr3644_12_pe.deduplicated.bismark.cov.gz
zr3644_12_pe.deduplicated.M-bias.txt
zr3644_12_pe.deduplicated_splitting_report.txt
zr3644_12_pe.deduplication_report.txt
zr3644_13.CpG_report.merged_CpG_evidence.cov
zr3644_13.CpG_report.txt
zr3644_13.cytosine_context_summary.txt
zr3644_13_pe.deduplicated.bam
zr3644_13_pe.deduplicated.bedGraph.gz
zr3644_13_pe.deduplicated.bismark.cov.gz
zr3644_13_pe.deduplicated.M-bias.txt
zr3644_13_pe.deduplicated_splitting_report.txt
zr3644_13_pe.deduplication_report.txt
zr3644_14.CpG_report.merged_CpG_evidence.cov
zr3644_14.CpG_report.txt
zr3644_14.cytosine_context_summary.txt
zr3644_14_pe.deduplicated.bam
zr3644_14_pe.deduplicated.bedGraph.gz
zr3644_14_pe.deduplicated.bismark.cov.gz
zr3644_14_pe.deduplicated.M-bias.txt
zr3644_14_pe.deduplicated_splitting_report.txt
zr3644_14_pe.deduplication_report.txt
zr3644_15.CpG_report.merged_CpG_evidence.cov
zr3644_15.CpG_report.txt
zr3644_15.cytosine_context_summary.txt
zr3644_15_pe.deduplicated.bam
zr3644_15_pe.deduplicated.bedGraph.gz
zr3644_15_pe.deduplicated.bismark.cov.gz
zr3644_15_pe.deduplicated.M-bias.txt
zr3644_15_pe.deduplicated_splitting_report.txt
zr3644_15_pe.deduplication_report.txt
zr3644_16.CpG_report.merged_CpG_evidence.cov
zr3644_16.CpG_report.txt
zr3644_16.cytosine_context_summary.txt
zr3644_16_pe.deduplicated.bam
zr3644_16_pe.deduplicated.bedGraph.gz
zr3644_16_pe.deduplicated.bismark.cov.gz
zr3644_16_pe.deduplicated.M-bias.txt
zr3644_16_pe.deduplicated_splitting_report.txt
zr3644_16_pe.deduplication_report.txt
zr3644_17.CpG_report.merged_CpG_evidence.cov
zr3644_17.CpG_report.txt
zr3644_17.cytosine_context_summary.txt
zr3644_17_pe.deduplicated.bam
zr3644_17_pe.deduplicated.bedGraph.gz
zr3644_17_pe.deduplicated.bismark.cov.gz
zr3644_17_pe.deduplicated.M-bias.txt
zr3644_17_pe.deduplicated_splitting_report.txt
zr3644_17_pe.deduplication_report.txt
zr3644_18.CpG_report.merged_CpG_evidence.cov
zr3644_18.CpG_report.txt
zr3644_18.cytosine_context_summary.txt
zr3644_18_pe.deduplicated.bam
zr3644_18_pe.deduplicated.bedGraph.gz
zr3644_18_pe.deduplicated.bismark.cov.gz
zr3644_18_pe.deduplicated.M-bias.txt
zr3644_18_pe.deduplicated_splitting_report.txt
zr3644_18_pe.deduplication_report.txt
zr3644_19.CpG_report.merged_CpG_evidence.cov
zr3644_19.CpG_report.txt
zr3644_19.cytosine_context_summary.txt
zr3644_19_pe.deduplicated.bam
zr3644_19_pe.deduplicated.bedGraph.gz
zr3644_19_pe.deduplicated.bismark.cov.gz
zr3644_19_pe.deduplicated.M-bias.txt
zr3644_19_pe.deduplicated_splitting_report.txt
zr3644_19_pe.deduplication_report.txt
zr3644_1.CpG_report.merged_CpG_evidence.cov
zr3644_1.CpG_report.txt
zr3644_1.cytosine_context_summary.txt
zr3644_1_pe.deduplicated.bam
zr3644_1_pe.deduplicated.bedGraph.gz
zr3644_1_pe.deduplicated.bismark.cov.gz
zr3644_1_pe.deduplicated.M-bias.txt
zr3644_1_pe.deduplicated_splitting_report.txt
zr3644_1_pe.deduplication_report.txt
zr3644_20.CpG_report.merged_CpG_evidence.cov
zr3644_20.CpG_report.txt
zr3644_20.cytosine_context_summary.txt
zr3644_20_pe.deduplicated.bam
zr3644_20_pe.deduplicated.bedGraph.gz
zr3644_20_pe.deduplicated.bismark.cov.gz
zr3644_20_pe.deduplicated.M-bias.txt
zr3644_20_pe.deduplicated_splitting_report.txt
zr3644_20_pe.deduplication_report.txt
zr3644_21.CpG_report.merged_CpG_evidence.cov
zr3644_21.CpG_report.txt
zr3644_21.cytosine_context_summary.txt
zr3644_21_pe.deduplicated.bam
zr3644_21_pe.deduplicated.bedGraph.gz
zr3644_21_pe.deduplicated.bismark.cov.gz
zr3644_21_pe.deduplicated.M-bias.txt
zr3644_21_pe.deduplicated_splitting_report.txt
zr3644_21_pe.deduplication_report.txt
zr3644_22.CpG_report.merged_CpG_evidence.cov
zr3644_22.CpG_report.txt
zr3644_22.cytosine_context_summary.txt
zr3644_22_pe.deduplicated.bam
zr3644_22_pe.deduplicated.bedGraph.gz
zr3644_22_pe.deduplicated.bismark.cov.gz
zr3644_22_pe.deduplicated.M-bias.txt
zr3644_22_pe.deduplicated_splitting_report.txt
zr3644_22_pe.deduplication_report.txt
zr3644_23.CpG_report.merged_CpG_evidence.cov
zr3644_23.CpG_report.txt
zr3644_23.cytosine_context_summary.txt
zr3644_23_pe.deduplicated.bam
zr3644_23_pe.deduplicated.bedGraph.gz
zr3644_23_pe.deduplicated.bismark.cov.gz
zr3644_23_pe.deduplicated.M-bias.txt
zr3644_23_pe.deduplicated_splitting_report.txt
zr3644_23_pe.deduplication_report.txt
zr3644_24.CpG_report.merged_CpG_evidence.cov
zr3644_24.CpG_report.txt
zr3644_24.cytosine_context_summary.txt
zr3644_24_pe.deduplicated.bam
zr3644_24_pe.deduplicated.bedGraph.gz
zr3644_24_pe.deduplicated.bismark.cov.gz
zr3644_24_pe.deduplicated.M-bias.txt
zr3644_24_pe.deduplicated_splitting_report.txt
zr3644_24_pe.deduplication_report.txt
zr3644_2.CpG_report.merged_CpG_evidence.cov
zr3644_2.CpG_report.txt
zr3644_2.cytosine_context_summary.txt
zr3644_2_pe.deduplicated.bam
zr3644_2_pe.deduplicated.bedGraph.gz
zr3644_2_pe.deduplicated.bismark.cov.gz
zr3644_2_pe.deduplicated.M-bias.txt
zr3644_2_pe.deduplicated_splitting_report.txt
zr3644_2_pe.deduplication_report.txt
zr3644_3.CpG_report.merged_CpG_evidence.cov
zr3644_3.CpG_report.txt
zr3644_3.cytosine_context_summary.txt
zr3644_3_pe.deduplicated.bam
zr3644_3_pe.deduplicated.bedGraph.gz
zr3644_3_pe.deduplicated.bismark.cov.gz
zr3644_3_pe.deduplicated.M-bias.txt
zr3644_3_pe.deduplicated_splitting_report.txt
zr3644_3_pe.deduplication_report.txt
zr3644_4.CpG_report.merged_CpG_evidence.cov
zr3644_4.CpG_report.txt
zr3644_4.cytosine_context_summary.txt
zr3644_4_pe.deduplicated.bam
zr3644_4_pe.deduplicated.bedGraph.gz
zr3644_4_pe.deduplicated.bismark.cov.gz
zr3644_4_pe.deduplicated.M-bias.txt
zr3644_4_pe.deduplicated_splitting_report.txt
zr3644_4_pe.deduplication_report.txt
zr3644_5.CpG_report.merged_CpG_evidence.cov
zr3644_5.CpG_report.txt
zr3644_5.cytosine_context_summary.txt
zr3644_5_pe.deduplicated.bam
zr3644_5_pe.deduplicated.bedGraph.gz
zr3644_5_pe.deduplicated.bismark.cov.gz
zr3644_5_pe.deduplicated.M-bias.txt
zr3644_5_pe.deduplicated_splitting_report.txt
zr3644_5_pe.deduplication_report.txt
zr3644_6.CpG_report.merged_CpG_evidence.cov
zr3644_6.CpG_report.txt
zr3644_6.cytosine_context_summary.txt
zr3644_6_pe.deduplicated.bam
zr3644_6_pe.deduplicated.bedGraph.gz
zr3644_6_pe.deduplicated.bismark.cov.gz
zr3644_6_pe.deduplicated.M-bias.txt
zr3644_6_pe.deduplicated_splitting_report.txt
zr3644_6_pe.deduplication_report.txt
zr3644_7.CpG_report.merged_CpG_evidence.cov
zr3644_7.CpG_report.txt
zr3644_7.cytosine_context_summary.txt
zr3644_7_pe.deduplicated.bam
zr3644_7_pe.deduplicated.bedGraph.gz
zr3644_7_pe.deduplicated.bismark.cov.gz
zr3644_7_pe.deduplicated.M-bias.txt
zr3644_7_pe.deduplicated_splitting_report.txt
zr3644_7_pe.deduplication_report.txt
zr3644_8.CpG_report.merged_CpG_evidence.cov
zr3644_8.CpG_report.txt
zr3644_8.cytosine_context_summary.txt
zr3644_8_pe.deduplicated.bam
zr3644_8_pe.deduplicated.bedGraph.gz
zr3644_8_pe.deduplicated.bismark.cov.gz
zr3644_8_pe.deduplicated.M-bias.txt
zr3644_8_pe.deduplicated_splitting_report.txt
zr3644_8_pe.deduplication_report.txt
zr3644_9.CpG_report.merged_CpG_evidence.cov
zr3644_9.CpG_report.txt
zr3644_9.cytosine_context_summary.txt
zr3644_9_pe.deduplicated.bam
zr3644_9_pe.deduplicated.bedGraph.gz
zr3644_9_pe.deduplicated.bismark.cov.gz
zr3644_9_pe.deduplicated.M-bias.txt
zr3644_9_pe.deduplicated_splitting_report.txt
zr3644_9_pe.deduplication_report.txt