# Global knitr chunk defaults for this notebook.
# The namespace prefix must precede `::`, and `echo` is its own argument.
knitr::opts_chunk$set(
  echo = TRUE, # Display code chunks
  eval = FALSE, # Avoid automatic execution
  warning = FALSE, # Hide warnings
  message = FALSE, # Hide messages
  fig.width = 6, # Set default plot width
  fig.height = 4, # Set default plot height
  fig.align = "center", # Center align plots
  comment = "" # Prevents '##' in output
)
Let's redo the Bismark alignment with the new genome.
Download Reads
# Recursively fetch the trimmed reads into ../../data/Haws-10.
# -A takes the accept pattern (only files matching *fq.gz are kept);
# the URL is the positional argument and must come last.
wget -r \
  --no-parent \
  --no-directories \
  -P ../../data/Haws-10 \
  -A "*fq.gz" \
  https://gannet.fish.washington.edu/spartina/project-oyster-oa/Haws/trimmed-data-2/
Download new genome
# Download the GCF_963853765.1 genome package with the NCBI datasets CLI,
# then unpack the resulting archive.
data_dir="../../data/Haws-10"

cd "${data_dir}"

/home/shared/datasets download genome accession GCF_963853765.1 \
  --include gff3,rna,cds,protein,genome,seq-report

# NOTE(review): this relative path is resolved from inside data/Haws-10
# (after the cd above) -- confirm it points at the downloaded archive.
unzip "${data_dir}/ncbi_dataset.zip"
Genome Prep
# Directories and programs
bismark_dir="/home/shared/Bismark-0.24.0/"
bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/"
genome_folder="../../data/Haws-10/"

# Build the bisulfite-converted genome index.
# --path_to_aligner takes the Bowtie 2 directory; the genome folder is the
# trailing positional argument.
"${bismark_dir}bismark_genome_preparation" \
  --verbose \
  --parallel 12 \
  --path_to_aligner "${bowtie2_dir}" \
  "${genome_folder}"
Checking alignment min scores
# List the contents of the data directory
ls ../../data/Haws-10/
# Test several Bismark --score_min settings on a 25,000-read subset (-u 25000)
# of every sample, writing each run to its own sub-directory.
# A sample is recorded in the checkpoint file only when ALL of its runs
# succeed, so failed samples are retried on the next execution.

# Set directories and files
reads_dir="../../data/Haws-10/"
genome_folder="../../data/Haws-10/"
output_dir="../../analyses/Haws-10/"
checkpoint_file="../../analyses/Haws-10/completed_samples.log"
bismark_dir="/home/shared/Bismark-0.24.0/"
bowtie2_dir="/home/shared/bowtie2-2.4.4-linux-x86_64/"

# score_min parameters to test (identical for every sample)
score_min_params=(
  "L,0,-0.4"
  "L,0,-0.6"
  "L,0,-0.8"
  "L,0,-1.0"
  "L,-1,-0.6"
)

# Make sure the output directory and checkpoint file exist
mkdir -p "${output_dir}"
touch "${checkpoint_file}"

# Loop over the R2 files; the sample name is the file name minus the suffix
for file in "${reads_dir}"*_R2_val_2_val_2_val_2.fq.gz; do
  # An unmatched glob is left as a literal pattern; skip it
  [ -e "${file}" ] || continue

  sample_name=$(basename "${file}" "_R2_val_2_val_2_val_2.fq.gz")

  # Check if the sample has already been processed (exact, literal line match)
  if grep -qxF -- "${sample_name}" "${checkpoint_file}"; then
    echo "Sample ${sample_name} already processed. Skipping..."
    continue
  fi

  # Becomes 1 as soon as any parameter run fails for this sample
  sample_failed=0

  # Loop through each score_min parameter
  for score_min in "${score_min_params[@]}"; do
    echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"

    # Commas are stripped so the parameter can be used in file names
    score_tag="${score_min//,/}"

    # Create a subdirectory for this parameter
    param_output_dir="${output_dir}/${sample_name}_score_${score_tag}"
    mkdir -p "${param_output_dir}"

    # Bismark's stderr (which carries the run summary) is captured here
    run_log="${param_output_dir}/${sample_name}-bismark_summary.txt"

    # Run Bismark alignment and test its exit status directly
    if "${bismark_dir}bismark" \
      -genome "${genome_folder}" \
      -p 16 \
      -u 25000 \
      -score_min "${score_min}" \
      --non_directional \
      --path_to_bowtie2 "${bowtie2_dir}" \
      -1 "${reads_dir}${sample_name}_R1_val_1_val_1_val_1.fq.gz" \
      -2 "${reads_dir}${sample_name}_R2_val_2_val_2_val_2.fq.gz" \
      -o "${param_output_dir}" \
      --basename "${sample_name}_${score_tag}" \
      2> "${run_log}"; then
      echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
    else
      sample_failed=1
      echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${run_log} for details."
    fi
  done

  # Mark the sample as completed only if every parameter run succeeded
  if [ "${sample_failed}" -eq 0 ]; then
    echo "${sample_name}" >> "${checkpoint_file}"
    echo "All tests for sample ${sample_name} completed."
  else
    echo "Sample ${sample_name} encountered errors. Check logs for details."
  fi
done
# Collect the mapping efficiency of every <sample>_score_<tag> test run into
# one CSV (columns: Sample,Score_Min,Alignment_Rate).

# NOTE(review): falls back to the analysis directory used by the test runs
# when output_dir is not already set in this shell -- confirm the path.
output_dir="${output_dir:-../../analyses/Haws-10/}"

# Define summary file
summary_file="${output_dir}/parameter_comparison_summary.csv"

# Initialize summary file
echo "Sample,Score_Min,Alignment_Rate" > "${summary_file}"

# Loop through parameter output directories
for dir in "${output_dir}"/*_score_*; do
  [ -d "${dir}" ] || continue

  dir_name=$(basename "${dir}")

  # Directory names look like <sample>_score_<tag>. Sample names contain
  # underscores themselves, so split on the "_score_" marker instead of
  # taking the first "_"-delimited field.
  sample_name="${dir_name%_score_*}"
  score_tag="${dir_name##*_score_}"

  # The runs used --basename <sample>_<tag>; the tag is the score_min value
  # with its commas removed, so it is reported as-is (commas would also
  # break the CSV columns).
  report_path="${dir}/${sample_name}_${score_tag}_PE_report.txt"
  [ -f "${report_path}" ] || continue

  # Third whitespace-separated field of the "Mapping efficiency:" line
  mapping=$(awk '/Mapping efficiency:/ {print $3; exit}' "${report_path}")
  echo "${sample_name},${score_tag},${mapping}" >> "${summary_file}"
done
The summary file does not work; did a manual inspection instead.
Full Run
# Align every sample against the genome with the chosen score_min setting.
# Bismark's stderr for each sample is saved to <sample>-bismark_summary.txt.

# Set variables
reads_dir="../../data/Haws-10/"
genome_folder="../../data/Haws-10/"
output_dir="../../analyses/Haws-10/"
score_min="L,0,-0.8" # Single value for score_min

# The stderr redirect below fails if the output directory is missing
mkdir -p "${output_dir}"

# Loop over the R2 files; the sample name is the file name minus the suffix
for file in "${reads_dir}"*_R2_val_2_val_2_val_2.fq.gz; do
  # An unmatched glob is left as a literal pattern; skip it
  [ -e "${file}" ] || continue

  sample_name=$(basename "${file}" "_R2_val_2_val_2_val_2.fq.gz")

  echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"

  # Run Bismark alignment
  /home/shared/Bismark-0.24.0/bismark \
    --path_to_bowtie2 /home/shared/bowtie2-2.4.4-linux-x86_64 \
    -genome "${genome_folder}" \
    -p 8 \
    -score_min "${score_min}" \
    -1 "${reads_dir}${sample_name}_R1_val_1_val_1_val_1.fq.gz" \
    -2 "${reads_dir}${sample_name}_R2_val_2_val_2_val_2.fq.gz" \
    --non_directional \
    -o "${output_dir}" \
    --basename "${sample_name}" \
    2> "${output_dir}/${sample_name}-bismark_summary.txt"
done
# Embed the Haws-10 MultiQC report in the rendered notebook
htmltools::tags$iframe(src = "https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-10/multiqc_report.html", width = "100%", height = "600px")
Deduplication
Notebook 14
/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-10
# Deduplicate every paired-end Bismark BAM from Haws-10, eight at a time,
# writing the results to Haws-14.
aligned_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-10"
dedup_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14"

# find lists the BAMs, basename strips directory and ".bam", and parallel
# substitutes each remaining stem for {}.
find "${aligned_dir}"/*.bam \
  | xargs -n 1 basename -s .bam \
  | parallel -j 8 /home/shared/Bismark-0.24.0/deduplicate_bismark \
      --bam \
      --paired \
      --output_dir "${dedup_dir}" \
      "${aligned_dir}/{}.bam"
Methylation Extraction
# Run the Bismark methylation extractor on every deduplicated BAM in Haws-14,
# one file per invocation.
# -I{} already makes xargs run one command per input line; GNU xargs treats
# -n and -I as mutually exclusive, so -n 1 is dropped.
# --output_dir is the extractor's full option name for the output directory.
find /home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14/*deduplicated.bam \
  | xargs -I{} /home/shared/Bismark-0.24.0/bismark_methylation_extractor \
      --bedGraph \
      --counts \
      --comprehensive \
      --merge_non_CpG \
      --multicore 48 \
      --buffer_size 75% \
      --output_dir /home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14 \
      {}
Methylation Calls
# Run coverage2cytosine on every deduplicated coverage file in Haws-14,
# merging CpG evidence and using zero-based coordinates.
meth_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/analyses/Haws-14"
genome_dir="/home/shared/16TB_HDD_01/sr320/github/project-oyster-oa/data/Haws-10"

# basename strips the directory and the coverage suffix; parallel substitutes
# the remaining sample stem for {}.
find "${meth_dir}"/*deduplicated.bismark.cov.gz \
  | xargs -n 1 basename -s _pe.deduplicated.bismark.cov.gz \
  | parallel -j 48 /home/shared/Bismark-0.24.0/coverage2cytosine \
      --genome_folder "${genome_dir}" \
      -o "${meth_dir}/{}" \
      --merge_CpG \
      --zero_based \
      "${meth_dir}/{}_pe.deduplicated.bismark.cov.gz"
# Embed the Haws-14 MultiQC report in the rendered notebook
htmltools::tags$iframe(src = "https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-14/multiqc_report.html", width = "100%", height = "600px")
output
https://gannet.fish.washington.edu/seashell/bu-github/project-oyster-oa/analyses/Haws-14/
CpG_context_zr3644_10_pe.deduplicated.txt
CpG_context_zr3644_11_pe.deduplicated.txt
CpG_context_zr3644_12_pe.deduplicated.txt
CpG_context_zr3644_13_pe.deduplicated.txt
CpG_context_zr3644_14_pe.deduplicated.txt
CpG_context_zr3644_15_pe.deduplicated.txt
CpG_context_zr3644_16_pe.deduplicated.txt
CpG_context_zr3644_17_pe.deduplicated.txt
CpG_context_zr3644_18_pe.deduplicated.txt
CpG_context_zr3644_19_pe.deduplicated.txt
CpG_context_zr3644_1_pe.deduplicated.txt
CpG_context_zr3644_20_pe.deduplicated.txt
CpG_context_zr3644_21_pe.deduplicated.txt
CpG_context_zr3644_22_pe.deduplicated.txt
CpG_context_zr3644_23_pe.deduplicated.txt
CpG_context_zr3644_24_pe.deduplicated.txt
CpG_context_zr3644_2_pe.deduplicated.txt
CpG_context_zr3644_3_pe.deduplicated.txt
CpG_context_zr3644_4_pe.deduplicated.txt
CpG_context_zr3644_5_pe.deduplicated.txt
CpG_context_zr3644_6_pe.deduplicated.txt
CpG_context_zr3644_7_pe.deduplicated.txt
CpG_context_zr3644_8_pe.deduplicated.txt
CpG_context_zr3644_9_pe.deduplicated.txt
Non_CpG_context_zr3644_10_pe.deduplicated.txt
Non_CpG_context_zr3644_11_pe.deduplicated.txt
Non_CpG_context_zr3644_12_pe.deduplicated.txt
Non_CpG_context_zr3644_13_pe.deduplicated.txt
Non_CpG_context_zr3644_14_pe.deduplicated.txt
Non_CpG_context_zr3644_15_pe.deduplicated.txt
Non_CpG_context_zr3644_16_pe.deduplicated.txt
Non_CpG_context_zr3644_17_pe.deduplicated.txt
Non_CpG_context_zr3644_18_pe.deduplicated.txt
Non_CpG_context_zr3644_19_pe.deduplicated.txt
Non_CpG_context_zr3644_1_pe.deduplicated.txt
Non_CpG_context_zr3644_20_pe.deduplicated.txt
Non_CpG_context_zr3644_21_pe.deduplicated.txt
Non_CpG_context_zr3644_22_pe.deduplicated.txt
Non_CpG_context_zr3644_23_pe.deduplicated.txt
Non_CpG_context_zr3644_24_pe.deduplicated.txt
Non_CpG_context_zr3644_2_pe.deduplicated.txt
Non_CpG_context_zr3644_3_pe.deduplicated.txt
Non_CpG_context_zr3644_4_pe.deduplicated.txt
Non_CpG_context_zr3644_5_pe.deduplicated.txt
Non_CpG_context_zr3644_6_pe.deduplicated.txt
Non_CpG_context_zr3644_7_pe.deduplicated.txt
Non_CpG_context_zr3644_8_pe.deduplicated.txt
Non_CpG_context_zr3644_9_pe.deduplicated.txt
zr3644_10.CpG_report.merged_CpG_evidence.cov
zr3644_10.CpG_report.txt
zr3644_10.cytosine_context_summary.txt
zr3644_10_pe.deduplicated.bam
zr3644_10_pe.deduplicated.bedGraph.gz
zr3644_10_pe.deduplicated.bismark.cov.gz
zr3644_10_pe.deduplicated.M-bias.txt
zr3644_10_pe.deduplicated_splitting_report.txt
zr3644_10_pe.deduplication_report.txt
zr3644_11.CpG_report.merged_CpG_evidence.cov
zr3644_11.CpG_report.txt
zr3644_11.cytosine_context_summary.txt
zr3644_11_pe.deduplicated.bam
zr3644_11_pe.deduplicated.bedGraph.gz
zr3644_11_pe.deduplicated.bismark.cov.gz
zr3644_11_pe.deduplicated.M-bias.txt
zr3644_11_pe.deduplicated_splitting_report.txt
zr3644_11_pe.deduplication_report.txt
zr3644_12.CpG_report.merged_CpG_evidence.cov
zr3644_12.CpG_report.txt
zr3644_12.cytosine_context_summary.txt
zr3644_12_pe.deduplicated.bam
zr3644_12_pe.deduplicated.bedGraph.gz
zr3644_12_pe.deduplicated.bismark.cov.gz
zr3644_12_pe.deduplicated.M-bias.txt
zr3644_12_pe.deduplicated_splitting_report.txt
zr3644_12_pe.deduplication_report.txt
zr3644_13.CpG_report.merged_CpG_evidence.cov
zr3644_13.CpG_report.txt
zr3644_13.cytosine_context_summary.txt
zr3644_13_pe.deduplicated.bam
zr3644_13_pe.deduplicated.bedGraph.gz
zr3644_13_pe.deduplicated.bismark.cov.gz
zr3644_13_pe.deduplicated.M-bias.txt
zr3644_13_pe.deduplicated_splitting_report.txt
zr3644_13_pe.deduplication_report.txt
zr3644_14.CpG_report.merged_CpG_evidence.cov
zr3644_14.CpG_report.txt
zr3644_14.cytosine_context_summary.txt
zr3644_14_pe.deduplicated.bam
zr3644_14_pe.deduplicated.bedGraph.gz
zr3644_14_pe.deduplicated.bismark.cov.gz
zr3644_14_pe.deduplicated.M-bias.txt
zr3644_14_pe.deduplicated_splitting_report.txt
zr3644_14_pe.deduplication_report.txt
zr3644_15.CpG_report.merged_CpG_evidence.cov
zr3644_15.CpG_report.txt
zr3644_15.cytosine_context_summary.txt
zr3644_15_pe.deduplicated.bam
zr3644_15_pe.deduplicated.bedGraph.gz
zr3644_15_pe.deduplicated.bismark.cov.gz
zr3644_15_pe.deduplicated.M-bias.txt
zr3644_15_pe.deduplicated_splitting_report.txt
zr3644_15_pe.deduplication_report.txt
zr3644_16.CpG_report.merged_CpG_evidence.cov
zr3644_16.CpG_report.txt
zr3644_16.cytosine_context_summary.txt
zr3644_16_pe.deduplicated.bam
zr3644_16_pe.deduplicated.bedGraph.gz
zr3644_16_pe.deduplicated.bismark.cov.gz
zr3644_16_pe.deduplicated.M-bias.txt
zr3644_16_pe.deduplicated_splitting_report.txt
zr3644_16_pe.deduplication_report.txt
zr3644_17.CpG_report.merged_CpG_evidence.cov
zr3644_17.CpG_report.txt
zr3644_17.cytosine_context_summary.txt
zr3644_17_pe.deduplicated.bam
zr3644_17_pe.deduplicated.bedGraph.gz
zr3644_17_pe.deduplicated.bismark.cov.gz
zr3644_17_pe.deduplicated.M-bias.txt
zr3644_17_pe.deduplicated_splitting_report.txt
zr3644_17_pe.deduplication_report.txt
zr3644_18.CpG_report.merged_CpG_evidence.cov
zr3644_18.CpG_report.txt
zr3644_18.cytosine_context_summary.txt
zr3644_18_pe.deduplicated.bam
zr3644_18_pe.deduplicated.bedGraph.gz
zr3644_18_pe.deduplicated.bismark.cov.gz
zr3644_18_pe.deduplicated.M-bias.txt
zr3644_18_pe.deduplicated_splitting_report.txt
zr3644_18_pe.deduplication_report.txt
zr3644_19.CpG_report.merged_CpG_evidence.cov
zr3644_19.CpG_report.txt
zr3644_19.cytosine_context_summary.txt
zr3644_19_pe.deduplicated.bam
zr3644_19_pe.deduplicated.bedGraph.gz
zr3644_19_pe.deduplicated.bismark.cov.gz
zr3644_19_pe.deduplicated.M-bias.txt
zr3644_19_pe.deduplicated_splitting_report.txt
zr3644_19_pe.deduplication_report.txt
zr3644_1.CpG_report.merged_CpG_evidence.cov
zr3644_1.CpG_report.txt
zr3644_1.cytosine_context_summary.txt
zr3644_1_pe.deduplicated.bam
zr3644_1_pe.deduplicated.bedGraph.gz
zr3644_1_pe.deduplicated.bismark.cov.gz
zr3644_1_pe.deduplicated.M-bias.txt
zr3644_1_pe.deduplicated_splitting_report.txt
zr3644_1_pe.deduplication_report.txt
zr3644_20.CpG_report.merged_CpG_evidence.cov
zr3644_20.CpG_report.txt
zr3644_20.cytosine_context_summary.txt
zr3644_20_pe.deduplicated.bam
zr3644_20_pe.deduplicated.bedGraph.gz
zr3644_20_pe.deduplicated.bismark.cov.gz
zr3644_20_pe.deduplicated.M-bias.txt
zr3644_20_pe.deduplicated_splitting_report.txt
zr3644_20_pe.deduplication_report.txt
zr3644_21.CpG_report.merged_CpG_evidence.cov
zr3644_21.CpG_report.txt
zr3644_21.cytosine_context_summary.txt
zr3644_21_pe.deduplicated.bam
zr3644_21_pe.deduplicated.bedGraph.gz
zr3644_21_pe.deduplicated.bismark.cov.gz
zr3644_21_pe.deduplicated.M-bias.txt
zr3644_21_pe.deduplicated_splitting_report.txt
zr3644_21_pe.deduplication_report.txt
zr3644_22.CpG_report.merged_CpG_evidence.cov
zr3644_22.CpG_report.txt
zr3644_22.cytosine_context_summary.txt
zr3644_22_pe.deduplicated.bam
zr3644_22_pe.deduplicated.bedGraph.gz
zr3644_22_pe.deduplicated.bismark.cov.gz
zr3644_22_pe.deduplicated.M-bias.txt
zr3644_22_pe.deduplicated_splitting_report.txt
zr3644_22_pe.deduplication_report.txt
zr3644_23.CpG_report.merged_CpG_evidence.cov
zr3644_23.CpG_report.txt
zr3644_23.cytosine_context_summary.txt
zr3644_23_pe.deduplicated.bam
zr3644_23_pe.deduplicated.bedGraph.gz
zr3644_23_pe.deduplicated.bismark.cov.gz
zr3644_23_pe.deduplicated.M-bias.txt
zr3644_23_pe.deduplicated_splitting_report.txt
zr3644_23_pe.deduplication_report.txt
zr3644_24.CpG_report.merged_CpG_evidence.cov
zr3644_24.CpG_report.txt
zr3644_24.cytosine_context_summary.txt
zr3644_24_pe.deduplicated.bam
zr3644_24_pe.deduplicated.bedGraph.gz
zr3644_24_pe.deduplicated.bismark.cov.gz
zr3644_24_pe.deduplicated.M-bias.txt
zr3644_24_pe.deduplicated_splitting_report.txt
zr3644_24_pe.deduplication_report.txt
zr3644_2.CpG_report.merged_CpG_evidence.cov
zr3644_2.CpG_report.txt
zr3644_2.cytosine_context_summary.txt
zr3644_2_pe.deduplicated.bam
zr3644_2_pe.deduplicated.bedGraph.gz
zr3644_2_pe.deduplicated.bismark.cov.gz
zr3644_2_pe.deduplicated.M-bias.txt
zr3644_2_pe.deduplicated_splitting_report.txt
zr3644_2_pe.deduplication_report.txt
zr3644_3.CpG_report.merged_CpG_evidence.cov
zr3644_3.CpG_report.txt
zr3644_3.cytosine_context_summary.txt
zr3644_3_pe.deduplicated.bam
zr3644_3_pe.deduplicated.bedGraph.gz
zr3644_3_pe.deduplicated.bismark.cov.gz
zr3644_3_pe.deduplicated.M-bias.txt
zr3644_3_pe.deduplicated_splitting_report.txt
zr3644_3_pe.deduplication_report.txt
zr3644_4.CpG_report.merged_CpG_evidence.cov
zr3644_4.CpG_report.txt
zr3644_4.cytosine_context_summary.txt
zr3644_4_pe.deduplicated.bam
zr3644_4_pe.deduplicated.bedGraph.gz
zr3644_4_pe.deduplicated.bismark.cov.gz
zr3644_4_pe.deduplicated.M-bias.txt
zr3644_4_pe.deduplicated_splitting_report.txt
zr3644_4_pe.deduplication_report.txt
zr3644_5.CpG_report.merged_CpG_evidence.cov
zr3644_5.CpG_report.txt
zr3644_5.cytosine_context_summary.txt
zr3644_5_pe.deduplicated.bam
zr3644_5_pe.deduplicated.bedGraph.gz
zr3644_5_pe.deduplicated.bismark.cov.gz
zr3644_5_pe.deduplicated.M-bias.txt
zr3644_5_pe.deduplicated_splitting_report.txt
zr3644_5_pe.deduplication_report.txt
zr3644_6.CpG_report.merged_CpG_evidence.cov
zr3644_6.CpG_report.txt
zr3644_6.cytosine_context_summary.txt
zr3644_6_pe.deduplicated.bam
zr3644_6_pe.deduplicated.bedGraph.gz
zr3644_6_pe.deduplicated.bismark.cov.gz
zr3644_6_pe.deduplicated.M-bias.txt
zr3644_6_pe.deduplicated_splitting_report.txt
zr3644_6_pe.deduplication_report.txt
zr3644_7.CpG_report.merged_CpG_evidence.cov
zr3644_7.CpG_report.txt
zr3644_7.cytosine_context_summary.txt
zr3644_7_pe.deduplicated.bam
zr3644_7_pe.deduplicated.bedGraph.gz
zr3644_7_pe.deduplicated.bismark.cov.gz
zr3644_7_pe.deduplicated.M-bias.txt
zr3644_7_pe.deduplicated_splitting_report.txt
zr3644_7_pe.deduplication_report.txt
zr3644_8.CpG_report.merged_CpG_evidence.cov
zr3644_8.CpG_report.txt
zr3644_8.cytosine_context_summary.txt
zr3644_8_pe.deduplicated.bam
zr3644_8_pe.deduplicated.bedGraph.gz
zr3644_8_pe.deduplicated.bismark.cov.gz
zr3644_8_pe.deduplicated.M-bias.txt
zr3644_8_pe.deduplicated_splitting_report.txt
zr3644_8_pe.deduplication_report.txt
zr3644_9.CpG_report.merged_CpG_evidence.cov
zr3644_9.CpG_report.txt
zr3644_9.cytosine_context_summary.txt
zr3644_9_pe.deduplicated.bam
zr3644_9_pe.deduplicated.bedGraph.gz
zr3644_9_pe.deduplicated.bismark.cov.gz
zr3644_9_pe.deduplicated.M-bias.txt
zr3644_9_pe.deduplicated_splitting_report.txt
zr3644_9_pe.deduplication_report.txt