Finally got the kinks out of the code! This runs on Klone with a SLURM job file and a shell script.
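For reference, everything kicks off with a single sbatch submission (the filename below is just a placeholder for wherever the SLURM file is saved in the repo):
sbatch 04.2-bismark-align.slurm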
Slurm
#!/bin/sh
#SBATCH --job-name=04-cea-bis # Job name
#SBATCH --output=%x_%A_%a.slurm.out # Standard output and error log
#SBATCH --error=%x_%A_%a.slurm.err # Error log
#SBATCH --account=srlab
#SBATCH --partition=ckpt #update this line - use hyakalloc to find partitions you can use
#SBATCH --time=10-02:00:00
#SBATCH --ntasks=1 # Run a single task
#SBATCH --cpus-per-task=30 # Number of CPU cores per task
#SBATCH --array=0-31 # Array range (adjust based on the number of samples)
#SBATCH --mem=100G # Memory per node
#SBATCH --chdir=/gscratch/scrubbed/sr320/github/ceasmallr/output/04.2-bismark-align
# Execute Roberts Lab bioinformatics container
# Binds home directory
# Binds /gscratch directory
# Directory bindings allow outputs to be written to the hard drive.
apptainer exec \
--home "$PWD" \
--bind /mmfs1/home/ \
--bind /mmfs1/gscratch/ \
/gscratch/srlab/sr320/srlab-bioinformatics-container-586bf21.sif \
../../code/04.1.sh
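Once it's in the queue, the array can be monitored with standard SLURM tools (nothing pipeline-specific here; JOBID is a placeholder for the submitted job ID):
# Confirm which partitions/resources the account can use (Hyak-specific helper)
hyakalloc
# Watch the 32 array tasks
squeue -u $USER
# After the run, check exit states and memory use for the job
sacct -j JOBID --format=JobID,State,Elapsed,MaxRSS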
Shell
#!/bin/bash
# Set directories and files
reads_dir="../../data/"
genome_folder="../../data/genome"
output_dir="."
checkpoint_file="completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*R1_001.fastp-trim.20220827.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1_001.fastp-trim.20220827.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log file paths for stdout and stderr (note: Bismark's stderr is captured separately below as the per-sample summary file)
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Run Bismark align
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min L,0,-0.6 \
--non_directional \
-1 ${reads_dir}${sample_name}_R1_001.fastp-trim.20220827.fq.gz \
-2 ${reads_dir}${sample_name}_R2_001.fastp-trim.20220827.fq.gz \
-o ${output_dir} \
--basename ${sample_name} \
2> "${sample_name}-${SLURM_ARRAY_TASK_ID}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
# Append the sample name to the checkpoint file
echo ${sample_name} >> ${checkpoint_file}
echo "Sample ${sample_name} processed successfully."
else
echo "Sample ${sample_name} failed. Check ${stderr_log} for details."
fi
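Because ckpt jobs can be preempted, the checkpoint file is what makes resubmission cheap: any sample already in completed_samples.log exits immediately on the next run. A quick progress check from the output directory (the *_pe.bam pattern assumes Bismark's default paired-end naming):
# Samples recorded as complete (out of 32 array tasks)
wc -l completed_samples.log
# Paired-end BAMs written so far
ls *_pe.bam 2>/dev/null | wc -l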
Elevated Shell Script
Not to brag, but Sam helped me take it up a notch by evaluating several score_min parameters for comparison.
#!/bin/bash
# Set directories and files
reads_dir="../../data/"
genome_folder="../../data/genome"
output_dir="."
checkpoint_file="completed_samples.log"
# Create the checkpoint file if it doesn't exist
touch ${checkpoint_file}
# Get the list of sample files and corresponding sample names
files=(${reads_dir}*R1_001.fastp-trim.20220827.fq.gz)
file="${files[$SLURM_ARRAY_TASK_ID]}"
sample_name=$(basename "$file" "_R1_001.fastp-trim.20220827.fq.gz")
# Check if the sample has already been processed
if grep -q "^${sample_name}$" ${checkpoint_file}; then
echo "Sample ${sample_name} already processed. Skipping..."
exit 0
fi
# Define log files for stdout and stderr
stdout_log="${output_dir}/${sample_name}_stdout.log"
stderr_log="${output_dir}/${sample_name}_stderr.log"
# Define the array of score_min parameters to test
score_min_params=(
"L,0,-0.4"
"L,0,-0.6"
"L,0,-0.8"
"L,0,-1.0"
"L,-1,-0.6"
)
# Track failures across parameter runs so the checkpoint logic after the loop is meaningful
failures=0
# Loop through each score_min parameter
for score_min in "${score_min_params[@]}"; do
echo "Running Bismark for sample ${sample_name} with score_min ${score_min}"
# Create a subdirectory for this parameter
param_output_dir="${output_dir}/${sample_name}_score_${score_min//,/}"
mkdir -p ${param_output_dir}
# Run Bismark alignment
bismark \
-genome ${genome_folder} \
-p 8 \
-u 10000 \
-score_min ${score_min} \
--non_directional \
-1 ${reads_dir}${sample_name}_R1_001.fastp-trim.20220827.fq.gz \
-2 ${reads_dir}${sample_name}_R2_001.fastp-trim.20220827.fq.gz \
-o ${param_output_dir} \
--basename ${sample_name}_${score_min//,/} \
2> "${param_output_dir}/${sample_name}-${SLURM_ARRAY_TASK_ID}-bismark_summary.txt"
# Check if the command was successful
if [ $? -eq 0 ]; then
echo "Sample ${sample_name} with score_min ${score_min} processed successfully."
else
failures=$((failures + 1))
echo "Sample ${sample_name} with score_min ${score_min} failed. Check ${param_output_dir}/${sample_name}-${SLURM_ARRAY_TASK_ID}-bismark_summary.txt for details."
fi
done
# Mark the sample as completed in the checkpoint file only if every parameter run succeeded
if [ ${failures} -eq 0 ]; then
echo ${sample_name} >> ${checkpoint_file}
echo "All tests for sample ${sample_name} completed."
else
echo "Sample ${sample_name} encountered ${failures} failed run(s). Check logs for details."
fi
# Define directories
output_dir="."
summary_file="${output_dir}/parameter_comparison_summary.csv"
# Initialize the summary file with a header matching the columns written below (only if it doesn't already exist)
if [ ! -f "${summary_file}" ]; then
echo "Sample,Score_Min,Mapping_Efficiency" > ${summary_file}
fi
# Loop through parameter output directories
for dir in ${output_dir}/*_score_*; do
if [ -d "$dir" ]; then
# Extract sample name and score_min parameter from directory name
sample_name=$(basename "$dir" | cut -d'_' -f1)
score_min=$(basename "$dir" | grep -o "score_.*" | sed 's/score_//; s/_/,/g')
# Locate the summary file
summary_file_path="${dir}/${sample_name}_${score_min}_PE_report.txt"
# Extract the mapping efficiency value (e.g., "85.7%") from the Bismark PE report
mapping=$(grep "Mapping efficiency:" ${summary_file_path} | awk '{print $3}')
# Append to the summary file
echo "${sample_name},${score_min},${mapping}" >> ${summary_file}
fi
done
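To actually pick a winner, it helps to line the CSV up per sample once it's populated; a minimal sketch with plain coreutils, assuming the three-column Sample,Score_Min,Mapping_Efficiency layout written above:
# Sort by sample, then by mapping efficiency (column 3), highest first, and pretty-print
sort -t',' -k1,1 -k3,3nr parameter_comparison_summary.csv | column -t -s','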