Let's take a look at the IsoSeq FASTA.
# Download the high-quality transcripts FASTA into ../data/.
# `&&` skips the download if the directory change fails, so the file never
# lands in the wrong place.
# --fail: exit non-zero on an HTTP error instead of saving the error page as
#         the FASTA file; --location: follow redirects; -O: keep remote name.
cd ../data/ &&
  curl --fail --location -O https://owl.fish.washington.edu/halfshell/genomic-databank/Mtros-hq_transcripts.fasta
# Load the sequences from a FASTA file.
# Change fasta_path to point at your own FASTA file if needed.
fasta_path <- "../data/Mtros-hq_transcripts.fasta"
# readDNAStringSet() is provided by the Biostrings package, which must
# already be attached.
dna_sequences <- readDNAStringSet(fasta_path)
# Calculate the length of every sequence (one value per sequence)
sequence_lengths <- width(dna_sequences)
# Convert to a single-column data frame (column `Length`) for ggplot
df <- data.frame(Length = sequence_lengths)
# Histogram of sequence lengths: 30 bins, white bars with black outlines
ggplot(data = df, mapping = aes(x = Length)) +
  geom_histogram(bins = 30, fill = "white", color = "black") +
  theme_minimal() +
  labs(
    title = "Sequence Length Distribution",
    x = "Sequence Length",
    y = "Count"
  )
# Number of sequences shorter than 500 bp
short_sequences <- sum(sequence_lengths < 500)
print(short_sequences)
## [1] 36551
# Number of sequences shorter than 200 bp
really_short_sequences <- sum(sequence_lengths < 200)
print(really_short_sequences)
## [1] 10038