Differentially Expressed Transcipts

for cod
R
datasets
cod
Author
Affiliation

Steven Roberts

Published

May 21, 2024

10-Hisat

Steven Roberts 22 May, 2024

Alt splice test run

1 Differentially Expressed Transcripts

2 Reads

ls ../data/reads/*
../data/reads/100.trimmed.R1.fastq.gz
../data/reads/100.trimmed.R2.fastq.gz
../data/reads/107.trimmed.R1.fastq.gz
../data/reads/107.trimmed.R2.fastq.gz
../data/reads/108.trimmed.R1.fastq.gz
../data/reads/108.trimmed.R2.fastq.gz
../data/reads/109.trimmed.R1.fastq.gz
../data/reads/109.trimmed.R2.fastq.gz
../data/reads/10.trimmed.R1.fastq.gz
../data/reads/10.trimmed.R2.fastq.gz
../data/reads/110.trimmed.R1.fastq.gz
../data/reads/110.trimmed.R2.fastq.gz
../data/reads/116.trimmed.R1.fastq.gz
../data/reads/116.trimmed.R2.fastq.gz
../data/reads/11.trimmed.R1.fastq.gz
../data/reads/11.trimmed.R2.fastq.gz
../data/reads/12.trimmed.R1.fastq.gz
../data/reads/12.trimmed.R2.fastq.gz
../data/reads/13.trimmed.R1.fastq.gz
../data/reads/13.trimmed.R2.fastq.gz
../data/reads/18.trimmed.R1.fastq.gz
../data/reads/18.trimmed.R2.fastq.gz
../data/reads/19.trimmed.R1.fastq.gz
../data/reads/19.trimmed.R2.fastq.gz
../data/reads/1.trimmed.R1.fastq.gz
../data/reads/1.trimmed.R2.fastq.gz
../data/reads/20.trimmed.R1.fastq.gz
../data/reads/20.trimmed.R2.fastq.gz
../data/reads/21.trimmed.R1.fastq.gz
../data/reads/21.trimmed.R2.fastq.gz
../data/reads/28.trimmed.R1.fastq.gz
../data/reads/28.trimmed.R2.fastq.gz
../data/reads/29.trimmed.R1.fastq.gz
../data/reads/29.trimmed.R2.fastq.gz
../data/reads/2.trimmed.R1.fastq.gz
../data/reads/2.trimmed.R2.fastq.gz
../data/reads/30.trimmed.R1.fastq.gz
../data/reads/30.trimmed.R2.fastq.gz
../data/reads/31.trimmed.R1.fastq.gz
../data/reads/31.trimmed.R2.fastq.gz
../data/reads/36.trimmed.R1.fastq.gz
../data/reads/36.trimmed.R2.fastq.gz
../data/reads/3.trimmed.R1.fastq.gz
../data/reads/3.trimmed.R2.fastq.gz
../data/reads/4.trimmed.R1.fastq.gz
../data/reads/4.trimmed.R2.fastq.gz
../data/reads/5.trimmed.R1.fastq.gz
../data/reads/5.trimmed.R2.fastq.gz
../data/reads/78.trimmed.R1.fastq.gz
../data/reads/78.trimmed.R2.fastq.gz
../data/reads/79.trimmed.R1.fastq.gz
../data/reads/79.trimmed.R2.fastq.gz
../data/reads/80.trimmed.R1.fastq.gz
../data/reads/80.trimmed.R2.fastq.gz
../data/reads/83.trimmed.R1.fastq.gz
../data/reads/83.trimmed.R2.fastq.gz
../data/reads/88.trimmed.R1.fastq.gz
../data/reads/88.trimmed.R2.fastq.gz
../data/reads/90.trimmed.R1.fastq.gz
../data/reads/90.trimmed.R2.fastq.gz
../data/reads/91.trimmed.R1.fastq.gz
../data/reads/91.trimmed.R2.fastq.gz
../data/reads/92.trimmed.R1.fastq.gz
../data/reads/92.trimmed.R2.fastq.gz
../data/reads/94.trimmed.R1.fastq.gz
../data/reads/94.trimmed.R2.fastq.gz
../data/reads/97.trimmed.R1.fastq.gz
../data/reads/97.trimmed.R2.fastq.gz
../data/reads/98.trimmed.R1.fastq.gz
../data/reads/98.trimmed.R2.fastq.gz
../data/reads/99.trimmed.R1.fastq.gz
../data/reads/99.trimmed.R2.fastq.gz
../data/reads/splice-test-files.txt

3 Genome

https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_031168955.1/

genome
cd ../data

/home/shared/datasets download genome accession GCF_031168955.1 --include gff3,gtf,rna,cds,protein,genome,seq-report
cd ../data 
unzip ncbi_dataset.zip
ls ../data/ncbi_dataset/data/GCF_031168955.1
cds_from_genomic.fna
GCF_031168955.1_ASM3116895v1_genomic.fna
genomic.gff
genomic.gtf
protein.faa
rna.fna
sequence_report.jsonl

4 Hisat

/home/shared/hisat2-2.2.1/hisat2_extract_exons.py \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
> ../output/10.1-hisat-deseq2/m_exon.tab
/home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
> ../output/10.1-hisat-deseq2/m_spice_sites.tab

Will go ahead and exclude index from git

echo "10.1-hisat-deseq2/GCF*" >> ../output/.gitignore
/home/shared/hisat2-2.2.1/hisat2-build \
../data/ncbi_dataset/data/GCF_031168955.1/GCF_031168955.1_ASM3116895v1_genomic.fna \
../output/10.1-hisat-deseq2/GCF_031168955.1.index \
--exon ../output/10.1-hisat-deseq2/m_exon.tab \
--ss ../output/10.1-hisat-deseq2/m_spice_sites.tab \
-p 36 \
../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \
2> ../output/10.1-hisat-deseq2/hisat2-build_stats.txt
echo "10.1-hisat-deseq2/*sam" >> ../output/.gitignore
find ../data/reads/*.trimmed.R1.fastq.gz \
| xargs basename -s .trimmed.R1.fastq.gz | xargs -I{} \
/home/shared/hisat2-2.2.1/hisat2 \
-x ../output/10.1-hisat-deseq2/GCF_031168955.1.index \
--dta \
-p 20 \
-1 ../data/reads/{}.trimmed.R1.fastq.gz \
-2 ../data/reads/{}.trimmed.R2.fastq.gz \
-S ../output/10.1-hisat-deseq2/{}.sam \
2> ../output/10.1-hisat-deseq2/hisat.out
echo "10.1-hisat-deseq2/*bam" >> ../output/.gitignore
echo "10.1-hisat-deseq2/*bam*" >> ../output/.gitignore
for samfile in ../output/10.1-hisat-deseq2/*.sam; do
  bamfile="${samfile%.sam}.bam"
  sorted_bamfile="${samfile%.sam}.sorted.bam"
  /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile"
  /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile"
  /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile"
done
rm ../output/10.1-hisat-deseq2/*sam
ls ../output/10.1-hisat-deseq2/*sorted.bam | wc -l
tail ../output/10.1-hisat-deseq2/hisat.out
    ----
    3794910 pairs aligned concordantly 0 times; of these:
      146293 (3.85%) aligned discordantly 1 time
    ----
    3648617 pairs aligned 0 times concordantly or discordantly; of these:
      7297234 mates make up the pairs; of these:
        5640288 (77.29%) aligned 0 times
        1498025 (20.53%) aligned exactly 1 time
        158921 (2.18%) aligned >1 times
82.81% overall alignment rate
cat ../output/10.1-hisat-deseq2/hisat.out \
| grep "overall alignment rate"
87.53% overall alignment rate
86.32% overall alignment rate
81.37% overall alignment rate
86.39% overall alignment rate
83.54% overall alignment rate
87.67% overall alignment rate
86.69% overall alignment rate
86.56% overall alignment rate
84.58% overall alignment rate
81.26% overall alignment rate
83.03% overall alignment rate
86.29% overall alignment rate
84.82% overall alignment rate
82.29% overall alignment rate
83.16% overall alignment rate
86.14% overall alignment rate
81.04% overall alignment rate
87.28% overall alignment rate
84.89% overall alignment rate
87.76% overall alignment rate
88.45% overall alignment rate
83.13% overall alignment rate
83.21% overall alignment rate
84.99% overall alignment rate
86.33% overall alignment rate
86.47% overall alignment rate
84.37% overall alignment rate
87.60% overall alignment rate
75.31% overall alignment rate
81.73% overall alignment rate
84.47% overall alignment rate
80.84% overall alignment rate
86.72% overall alignment rate
87.60% overall alignment rate
81.89% overall alignment rate
82.81% overall alignment rate

5 Stringtie

echo "10.1-hisat-deseq2/*gtf" >> ../output/.gitignore
find ../output/10.1-hisat-deseq2/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 36 \
-eB \
-G ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gff \
-o ../output/10.1-hisat-deseq2/{}.gtf \
../output/10.1-hisat-deseq2/{}.sorted.bam

6 Count Matrix

ls ../output/10.1-hisat-deseq2/*gtf
../output/10.1-hisat-deseq2/100.gtf
../output/10.1-hisat-deseq2/107.gtf
../output/10.1-hisat-deseq2/108.gtf
../output/10.1-hisat-deseq2/109.gtf
../output/10.1-hisat-deseq2/10.gtf
../output/10.1-hisat-deseq2/110.gtf
../output/10.1-hisat-deseq2/116.gtf
../output/10.1-hisat-deseq2/11.gtf
../output/10.1-hisat-deseq2/12.gtf
../output/10.1-hisat-deseq2/13.gtf
../output/10.1-hisat-deseq2/18.gtf
../output/10.1-hisat-deseq2/19.gtf
../output/10.1-hisat-deseq2/1.gtf
../output/10.1-hisat-deseq2/20.gtf
../output/10.1-hisat-deseq2/21.gtf
../output/10.1-hisat-deseq2/28.gtf
../output/10.1-hisat-deseq2/29.gtf
../output/10.1-hisat-deseq2/2.gtf
../output/10.1-hisat-deseq2/30.gtf
../output/10.1-hisat-deseq2/31.gtf
../output/10.1-hisat-deseq2/36.gtf
../output/10.1-hisat-deseq2/3.gtf
../output/10.1-hisat-deseq2/4.gtf
../output/10.1-hisat-deseq2/5.gtf
../output/10.1-hisat-deseq2/78.gtf
../output/10.1-hisat-deseq2/79.gtf
../output/10.1-hisat-deseq2/80.gtf
../output/10.1-hisat-deseq2/83.gtf
../output/10.1-hisat-deseq2/88.gtf
../output/10.1-hisat-deseq2/90.gtf
../output/10.1-hisat-deseq2/91.gtf
../output/10.1-hisat-deseq2/92.gtf
../output/10.1-hisat-deseq2/94.gtf
../output/10.1-hisat-deseq2/97.gtf
../output/10.1-hisat-deseq2/98.gtf
../output/10.1-hisat-deseq2/99.gtf
cat ../output/10.1-hisat-deseq2/list01.txt
1   ../output/10-hisat-deseq2/1.gtf
2   ../output/10-hisat-deseq2/2.gtf
3   ../output/10-hisat-deseq2/3.gtf
4   ../output/10-hisat-deseq2/4.gtf
5   ../output/10-hisat-deseq2/5.gtf
10  ../output/10-hisat-deseq2/10.gtf
11  ../output/10-hisat-deseq2/11.gtf
12  ../output/10-hisat-deseq2/12.gtf
13  ../output/10-hisat-deseq2/13.gtf
18  ../output/10-hisat-deseq2/18.gtf
19  ../output/10-hisat-deseq2/19.gtf
20  ../output/10-hisat-deseq2/20.gtf
21  ../output/10-hisat-deseq2/21.gtf
28  ../output/10-hisat-deseq2/28.gtf
29  ../output/10-hisat-deseq2/29.gtf
30  ../output/10-hisat-deseq2/30.gtf
31  ../output/10-hisat-deseq2/31.gtf
36  ../output/10-hisat-deseq2/36.gtf
78  ../output/10-hisat-deseq2/78.gtf
79  ../output/10-hisat-deseq2/79.gtf
80  ../output/10-hisat-deseq2/80.gtf
83  ../output/10-hisat-deseq2/83.gtf
88  ../output/10-hisat-deseq2/88.gtf
90  ../output/10-hisat-deseq2/90.gtf
91  ../output/10-hisat-deseq2/91.gtf
92  ../output/10-hisat-deseq2/92.gtf
94  ../output/10-hisat-deseq2/94.gtf
97  ../output/10-hisat-deseq2/97.gtf
98  ../output/10-hisat-deseq2/98.gtf
99  ../output/10-hisat-deseq2/99.gtf
100 ../output/10-hisat-deseq2/100.gtf
107 ../output/10-hisat-deseq2/107.gtf
108 ../output/10-hisat-deseq2/108.gtf
109 ../output/10-hisat-deseq2/109.gtf
110 ../output/10-hisat-deseq2/110.gtf
116 ../output/10-hisat-deseq2/116.gtf
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../output/10.1-hisat-deseq2/list01.txt \
-g ../output/10.1-hisat-deseq2/gene_count_matrix.csv \
-t ../output/10.1-hisat-deseq2/transcript_count_matrix.csv
head ../output/10.1-hisat-deseq2/*matrix.csv
==> ../output/10.1-hisat-deseq2/gene_count_matrix.csv <==
gene_id,1,10,100,107,108,109,11,110,116,12,13,18,19,2,20,21,28,29,3,30,31,36,4,5,78,79,80,83,88,90,91,92,94,97,98,99
gene-LOC132462341|LOC132462341,360,464,391,346,691,408,436,509,366,373,432,385,330,288,307,346,293,347,452,452,984,230,469,400,472,311,368,312,551,631,605,366,339,517,577,235
gene-abce1|abce1,694,325,276,77,409,196,281,284,327,310,393,386,106,290,252,326,345,254,363,353,922,160,588,285,178,119,197,134,128,103,315,204,301,56,49,260
gene-si:dkey-6i22.5|si:dkey-6i22.5,0,10,13,11,49,42,14,36,0,14,9,0,0,14,3,21,26,9,39,20,10,0,0,10,48,41,60,39,49,54,20,27,7,22,63,12
gene-ube2v1|ube2v1,10,22,15,19,74,28,63,35,30,31,25,42,17,14,21,60,61,14,29,37,68,37,0,30,10,13,56,32,45,55,27,38,60,25,18,50
gene-cldn15la|cldn15la,0,9,0,0,18,0,0,0,0,27,0,0,0,0,0,0,0,0,4,35,0,0,0,4,0,0,0,0,8,32,0,0,14,0,0,0
gene-muc15|muc15,0,40,40,29,8,0,38,0,16,15,0,0,0,0,22,22,69,22,0,15,0,34,55,51,0,30,19,31,0,0,42,34,0,0,26,56
gene-pcloa|pcloa,0,3,0,0,41,7,4,0,3,0,0,0,27,0,0,0,0,4,0,10,0,23,0,0,0,0,4,4,0,0,4,0,0,22,101,0
gene-LOC132472829|LOC132472829,402,550,293,182,1082,213,234,149,444,360,531,242,269,194,359,383,395,392,408,489,310,352,168,403,462,287,310,113,457,321,396,234,455,279,229,208
gene-ifi35|ifi35,3696,2492,2615,2184,2702,2579,3631,3158,2121,2673,2893,2210,2324,2384,2561,2654,2887,2250,3143,2830,1297,1981,2971,2651,2669,2671,1790,2173,2603,3260,2186,2451,2464,2768,1935,1991

==> ../output/10.1-hisat-deseq2/transcript_count_matrix.csv <==
transcript_id,1,10,100,107,108,109,11,110,116,12,13,18,19,2,20,21,28,29,3,30,31,36,4,5,78,79,80,83,88,90,91,92,94,97,98,99
rna-XM_060037252.1,20,918,84,11,385,70,54,82,131,72,52,8,26,41,33,62,30,12,45,85,377,59,58,57,96,19,55,94,92,125,54,34,69,36,16,83
rna-XM_060070002.1,0,0,0,0,0,12,0,0,0,14,14,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,7
rna-XM_060056394.1,0,0,0,1,0,0,336,0,37,0,0,0,86,0,0,0,0,0,395,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
rna-XM_060072836.1,0,0,14,4,0,0,5,0,15,0,0,0,4,0,0,5,0,0,0,11,0,0,5,0,0,12,0,0,12,3,10,0,9,0,0,0
rna-XM_060062883.1,719,38,380,279,652,462,461,381,432,0,831,479,641,171,593,82,520,625,366,533,640,311,36,876,365,322,351,342,430,324,450,410,540,336,513,370
rna-XM_060057326.1,1072,1577,801,1056,1420,909,1523,1101,951,1459,2162,1170,1983,1317,1264,1398,2075,1055,1303,1820,1496,934,966,1509,1132,928,845,948,1930,2047,1039,781,979,2259,2747,740
rna-XM_060051796.1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,2,0,0,0,0,0,0,0,2
rna-XM_060076321.1,0,0,0,0,16,0,0,0,0,0,0,4,58,0,0,0,0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,5,13,80,0
rna-XM_060064475.1,6,1771,834,10,33,0,1752,997,470,1572,0,0,16,713,0,0,0,5,0,679,292,656,0,1051,6,1326,0,0,0,0,592,448,0,0,0,5

7 DEseq2

cat ../output/10.1-hisat-deseq2/conditions.txt
SampleID    Condition
X1  16
X2  16
X3  16
X4  16
X5  16
X10 16
X11 16
X12 16
X13 16
X18 16
X19 16
X20 16
X21 16
X28 16
X29 16
X30 16
X31 16
X36 16
X78 9
X79 9
X80 9
X83 9
X88 9
X90 9
X91 9
X92 9
X94 9
X97 9
X98 9
X99 9
X100    9
X107    9
X108    9
X109    9
X110    9
X116    9
library(DESeq2)
# Load transcript) count matrix and labels
countData <- as.matrix(read.csv("../output/10.1-hisat-deseq2/transcript_count_matrix.csv", row.names="transcript_id"))
colData <- read.csv("../output/10.1-hisat-deseq2/conditions.txt", sep="\t", row.names = 1)

# Note: The PHENO_DATA file contains information on each sample, e.g., sex or population.
# The exact way to import this depends on the format of the file.

# Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(countData)) # This should return TRUE
[1] TRUE
countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData)) # This should also return TRUE
[1] TRUE
# Create a DESeqDataSet from count matrix and labels
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = colData, design = ~ Condition)


# Run the default analysis for DESeq2 and generate results table
dds <- DESeq(dds)
deseq2.res <- results(dds)

# Sort by adjusted p-value and display
resOrdered <- deseq2.res[order(deseq2.res$padj), ]
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup = "Condition")

# Select top 50 differentially expressed genes
res <- results(dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]

# Extract counts and normalize
counts <- counts(dds, normalized = TRUE)
counts_top <- counts[top_genes, ]

# Log-transform counts
log_counts_top <- log2(counts_top + 1)

# Generate heatmap
pheatmap(log_counts_top, scale = "row")

# Count number of hits with adjusted p-value less then 0.05
dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
[1] 4098    6
tmp <- deseq2.res
# The main plot
plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
     main="DEG Dessication  (pval <= 0.05)",
     xlab="mean of normalized counts",
     ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")

write.table(tmp.sig, "../output/10.1-hisat-deseq2/DETlist.tab", sep = '\t', row.names = T)
detlist <- read.csv("../output/10.1-hisat-deseq2/DETlist.tab", sep = '\t', header = TRUE)
detlist$RowName <- rownames(detlist)
detlist2 <- detlist[, c("RowName", "pvalue")] # Optionally, reorder the columns
head(detlist)
                    baseMean log2FoldChange      lfcSE      stat       pvalue
rna-XM_060040999.1 327.23637     0.07503113 0.02706549  2.772206 5.567773e-03
rna-XM_060059475.1  64.57272    -0.09356216 0.03262081 -2.868174 4.128491e-03
rna-XM_060060617.1  42.88642    -0.23727065 0.04910582 -4.831823 1.352884e-06
rna-XM_060067174.1 237.63970     0.07702393 0.02363060  3.259500 1.116087e-03
rna-XM_060057172.1  37.57466    -0.20032720 0.06166216 -3.248787 1.158984e-03
rna-XM_060038089.1 703.71042     0.08841799 0.02025754  4.364695 1.273003e-05
                           padj            RowName
rna-XM_060040999.1 4.170294e-02 rna-XM_060040999.1
rna-XM_060059475.1 3.329193e-02 rna-XM_060059475.1
rna-XM_060060617.1 4.605113e-05 rna-XM_060060617.1
rna-XM_060067174.1 1.243125e-02 rna-XM_060067174.1
rna-XM_060057172.1 1.279752e-02 rna-XM_060057172.1
rna-XM_060038089.1 3.171650e-04 rna-XM_060038089.1

```