#set the working directory
setwd("C:/Users/Magda/OneDrive - Queensland Cyber Infrastructure Foundation Ltd/Projects/10_00551 BYOD/eResearch/Public RNA-seq dataset/at_counts")

#install the following packages if not yet installed
library("ggrepel")
library("DESeq2")
library("ggplot2")
library("EnhancedVolcano")
library("reshape2")
library("plyr")
require(reshape2)

IMPORT AND PREPROCESS GENE COUNTS DATA

geneCountData <- read.csv('salmon.merged.gene_counts_length_scaled.tsv', header = TRUE, sep = "\t")
print(head(geneCountData))

get rid of the gene_name column

geneCountData = geneCountData[, colnames(geneCountData)[colnames(geneCountData) != 'gene_name']]
print(head(geneCountData))

create a new table with only numbers

onlyCountsData = geneCountData[, colnames(geneCountData)[colnames(geneCountData) != 'gene_id']]
print(head(onlyCountsData))

counts need to be rounded to integers

onlyCountsData <- ceiling(onlyCountsData)
print(head(onlyCountsData))

prepare to re-add the 'gene_id' column to the numbers table

create names of the new columns

new_colnames = cbind("gene_id", t(colnames(onlyCountsData)))
print(new_colnames)

merge the "gene_id" column with the numbers table

countsDataReady <- cbind(geneCountData$gene_id, onlyCountsData)

rename the columns

colnames(countsDataReady) = new_colnames
print(head(countsDataReady))

IMPORT AND PREPROCESS META DATA

metaData <- read.csv('metadata.txt', header = TRUE, sep = "\t")
print(metaData)

make sure that the condition in the metadata is factorised to keep order for DESeq2

first group (Ax) is the baseline, DESeq2 will compare the second group against the first group

growth_levels = c("Ax", "S.Fr1")
metaData$growth_condition = factor(metaData$growth_condition, levels = growth_levels)

CREATE DESEQ2 OBJECT AND RUN THE ANALYSIS

dds <- DESeqDataSetFromMatrix(countData=countsDataReady, colData=metaData, design=~growth_condition, tidy = TRUE)
dds <- DESeq(dds)

Take a look at the results table

res <- results(dds)
head(results(dds, tidy=TRUE)) #let's look at the results table

Summary of differential gene expression

summary(res)

Sort summary list by p-value

res <- res[order(res$padj),]
print(head(res))

Save DEGs to file

write.table(res, file = "DEGs.txt")

Print the results for the genes of interest

genes_of_interest = c("AT2G19190", "AT1G51890", "AT2G14610", "AT3G57260",
"AT3G04720", "AT1G75040", "AT5G44420", "AT5G26920",
"AT3G56400", "AT3G45140", "AT5G24780")
print(res[genes_of_interest,])

DESeq2 - alternative script