2.rnaseq
##### installing packages example (will not do in workshop)
tr()
setRepositories()
install.packages("psych")
library(psych)
tr()
m <- matrix(1:16,ncol=4)
tr(m)
#####################################
#################
# Basic plots #
#################
x11(width = 6, height = 6) #code to open up a window in linux
counts <- matrix( rnorm(200,mean=0,sd=1),20,10 ) #make a normal count
boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("black","blue","green","red","purple","yellow") )
boxplot( counts[,1],counts[,3],counts[,5],counts[,7],counts[,9],counts[,10] , col=c("red","red","red","blue","blue","blue") )
test_data <- c(1,2,2,3,7,1,2,3,10,11,25,34,3,6,7,9,9,10,10,10) #make test data
hist(test_data)
test_data <- rnorm(500, mean = 0, sd = 1)
hist(test_data)
hist(test_data, main = "Histogram of Test Dataset", xlab = "Trait", ylab = "Individual Counts" )
dev.off() #or close the window
#실습 1교시 마무리.
## Set the working directory (not needed for workshop)
#setwd("C:\\PATH\\") ## in pc
#setwd("/home/2.RNASeq_practice/5.HTseq_simulated") ##in linux
## Call necessary libraries (these are preinstalled [ex:install.packages("limma")])
library(statmod)
library(locfit)
library(edgeR)
library(limma)
library(DESeq2)
## note the bad samples (6 and 10)
## save used samples' index to Sam_Index
Samp_Index <- c(1:5,7:9,11:23)
## or you can do
Samp_Index <- c(1:23)
Samp_Index <- Samp_Index[-c(6,10)]
## take out bad samples
meta_data <- read.table("meta_data_mRNA.txt", sep="\t", head = T)
File_list <- as.character(meta_data[,1])
File_list <- File_list[Samp_Index]
head(File_list) #check first few component of File_list
### Data Read (first file)
temp <- read.table(File_list[1], sep="\t", head=F)
temp <- temp[-((nrow(temp)-4):nrow(temp)),] #remove last 5 lines
Gene_Symbol <- as.character(temp[,1])
Expression <- data.frame(temp[,2])
## Merge matrix (read all other good sample files and append)
for(i in 2:length(File_list)){ #from file_list, 2nd to last files are called since first file is already called
temp <- read.table(File_list[i], sep="\t", head=F)
temp <- temp[-((nrow(temp)-4):nrow(temp)),] # remove last 5 lines same as the first file
Expression <- data.frame(Expression, temp[,2])
}
colnames(Expression) <- File_list #samples(columns) are marked with their original file name
head(Expression) #check first few lines of Expression
dim(Expression) #check dimension of Expression
head(Gene_Symbol)
#### remove all zero gene for mRNA data #from original 14218 genes
Removed_ID <- which(rowSums(Expression) == 0) #1067 genes removed
Expression <- Expression[-Removed_ID,]
Gene_Symbol <- Gene_Symbol[-Removed_ID]
dim(Expression)
length(Gene_Symbol) #13151
#### gene annotation #####
####after annotation file '#bin' delete
Gene_annotation <- read.table("workshop_practice.annotation",head=T, stringsAsFactors=FALSE)
Gene_annotation <- Gene_annotation[,c(2,13)] ## get information we need: col_2 and col_3
#match Gene_name with Gene_Symbol
Gene_name <- c()
for ( i in 1:length(Gene_Symbol) ){
Gene_name[i]<- Gene_annotation[match(Gene_Symbol[i],Gene_annotation[,1]),2]
}
## Start RNAseq analysis
#########################################
### DEG analysis Set for 1-way Analysis
#########################################
Treat <- factor( meta_data[,3], levels = c("male","female") )
Treat <- Treat[Samp_Index] #dimension correction
targets <- data.frame(Treat)
####################################################
### Get normalized TMM values
####################################################
design <- model.matrix(~Treat, data = targets)
colnames(design)
Y <- DGEList(counts=Expression, gene=Gene_Symbol)
Y <- estimateDisp(Y,design)
#Y <- estimateGLMCommonDisp(Y,design)
#Y <- estimateGLMTrendedDisp(Y,design)
#Y <- estimateGLMTagwiseDisp(Y,design)
fit <- glmFit(Y, design)
colnames(fit)
## Get normalized values
TMM <- calcNormFactors(Y, method="TMM")
TMM <- cpm(TMM, normalized.lib.sizes=TRUE, log=T)
### 1-way ANODEV based apporach for testing Stage effects
## female vs. male
Result_anodev <- glmLRT(fit, coef=2)
Result_anodev_table <- topTags(Result_anodev, n=dim(Expression)[1], sort.by="none")$table
sum(Result_anodev_table$FDR < 0.05) #2
head(Result_anodev_table)
#to visualize the two significant genes...
head(Result_anodev_table[order(Result_anodev_table$FDR),]) #order the table by FDR value (ascending)
Gene_name[1011]
Gene_name[5473]
#quick view of histogram of p-values
hist(Result_anodev_table$PValue)
hist(Result_anodev_table$PValue, main = "P-value Distribution Male vs. Female", xlab= "P-values", ylab="Count")
Final_anodev_table <- cbind (Gene_name,Result_anodev_table) #add the matching gene names to the anodev table
### save the final table results to Result_anodev.txt
write.table( Final_anodev_table,"Result_anodev.txt", row.names=F,sep="\t" )
####################################################
### What to do when you have NO REPLICATES? ###
####################################################
### 1. Be satisfied with a descriptive analysis such as fold change. Do not attempt a significance analysis.
### 2. Simply pick a reasonable dispersion value, based on the experience or study design.
bcv <- 0.2 #BCV is square-root-dispersion
counts <- matrix( rnbinom(40,size=1/bcv^2,mu=10),20,2 )
y <- DGEList( counts, group=1:2 )
y <- exactTest( y, dispersion = bcv^2 )
y$table
####################### EOF #######################
2.rnaseq.txt · Last modified: by 127.0.0.1
