## Codes used in session 3 (from the slides)
## To run the code, open this script file in R (File > Open script...),
## select the code you want to run (like selecting text) and press "CTRL + R".

### CORRELATION

## cor(x, y) for two vectors (returns the correlation coefficient only)
data(iris)
cor(iris$Sepal.Length, iris$Sepal.Width)                       # Pearson correlation (the default)
cor(iris$Sepal.Length, iris$Sepal.Width, method = "pearson")   # Pearson correlation, requested explicitly
cor(iris$Sepal.Length, iris$Sepal.Width, method = "spearman")  # Spearman (rank-based) correlation
cor(iris$Sepal.Length, iris$Sepal.Width, method = "kendall")   # Kendall (rank-based) correlation

## cor.test(x, y) for two vectors (returns the coefficient AND a p-value)
data(iris)
cor.test(iris$Sepal.Length, iris$Sepal.Width)
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "pearson")
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "spearman")
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "kendall")

## cor(x) for a matrix/data frame (correlation matrix with coefficients only)
data(iris)
cor(iris[1:4], method = "spearman")

## Hmisc::rcorr(as.matrix(x)) for a matrix/data frame
## (provides correlation coefficients AND p-values)
# install.packages("Hmisc")   # if not installed yet
library("Hmisc")
data(iris)
(correlations <- rcorr(as.matrix(iris[1:4])))   # outer parentheses print the result

## Options to deal with missing values:
# cor(x, y, use = "pairwise")  # pairwise deletion: each coefficient uses all pairs complete for those two variables
# cor(x, y, use = "complete")  # casewise deletion: drops every observation (row) that has any missing value

## Ways to visualize pairwise comparisons
pairs(iris[1:4], pch = 21)
pairs(iris[1:4], pch = 21, lower.panel = NULL)  # same graph with the lower panel left out
pairs(iris[1:4], pch = 21, upper.panel = NULL)  # same graph with the upper panel left out

# To add color (by species):
my_cols <- c("red", "blue", "green")
pairs(iris[, 1:4], pch = 19, cex = 0.5, col = my_cols[iris$Species], lower.panel = NULL)

library(psych)
pairs.panels(iris, method = "spearman")   # scatterplot matrix with Spearman correlations (psych package)

# install.packages("corrgram")   # if needed
library("corrgram")
corrgram(iris, main = "Iris data",
         lower.panel = panel.pts, upper.panel = panel.conf,
         diag.panel = panel.density)

# install.packages("ellipse")   # if needed
library("ellipse")
corrtab <- cor(iris[1:4])
round(corrtab, 2)
plotcorr(corrtab, mar = c(0.1, 0.1, 0.1, 0.1))

## Correlation matrix with heatmap using the R package "DescTools" (DescTools::PlotCorr()):
# install.packages("DescTools")   # if needed
library("DescTools")   # must be loaded before PlotCorr() can be called
a <- cor(iris[-5])
PlotCorr(a)               # these two lines of two functions are equivalent to:
PlotCorr(cor(iris[-5]))   # heatmap of the correlation matrix (Pearson by default;
                          # can be changed to Spearman etc. within cor())

# Another PlotCorr() example with options:
PlotCorr(cor(mtcars), col = Pal("RedWhiteBlue1", 100), border = "grey",
         args.colorlegend = list(labels = Format(seq(-1, 1, .25), digits = 2), frame = "grey"))
# SEE: https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/PlotCorr for options of PlotCorr()

## Correlation matrix with heatmap using the R package "DataExplorer"
# install.packages("DataExplorer")   # if needed
library("DataExplorer")
plot_correlation(iris)   # correlation matrix with heatmap

### t-TEST

iris2 <- subset(iris[, 4:5], iris$Species == "virginica" | iris$Species == "setosa")  # subset of iris (stacked data frame)
t.test(iris2$Petal.Width ~ iris2$Species)   # stacked (formula) version of the t-test; Species is the grouping variable

iris3 <- subset(iris[, 3:4], iris$Species == "setosa")   # subset of iris (horizontal data frame)
t.test(iris3$Petal.Length, iris3$Petal.Width)            # two-vector version of the t-test

x <- rnorm(50, mean = 100, sd = 15)
t.test(x, mu = 95)   # one-sample t-test with a simulated vector

a <- rnorm(250, 30, 5)        # sample of size 250 from a normal distribution with mean 30 and SD 5
b <- rnorm(250, 25, 7)        # sample of size 250 from a normal distribution with mean 25 and SD 7
t.test(a, b, paired = TRUE)   # paired t-test on the two simulated vectors

# A t-test demonstration:
a <- rnorm(25, 30, 5)   # sample of size 25 from a normal distribution with mean 30 and SD 5
b <- rnorm(25, 25, 7)   # sample of size 25 from a normal distribution with mean 25 and SD 7
t.test(a, b)            # run the t-test
# Change the sample size (first argument) in a and b to show the statistical power concept
# Change the SD (third argument) to show the effect of intra-group variability
# Change the difference between means (second argument) to show the effect of the difference in means (effect size)

# Checking t-test assumptions:
# Normality:      shapiro.test(x)
# Equal variance: var.test(x, y)
# SEE: http://www.sthda.com/english/wiki/f-test-compare-two-variances-in-r
x <- rnorm(100, 30, 5)
y <- runif(100, min = 20, max = 30)
shapiro.test(x)
shapiro.test(y)
var.test(x, y)   # works only for two levels of the grouping factor

# Equal variance across several groups: bartlett.test(variable of interest ~ grouping variable)
bartlett.test(airquality$Wind ~ airquality$Month)

### ANOVA

# General ANOVA recipe (but there are alternatives):
# modelname <- aov(variable of interest ~ as.factor(grouping variable))
# summary(modelname)    # results
# TukeyHSD(modelname)   # post-hoc test

# One-way ANOVA:
data(airquality)   # using the built-in dataset "airquality"
oneway.test(airquality$Wind ~ airquality$Month)   # does wind speed differ significantly over the months? (not assuming equal variance)
boxplot(airquality$Wind ~ airquality$Month, col = "orchid2", border = "red")   # visual check (remember: the boxplot shows the median, not the mean)
oneway.test(airquality$Wind ~ airquality$Month, var.equal = TRUE)   # same test, but assuming equal variance (var.equal = TRUE)
result <- aov(airquality$Wind ~ as.factor(airquality$Month))   # aov() always assumes equal variance
summary(result)

# Complete ANOVA workflow: assumption checks, ANOVA, post-hoc test,
# boxplot, and the non-parametric equivalent (Kruskal-Wallis)
data(iris)
iris_sub1 <- subset(iris[1], iris$Species == "setosa")
shapiro.test(iris_sub1$Sepal.Length)   # normality check within "setosa"
iris_sub2 <- subset(iris[1], iris$Species == "versicolor")
shapiro.test(iris_sub2$Sepal.Length)   # normality check within "versicolor"
iris_sub3 <- subset(iris[1], iris$Species == "virginica")
shapiro.test(iris_sub3$Sepal.Length)   # normality check within "virginica"
bartlett.test(iris$Sepal.Length ~ iris$Species)   # equal-variance check across the three species
result <- aov(iris$Sepal.Length ~ iris$Species)
summary(result)      # ANOVA table
TukeyHSD(result)     # post-hoc pairwise comparisons
boxplot(iris$Sepal.Length ~ iris$Species, col = "orchid2", border = "red")
kruskal.test(iris$Sepal.Length ~ iris$Species)   # non-parametric equivalent
plot(result)         # diagnostic plots for the fitted model

### REGRESSION

# Standard (linear) regression model formula:
# modelname <- lm(y ~ x1 + x2 + x3, data = dataframe)
# Results are obtained by running summary(modelname) and plot(modelname).

# Linear regression (this is a multivariable regression example);
# the model object will be called "fit"
data(iris)
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = iris)
summary(fit)
anova(fit)            # ANOVA results for the same model
par(mfrow = c(2, 2))  # arrange the four diagnostic plots in a 2 x 2 grid
plot(fit)             # diagnostic plots; when run interactively, press ENTER to step through them
hist(fit$resid)   # histogram of the residuals of the linear model fitted above

# Logistic regression (when the outcome variable is binary, like yes or no)
# Standard logistic regression recipe:
# modelname <- glm(binaryoutcomevariable ~ predictor1 + predictor2 + predictor3,
#                  data = dataframe, family = binomial())
#                  # binomial() denotes logistic regression (binary outcome)
# summary(modelname)                       # display results
# confint(modelname)                       # 95% CI for the coefficients
# exp(coef(modelname))                     # exponentiated coefficients (effect size, e.g. odds ratio)
# exp(confint(modelname))                  # 95% CI for the exponentiated coefficients
# predict(modelname, type = "response")    # predicted values
# residuals(modelname, type = "deviance")  # deviance residuals

# Example for logistic regression
data(infert)   # the outcome variable is "case", which is binary; the model object will be "fit"
fit <- glm(case ~ age + parity + induced + spontaneous,
           data = infert, family = binomial())
summary(fit)
confint(fit)
exp(coef(fit))
exp(confint(fit))
predict(fit, type = "response")
residuals(fit, type = "deviance")
par(mfrow = c(2, 2))
plot(fit)   # diagnostic plots; when run interactively, press ENTER to step through them
# The variable "spontaneous" is the strongest predictor of the outcome
# (spontaneous: coeff = 1.92534, SE = 0.29863, Z = 6.447, P = 1.14e-10, OR = 6.3)

# Let's run a ROC analysis:
# install.packages("pROC")   # if not installed
library("pROC")
plot.roc(infert$case, infert$spontaneous, print.auc = TRUE, ci = TRUE)   # yields an AUC value of 0.695
# The variable "induced" is the second strongest predictor of the outcome
plot.roc(infert$case, infert$induced, print.auc = TRUE, ci = TRUE)       # yields an AUC value of 0.509

# AUC analysis (C-statistic) using a different package (DescTools) for the whole model:
# SEE: https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/Cstat (for DescTools::Cstat)
# install.packages("DescTools")   # if not installed
library(DescTools)
Cstat(fit)

####################################