## Codes used in session 3 (from the slides)
## To run the code, open this script file in R (File > Open script...),
## select the code you want to run (like selecting text) and press "CTRL + R".

### CORRELATION

## cor(x, y) for two vectors (returns the correlation coefficient only)
data(iris)
cor(iris$Sepal.Length, iris$Sepal.Width)                       # Pearson correlation (the default)
cor(iris$Sepal.Length, iris$Sepal.Width, method = "pearson")   # Pearson correlation, requested explicitly
cor(iris$Sepal.Length, iris$Sepal.Width, method = "spearman")  # Spearman (rank-based) correlation
cor(iris$Sepal.Length, iris$Sepal.Width, method = "kendall")   # Kendall (rank-based) correlation

## cor.test(x, y) for two vectors (returns the coefficient AND a p-value)
data(iris)
cor.test(iris$Sepal.Length, iris$Sepal.Width)
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "pearson")
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "spearman")
cor.test(iris$Sepal.Length, iris$Sepal.Width, method = "kendall")

## cor(x) for a matrix/data frame (correlation matrix with coefficients only)
data(iris)
cor(iris[1:4], method = "spearman")

## Hmisc::rcorr(as.matrix(x)) for a matrix/data frame
## (provides correlation coefficients AND p-values)
# install.packages("Hmisc")   # if not installed yet
library("Hmisc")
data(iris)
(correlations <- rcorr(as.matrix(iris[1:4])))   # outer parentheses print the result

## Options to deal with missing values:
# cor(x, y, use = "pairwise")  # pairwise deletion: each coefficient uses all pairs complete for those two variables
# cor(x, y, use = "complete")  # casewise deletion: drops every observation (row) that has any missing value

## Ways to visualize pairwise comparisons
pairs(iris[1:4], pch = 21)
pairs(iris[1:4], pch = 21, lower.panel = NULL)  # same graph with the lower panel left out
pairs(iris[1:4], pch = 21, upper.panel = NULL)  # same graph with the upper panel left out

# To add color (by species):
my_cols <- c("red", "blue", "green")
pairs(iris[, 1:4], pch = 19, cex = 0.5, col = my_cols[iris$Species], lower.panel = NULL)

library(psych)
pairs.panels(iris, method = "spearman")   # scatterplot matrix with Spearman correlations (psych package)

# install.packages("corrgram")   # if needed
library("corrgram")
corrgram(iris, main = "Iris data",
         lower.panel = panel.pts, upper.panel = panel.conf,
         diag.panel = panel.density)

# install.packages("ellipse")   # if needed
library("ellipse")
corrtab <- cor(iris[1:4])
round(corrtab, 2)
plotcorr(corrtab, mar = c(0.1, 0.1, 0.1, 0.1))

## Correlation matrix with heatmap using the R package "DescTools" (DescTools::PlotCorr()):
# install.packages("DescTools")   # if needed
library("DescTools")   # must be loaded before PlotCorr() can be called
a <- cor(iris[-5])
PlotCorr(a)               # these two lines of two functions are equivalent to:
PlotCorr(cor(iris[-5]))   # heatmap of the correlation matrix (Pearson by default;
                          # can be changed to Spearman etc. within cor())

# Another PlotCorr() example with options:
PlotCorr(cor(mtcars), col = Pal("RedWhiteBlue1", 100), border = "grey",
         args.colorlegend = list(labels = Format(seq(-1, 1, .25), digits = 2), frame = "grey"))
# SEE: https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/PlotCorr for options of PlotCorr()

## Correlation matrix with heatmap using the R package "DataExplorer"
# install.packages("DataExplorer")   # if needed
library("DataExplorer")
plot_correlation(iris)   # correlation matrix with heatmap

### t-TEST

iris2 <- subset(iris[, 4:5], iris$Species == "virginica" | iris$Species == "setosa")  # subset of iris (stacked data frame)
t.test(iris2$Petal.Width ~ iris2$Species)   # stacked (formula) version of the t-test; Species is the grouping variable

iris3 <- subset(iris[, 3:4], iris$Species == "setosa")   # subset of iris (horizontal data frame)
t.test(iris3$Petal.Length, iris3$Petal.Width)            # two-vector version of the t-test

x <- rnorm(50, mean = 100, sd = 15)
t.test(x, mu = 95)   # one-sample t-test with a simulated vector

a <- rnorm(250, 30, 5)        # sample of size 250 from a normal distribution with mean 30 and SD 5
b <- rnorm(250, 25, 7)        # sample of size 250 from a normal distribution with mean 25 and SD 7
t.test(a, b, paired = TRUE)   # paired t-test on the two simulated vectors

# A t-test demonstration:
a <- rnorm(25, 30, 5)   # sample of size 25 from a normal distribution with mean 30 and SD 5
b <- rnorm(25, 25, 7)   # sample of size 25 from a normal distribution with mean 25 and SD 7
t.test(a, b)            # run the t-test
# Change the sample size (first argument) in a and b to show the statistical power concept
# Change the SD (third argument) to show the effect of intra-group variability
# Change the difference between means (second argument) to show the effect of the difference in means (effect size)

# Checking t-test assumptions:
# Normality:      shapiro.test(x)
# Equal variance: var.test(x, y)
# SEE: http://www.sthda.com/english/wiki/f-test-compare-two-variances-in-r
x <- rnorm(100, 30, 5)
y <- runif(100, min = 20, max = 30)
shapiro.test(x)
shapiro.test(y)
var.test(x, y)   # works only for two levels of the grouping factor

# Equal variance across several groups: bartlett.test(variable of interest ~ grouping variable)
bartlett.test(airquality$Wind ~ airquality$Month)

### ANOVA

# General ANOVA recipe (but there are alternatives):
# modelname <- aov(variable of interest ~ as.factor(grouping variable))
# summary(modelname)    # results
# TukeyHSD(modelname)   # post-hoc test

# One-way ANOVA:
data(airquality)   # using the built-in dataset "airquality"
oneway.test(airquality$Wind ~ airquality$Month)   # does wind speed differ significantly over the months? (not assuming equal variance)
boxplot(airquality$Wind ~ airquality$Month, col = "orchid2", border = "red")   # visual check (remember: the boxplot shows the median, not the mean)
oneway.test(airquality$Wind ~ airquality$Month, var.equal = TRUE)   # same test, but assuming equal variance (var.equal = TRUE)
result <- aov(airquality$Wind ~ as.factor(airquality$Month))   # aov() always assumes equal variance
summary(result)

# Complete ANOVA workflow: assumption checks, ANOVA, post-hoc test,
# boxplot, and the non-parametric equivalent (Kruskal-Wallis)
data(iris)
iris_sub1 <- subset(iris[1], iris$Species == "setosa")
shapiro.test(iris_sub1$Sepal.Length)   # normality check within "setosa"
iris_sub2 <- subset(iris[1], iris$Species == "versicolor")
shapiro.test(iris_sub2$Sepal.Length)   # normality check within "versicolor"
iris_sub3 <- subset(iris[1], iris$Species == "virginica")
shapiro.test(iris_sub3$Sepal.Length)   # normality check within "virginica"
bartlett.test(iris$Sepal.Length ~ iris$Species)   # equal-variance check across the three species
result <- aov(iris$Sepal.Length ~ iris$Species)
summary(result)      # ANOVA table
TukeyHSD(result)     # post-hoc pairwise comparisons
boxplot(iris$Sepal.Length ~ iris$Species, col = "orchid2", border = "red")
kruskal.test(iris$Sepal.Length ~ iris$Species)   # non-parametric equivalent
plot(result)         # diagnostic plots for the fitted model

### REGRESSION

# Standard (linear) regression model formula:
# modelname <- lm(y ~ x1 + x2 + x3, data = dataframe)
# Results are obtained by running summary(modelname) and plot(modelname).

# Linear regression (this is a multivariable regression example);
# the model object will be called "fit"
data(iris)
fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, data = iris)
summary(fit)
anova(fit)            # ANOVA results for the same model
par(mfrow = c(2, 2))  # arrange the four diagnostic plots in a 2 x 2 grid
plot(fit)             # diagnostic plots; when run interactively, press ENTER to step through them
hist(fit$resid)   # histogram of the residuals of the linear model fitted above

# Logistic regression (when the outcome variable is binary, like yes or no)
# Standard logistic regression recipe:
# modelname <- glm(binaryoutcomevariable ~ predictor1 + predictor2 + predictor3,
#                  data = dataframe, family = binomial())
#                  # binomial() denotes logistic regression (binary outcome)
# summary(modelname)                       # display results
# confint(modelname)                       # 95% CI for the coefficients
# exp(coef(modelname))                     # exponentiated coefficients (effect size, e.g. odds ratio)
# exp(confint(modelname))                  # 95% CI for the exponentiated coefficients
# predict(modelname, type = "response")    # predicted values
# residuals(modelname, type = "deviance")  # deviance residuals

# Example for logistic regression
data(infert)   # the outcome variable is "case", which is binary; the model object will be "fit"
fit <- glm(case ~ age + parity + induced + spontaneous,
           data = infert, family = binomial())
summary(fit)
confint(fit)
exp(coef(fit))
exp(confint(fit))
predict(fit, type = "response")
residuals(fit, type = "deviance")
par(mfrow = c(2, 2))
plot(fit)   # diagnostic plots; when run interactively, press ENTER to step through them
# The variable "spontaneous" is the strongest predictor of the outcome
# (spontaneous: coeff = 1.92534, SE = 0.29863, Z = 6.447, P = 1.14e-10, OR = 6.3)

# Let's run a ROC analysis:
# install.packages("pROC")   # if not installed
library("pROC")
plot.roc(infert$case, infert$spontaneous, print.auc = TRUE, ci = TRUE)   # yields an AUC value of 0.695
# The variable "induced" is the second strongest predictor of the outcome
plot.roc(infert$case, infert$induced, print.auc = TRUE, ci = TRUE)       # yields an AUC value of 0.509

# AUC analysis (C-statistic) using a different package (DescTools) for the whole model:
# SEE: https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/Cstat (for DescTools::Cstat)
# install.packages("DescTools")   # if not installed
library(DescTools)
Cstat(fit)

####################################