## Codes used in session 2 (from the slides) ## To run the code, open this script file on R (File > Open script..), and select the code you want to run (like selecting text) and press "CTRL + R". ### Descriptive statistics with R: Visual Exploration hist(iris$Sepal.Width, col = "darkblue") # Histogram (to check the bell's curve shape) plot(iris$Sepal.Width, col = "darkblue") # Plot (not really useful for this purpose unless it is density plot) boxplot(iris$Sepal.Width, col = "darkblue") # Boxplot: Useful to see a lot of data in one graph ## Boxplot: boxplot(iris$Sepal.Length, iris$Sepal.Width, iris$Petal.Length, iris$Petal.Width) # Selected columns specifically indicated boxplot(iris[-5]) # All except the fifth column boxplot(iris[-5], col = "orchid", border = "red") # Like above with box color = orchid1 and border color = red boxplot(iris[-5], col = "azure", border = "blue") # # Like above with box color = azure and border color = blue boxplot(iris$Sepal.Width, col = "orchid2", pch = 24, main = "My First Boxplot", col.axis ="blue", col.lab ="red", col.main ="darkblue", boxwex = 0.2) # With lots of options # Repeat the last command by changing the numbers for pch (plot symbol) and boxwex (box width expansion) boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="red", col.lab ="red", col.main ="orchid", border = "red", boxwex = 0.8) # Adding cex = 1.5 to increase the size of symbols, x and y axis labels (with colors set to orchid) boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm") # Notched boxplot: boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm", notch = TRUE) # Horizontal boxplot: boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm", horizontal = TRUE) # Parallel or perpendicular axis labels with las option labels are parallel (las=0) or perpendicular(las=1) to axis) boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm", las = 0) # default option windows() # to allow comparison of the two graphs boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm", las = 1) # perpendicular labels # To remove outliers (from the graph) with the option outline=FALSE: boxplot(iris$Sepal.Width, col = "orchid2", pch = 18, main = "My First Fancy Boxplot", col.axis ="orchid", col.lab ="orchid", col.main ="orchid", border = "red", boxwex = 1, cex = 1.5, xlab = "Sepal Width", ylab = "cm", outline = FALSE) # Add a horizontal line at the desired y axis value: data(iris) boxplot(iris[-5]) abline(h=2.5, col = "red") # The three line code above is equivalent to: boxplot(iris[-5]); abline(h=2.5, col = "red") # single line command # With added options: boxplot(iris$Sepal.Width ~ iris$Species, col = "orchid2", border = "red"); abline(h = 3.1, col = "red") # Boxplots of a variable for each category (factor): boxplot(iris$Sepal.Length ~ iris$Species) ## Save your boxplot using the menu on the graphics window: # File > Save As > TIFF or "Copy to the Clipboard" # Saving via the menu does not have the flexibility of setting up the pixels. Better to do it via code (for high resolution set height and width to at least 1200 pixels) ## If you want to save your plot by code, try: tiff("myplot.tiff") # If you want to save it as TIFF boxplot(iris[-5]) # The plot you want to generate and save (which will not appear on the screen). No arguments used here. dev.off() # Device off (the plot will be saved in the working directory) # OR in one line: tiff("myplot.tiff"); boxplot(iris[-5]); dev.off() ## If you want to save the plot at a resolution different from the default, see the script: tiff.R # A sample scatter plot with lots of options: a <- rnorm(50) b <- rnorm(50) plot(a, b, pch = 16, col = "red", xlim = c(-3, 3), ylim = c(-3, 3), main = "MY PLOT", xlab = "X VARIABLE" , ylab = "Y VARIABLE" , col.axis="blue", col.lab="red", col.main="darkblue") ### Descriptive statistics with R: Statistical metrics (base R) # For each vector: mean(iris$Sepal.Length) sd(iris$Sepal.Length) var(iris$Sepal.Length) median(iris$Sepal.Length) mad(iris$Sepal.Length) # mad = median absolute deviation (from the median) ~ SD in normally distributed data IQR(iris$Sepal.Length) # interquartile range For kurtosis and skewness, another package is needed: install.packages("moments"); library("moments"); skewness(iris$Sepal.Length); kurtosis(iris$Sepal.Length) # This one line command installs and loads the library "moments" and calculates kurtosis # A cumulative summary of some of the above in one command: summary(iris$Sepal.Length) # Package: psych install.packages("psych") library("psych") a <- c(2,4,3,4,5,6,5,4,5,6,7,4,3,5,4,6,5,7,6,5,4,5,6,7) describe(a) # summary statistics for the vector "a" describe(iris[-5]) # summary statistics for the numerical variables in the dataframe "iris" describe(iris[-5], IQR = TRUE, quant = c(0.10, 0.25, 0.50, 0.75, 0.90)) # more detailed summary statistics for the numerical variables in the dataframe "iris" # You can also obtain summary statistics by a grouping factor (like Species in iris): describeBy(iris, group = iris$Species) # Package: fBasics install.packages("fBasics") library("fBasics") a <- c(2,4,3,4,5,6,5,4,5,6,7,4,3,5,4,6,5,7,6,5,4,5,6,7) basicStats(a) basicStats(iris[-5]) # Package: pastecs stat.desc(x, basic=TRUE, desc=TRUE, norm=FALSE, p=0.95) install.packages ("pastecs") library("pastecs") x <- c(2,4,3,4,5,6,5,4,5,6,7,4,3,5,4,6,5,7,6,5,4,5,6,7) stat.desc(x) # Try the stat.desc() function with a full dataframe (not a vector): stat.desc(iris, norm = TRUE) # measures of normal distribution included # Package: Hmisc install.packages("Hmisc") library("Hmisc") x <- c(2,4,3,4,5,6,5,4,5,6,7,4,3,5,4,6,5,7,6,5,4,5,6,7) describe(x) describe(iris) # Package: epiR for basic stats for aritmetic and geometric scales install.packages("epiR") library("epiR") data(iris) epi.descriptives(iris$Sepal.Length, conf.level = 0.95) # only works for vectors; do not try for the whole dataframe "iris" ## Normality checking with histogram and density plots in base R: data(iris) hist(iris$Sepal.Width, col = "orchid2") windows() d <- density(iris$Sepal.Width); plot(d, col = "azure") # OR to have both plots in one graph: par(mfrow=c(1,2)) hist(iris$Sepal.Width, col = "orchid2") d <- density(iris$Sepal.Width); plot(d, col = "azure") # Histogram and density plots for normality checking in the whole dataframe using DataExplorer install.packages("DataExplorer") library("DataExplorer") data(iris) plot_histogram(iris) # Ideal for eye test for normal distribution of all variables in one multipanel graph plot_density(iris) # Ideal for eye test for normal distribution of all variables in one multipanel graph ## You can have the two plots simultaneously on two windows: plot_histogram(iris); windows(); plot_density(iris) # Three lines of code is in one line separated by semicolons ## Histogram overlaid with density plots using ggplot2: # https://www.r-bloggers.com/how-to-make-a-histogram-with-ggplot2/ # http://www.sthda.com/english/wiki/ggplot2-histogram-plot-quick-start-guide-r-software-and-data-visualization ## Histogram overlaid with density plot with ggplot2: library("ggplot2") ggplot(iris, aes(x=Sepal.Length)) + geom_histogram(aes(y=..density..), colour="red", fill="orchid2") + geom_density(alpha=.2, fill="lightskyblue3") ## Simple Fast Exploratory Data Analysis in R with DataExplorer Package install.packages("DataExplorer") library("DataExplorer") data(iris) plot_str(iris) plot_missing(iris) plot_histogram(iris) # Ideal for eye test for normal distribution of all variables in one multipanel graph plot_density(iris) # Ideal for eye test for normal distribution of all variables in one multipanel graph plot_correlation(iris) # All pairwise correlations with heatmap # For options of plot_correlation(), see: https://www.rdocumentation.org/packages/DataExplorer/versions/0.6.0/topics/plot_correlation plot_bar(iris) # Bar graph for categorical variables # Next command to create a full report requires "installr" and then installation of "pandoc" with installr: install.packages("installr") library("installr") install.pandoc() # If successful (may have 32byte - 64byte issues): create_report(iris) # Runs all the functions above and creates a full report in HTML format ## Normality checking with QQ plots using base R: qqnorm(iris$Sepal.Width, col = "darkblue"); qqline(iris$Sepal.Width, col = "orchid") qqnorm(iris$Petal.Length, col = "darkblue"); qqline(iris$Petal.Length, col = "orchid") ## Normality checking with statiscs: # Shapiro-Wilk test: shapiro.test(iris$Sepal.Width) shapiro.test(iris$Petal.Length) # Same test on simulated vectors: set.seed(334) # To get the same simulated numbers every time a <- rnorm(100, 30, 5) # To generate a sample of size 25 from a normal distribution with mean 30 and standard deviation 5 shapiro.test(a) # To run the Shapiro-Wilk test on vector "a" set.seed(334) b <- runif(100, min = 20, max = 30) # To generate a sample size of 25 random numbers between 20 and 30 shapiro.test(b) # To run the Shapiro-Wilk test on vector "a" # Kolmogorov-Smirnov test: ks.test(iris$Sepal.Width, pnorm, mean(iris$Sepal.Width), sd(iris$Sepal.Width)) # To run the K-S test on Sepal.Width ks.test(iris$Petal.Length, pnorm, mean(iris$ Petal.Length), sd(iris$ Petal.Length)) # To run the K-S test on Petal.Length ### WELL DONE ### * * * * * * * ## Additional notes on relevant graphics: ## HISTOGRAMS with base R::lattice # Histogram by a grouping factor using lattice included in base R: data(infert) library(lattice) histogram(~ parity | case, data = infert) # parity is a count variable (a factor), and case is the case-control status variable histogram(~ spontaneous | case, data = infert) # spontaneous is a count variable (a factor), and case is the case-control status variable # Multiple histograms in the same panel: library(lattice) data(infert) histogram( ~ parity + spontaneous + induced, data = infert) ## GGPLOT2: # Correlation scatter plot by a grouping factor (in different colours): library("ggplot2") data(iris) qplot(Petal.Width, Petal.Length, color = Species, data = iris) ## Pie chart using base R: v <- c(88, 76, 52, 26, 12) pie(v) pie(v, main = "Number of e-mails received", col = rainbow(length(v)), labels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")) ## Barplot example (with labels and colours): v <- c(88, 76, 52, 26, 12) barplot(v) barplot(v, main = "Number of e-mails received", xlab = "Days", ylab = "Count", col = rainbow(length(v)), names.arg = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), border = "blue") ## 3D piechart: install.packages("plotrix") # installs the package (called "plotrix" in this case) library(plotrix) # calls the package (called "plotrix") v <- c(2,6,4,9,10,1,12,3,5) # generates a vector called "v" pie3D(v) # generates a 3D piechart of "v" (D = capital letter!) ## R Color Chart (http://www.endmemo.com/program/R/color.php): R has 657 built-in color names. The function colors() will show all of them. All these color names can be used in plot parameters like col=. The function col2rgb() can convert all these colors into RGB numbers. ## PCH Symbols Chart lists PCH symbols used in R plot (http://www.endmemo.com/program/R/pchsymbols.php). When the PCH is 21-25, the parameter "col=" and "bg=" should be specified. PCH can also be in characters, such as "#", "%", "A", "a", and the character will be plotted. ## Creating Heat Maps and Contour Plots (Chapter 8 of R Graph Cookbook_PACKTPUB.pdf; p.181) ## Finalizing graphs for publications and presentations (Chapter 10 of R Graph Cookbook_PACKTPUB.pdf; p.223) ## R Graph Essentials_PACKTPUB.pdf