## Codes used in session 3 (from the slides)
## To run the code, open this script file in R (File > Open script...),
## select the code you want to run (like selecting text) and press "CTRL + R".

## Generating series of numbers (vectors) of different kinds ----
1:100                            # Sequential integers from 1 to 100
seq(from = 1, to = 3, by = 0.1)  # Sequence from 1 to 3 increasing by 0.1
runif(4)                         # Four random numbers in the default range 0 to 1
runif(10, min = 1, max = 100)    # Ten random numbers between 1 and 100
# Template: rnorm(n, mean = 0, sd = 1) draws "n" random numbers from a normal
# distribution (defaults: mean = 0, sd = 1). Concrete example with n = 25:
rnorm(25, 30, 5)            # Sample of size 25, mean 30, standard deviation 5
a <- rbinom(100, 10, 0.5)   # 100 people tossing a fair coin (prob = 0.5) 10 times
mean(a)                     # Should be close to 10 * 0.5 = 5
a <- c(1, 5, 3, 8)          # Vector of these numbers, assigned to object 'a'

######################
## Let's generate a dataset ----
set.seed(12345)             # Makes the simulated data reproducible
id    <- 1:242
aget  <- rnorm(n = 242, mean = 52, sd = 6)
sbp1t <- rnorm(n = 242, mean = 182, sd = 5)
sbp2t <- rnorm(n = 242, mean = 152, sd = 6)
bmit  <- rnorm(n = 242, mean = 28, sd = 4)
diett <- rbinom(242, 1, 0.8)
owt   <- rbinom(242, 1, 0.6)
data <- data.frame(id, aget, sbp1t, sbp2t, bmit, diett, owt)
head(data)
str(data)

# Round the continuous variables to one decimal
data$age  <- round(data$aget,  digits = 1)
data$sbp1 <- round(data$sbp1t, digits = 1)
data$sbp2 <- round(data$sbp2t, digits = 1)
data$bmi  <- round(data$bmit,  digits = 1)

# Convert the binary (categorical) variables to factors
data$diet <- as.factor(data$diett)
data$ow   <- as.factor(data$owt)

# Two additional three-level variables for trend analysis
data$trend1 <- rbinom(242, 2, 0.6)
data$trend2 <- rbinom(242, 2, 0.3)
head(data)
tail(data)
str(data)

######################
## Generate a dataset/dataframe for a case-control study ----
# gl() generates a factor: here 2 levels, each repeated 50 times.
cc <- data.frame(
  case.id = 1:100,
  age     = rnorm(100, mean = 60, sd = 12),
  caco    = gl(2, 50, labels = c("Case", "Control"))
)
summary(cc)  # Summary descriptive statistics of the newly generated dataset

## Generate a dataset with multiple random variables ----
set.seed(955)  # set.seed() makes the simulated dataset the same every time
vvar <- 1:20 + rnorm(20, sd = 3)
wvar <- 1:20 + rnorm(20, sd = 5)
xvar <- 20:1 + rnorm(20, sd = 3)
yvar <- (1:20) / 2 + rnorm(20, sd = 10)
zvar <- rnorm(20, sd = 6)
data <- data.frame(vvar, wvar, xvar, yvar, zvar)
head(data)

## Create a 2x2 contingency table ----
x <- matrix(c(22, 46, 66, 58), nrow = 2)  # Values fill column by column
# Alternatively, use the data editor (interactive -- skip in batch runs):
x <- data.frame()  # Assign a name to the contingency table to be created
fix(x)             # Opens the data editor to enter the cell values (r x c)
x                  # Prints the newly created contingency table

## Factors as grouping variables (with levels) ----
data(iris)
str(iris)  # Shows "Species" as a factor.
# If it weren't a factor but we needed one for analysis, we would convert it:
iris$Species <- as.factor(iris$Species)
str(iris)  # Still shows a factor (it already was one)

data(infert)
str(infert)                            # "case" is stored as a numeric variable
infert$case <- as.factor(infert$case)  # Convert "case" from numeric to factor
str(infert)                            # Now shows "case" as a factor
class(infert$case)                     # Check the class of the case variable

## apply() ----
data(iris)
apply(iris[-5], 2, mean)  # "2" (the second argument) means apply over columns

## tapply() ----
data(iris)
tapply(iris$Petal.Length, iris$Species, mean)

## aggregate() ----
data(iris)
aggregate(. ~ Species, iris, var)  # var = variance

## with() ----
data(infert)
# parity is a count variable; case is the case-control indicator (grouping
# factor). Inside with() the columns are referenced directly by name.
with(infert, t.test(parity ~ case))

## by() ----
data(iris)
by(iris$Sepal.Length, iris$Species, summary)  # Sepal.Length summary per Species

## Generating two (2x2) contingency tables (ct1 and ct2) ----
ct1 <- matrix(c(43, 28, 42, 58), nrow = 2)
ct2 <- matrix(c(53, 38, 52, 78), nrow = 2)

# Pearson's chi-squared test:
chisq.test(ct1)  # Using a contingency table
chisq.test(ct2)  # Using a contingency table

# Fisher's exact test:
fisher.test(ct1)  # Using a contingency table
fisher.test(ct2)  # Using a contingency table

# chisq.test() and fisher.test() also accept the two binary variables of a
# data frame directly (templates -- binary1 is the exposure, binary2 the
# outcome; not run here because no such variables exist in this script):
# chisq.test(binary1, binary2)
# fisher.test(binary1, binary2)

## Cochran-Mantel-Haenszel test: combined analysis of several tables ----
# First, put the contingency tables (here ct1 and ct2) in an array;
# dim = c(2, 2, 2) denotes two 2x2 tables.
array1 <- array(c(ct1, ct2), dim = c(2, 2, 2))
# Install the package "lawstat" if needed, then load it:
if (!requireNamespace("lawstat", quietly = TRUE)) install.packages("lawstat")
library(lawstat)
cmh.test(array1)
# (Base R provides the same test as mantelhaen.test(array1).)

## Dose-response (trend) test using base R ----
# Requires data from at least 3 groups to check for a linear change (trend)
# in the effect size.
# Generate a 5x2 table as a matrix:
matrix1 <- matrix(c(35, 82, 250, 293, 196, 190, 136, 71, 32, 13),
                  nrow = 5, byrow = TRUE)
colnames(matrix1) <- c("cases", "controls")
rownames(matrix1) <- c("0", "1", "2", "3", "4+")
matrix1  # Inspect the matrix with row and column names included

# prop.trend.test() needs x (event counts per group) and n (group totals):
sums <- rowSums(matrix1)
prop.trend.test(x = matrix1[, "cases"], n = sums)

# Another example of prop.trend.test() for a 4x2 table:
prop.trend.test(x = c(47, 31, 9, 5), n = c(49, 40, 18, 10))

## Dose-response (trend) test using the R package "DescTools" ----
# https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/MHChisqTest
if (!requireNamespace("DescTools", quietly = TRUE)) install.packages("DescTools")
library(DescTools)

# A r x c (4x4) table for Job Satisfaction (from Agresti 2002, p. 57):
job <- matrix(
  c( 1,  2,  1,  0,
     3,  3,  6,  1,
    10, 10, 14,  9,
     6,  7, 12, 11),
  4, 4,
  dimnames = list(
    income       = c("< 15k", "15-25k", "25-40k", "> 40k"),
    satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")
  )
)
job  # Inspect the 4x4 table
# MH chi-squared test for trend (degrees of freedom = 1), with row scores:
MHChisqTest(job, srow = c(7.5, 20, 32.5, 60))

# Another example of MHChisqTest(), reusing matrix1 created above;
# it yields a very similar result to prop.trend.test() for matrix1:
MHChisqTest(matrix1)

## Chi-squared test as a goodness-of-fit test (using Mendel's results) ----
# The first four numbers are the observed counts; 0.25 is the expected
# proportion for each of the four categories.
chisq.test(c(26, 31, 26, 27), p = c(0.25, 0.25, 0.25, 0.25))

## McNemar test for matched data analysis ----
# Assuming the data in ct1 come from a matched design (e.g. controls matched
# to cases by age and gender):
mcnemar.test(ct1)

## Z-test for proportions (cf. chi-squared or Fisher's exact test) ----
# One-sample: 76 observed out of a sample of 100, expected proportion 0.50:
prop.test(76, 100, 0.50)
# Two-sample: the proportions compared are 76/100 and 50/100:
prop.test(c(76, 50), c(100, 100))
# If the continuity correction is not wanted, add the argument correct = FALSE.

### For a complete analysis of a 2x2 table, run the script "contingency.R" ###