## Codes used in session 3 (from the slides)
## To run the code, open this script file in R (File > Open script...),
## select the code you want to run (like selecting text) and press "CTRL + R".

## Generating series of numbers (vectors) of different kinds ----
1:100                            # Sequential integers from 1 to 100
seq(from = 1, to = 3, by = 0.1)  # Sequence from 1 to 3 increasing by 0.1
runif(4)                         # Four random numbers in the default range 0 to 1
runif(10, min = 1, max = 100)    # Ten random numbers between 1 and 100
# Template: rnorm(n, mean = 0, sd = 1) draws "n" random numbers from a normal
# distribution (defaults: mean = 0, sd = 1). Concrete example with n = 25:
rnorm(25, 30, 5)            # Sample of size 25, mean 30, standard deviation 5
a <- rbinom(100, 10, 0.5)   # 100 people tossing a fair coin (prob = 0.5) 10 times
mean(a)                     # Should be close to 10 * 0.5 = 5
a <- c(1, 5, 3, 8)          # Vector of these numbers, assigned to object 'a'

######################
## Let's generate a dataset ----
set.seed(12345)             # Makes the simulated data reproducible
id    <- 1:242
aget  <- rnorm(n = 242, mean = 52, sd = 6)
sbp1t <- rnorm(n = 242, mean = 182, sd = 5)
sbp2t <- rnorm(n = 242, mean = 152, sd = 6)
bmit  <- rnorm(n = 242, mean = 28, sd = 4)
diett <- rbinom(242, 1, 0.8)
owt   <- rbinom(242, 1, 0.6)
data <- data.frame(id, aget, sbp1t, sbp2t, bmit, diett, owt)
head(data)
str(data)

# Round the continuous variables to one decimal
data$age  <- round(data$aget,  digits = 1)
data$sbp1 <- round(data$sbp1t, digits = 1)
data$sbp2 <- round(data$sbp2t, digits = 1)
data$bmi  <- round(data$bmit,  digits = 1)

# Convert the binary (categorical) variables to factors
data$diet <- as.factor(data$diett)
data$ow   <- as.factor(data$owt)

# Two additional three-level variables for trend analysis
data$trend1 <- rbinom(242, 2, 0.6)
data$trend2 <- rbinom(242, 2, 0.3)
head(data)
tail(data)
str(data)

######################
## Generate a dataset/dataframe for a case-control study ----
# gl() generates a factor: here 2 levels, each repeated 50 times.
cc <- data.frame(
  case.id = 1:100,
  age     = rnorm(100, mean = 60, sd = 12),
  caco    = gl(2, 50, labels = c("Case", "Control"))
)
summary(cc)  # Summary descriptive statistics of the newly generated dataset

## Generate a dataset with multiple random variables ----
set.seed(955)  # set.seed() makes the simulated dataset the same every time
vvar <- 1:20 + rnorm(20, sd = 3)
wvar <- 1:20 + rnorm(20, sd = 5)
xvar <- 20:1 + rnorm(20, sd = 3)
yvar <- (1:20) / 2 + rnorm(20, sd = 10)
zvar <- rnorm(20, sd = 6)
data <- data.frame(vvar, wvar, xvar, yvar, zvar)
head(data)

## Create a 2x2 contingency table ----
x <- matrix(c(22, 46, 66, 58), nrow = 2)  # Values fill column by column
# Alternatively, use the data editor (interactive -- skip in batch runs):
x <- data.frame()  # Assign a name to the contingency table to be created
fix(x)             # Opens the data editor to enter the cell values (r x c)
x                  # Prints the newly created contingency table

## Factors as grouping variables (with levels) ----
data(iris)
str(iris)  # Shows "Species" as a factor.
# If it weren't a factor but we needed one for analysis, we would convert it:
iris$Species <- as.factor(iris$Species)
str(iris)  # Still shows a factor (it already was one)

data(infert)
str(infert)                            # "case" is stored as a numeric variable
infert$case <- as.factor(infert$case)  # Convert "case" from numeric to factor
str(infert)                            # Now shows "case" as a factor
class(infert$case)                     # Check the class of the case variable

## apply() ----
data(iris)
apply(iris[-5], 2, mean)  # "2" (the second argument) means apply over columns

## tapply() ----
data(iris)
tapply(iris$Petal.Length, iris$Species, mean)

## aggregate() ----
data(iris)
aggregate(. ~ Species, iris, var)  # var = variance

## with() ----
data(infert)
# parity is a count variable; case is the case-control indicator (grouping
# factor). Inside with() the columns are referenced directly by name.
with(infert, t.test(parity ~ case))

## by() ----
data(iris)
by(iris$Sepal.Length, iris$Species, summary)  # Sepal.Length summary per Species

## Generating two (2x2) contingency tables (ct1 and ct2) ----
ct1 <- matrix(c(43, 28, 42, 58), nrow = 2)
ct2 <- matrix(c(53, 38, 52, 78), nrow = 2)

# Pearson's chi-squared test:
chisq.test(ct1)  # Using a contingency table
chisq.test(ct2)  # Using a contingency table

# Fisher's exact test:
fisher.test(ct1)  # Using a contingency table
fisher.test(ct2)  # Using a contingency table

# chisq.test() and fisher.test() also accept the two binary variables of a
# data frame directly (templates -- binary1 is the exposure, binary2 the
# outcome; not run here because no such variables exist in this script):
# chisq.test(binary1, binary2)
# fisher.test(binary1, binary2)

## Cochran-Mantel-Haenszel test: combined analysis of several tables ----
# First, put the contingency tables (here ct1 and ct2) in an array;
# dim = c(2, 2, 2) denotes two 2x2 tables.
array1 <- array(c(ct1, ct2), dim = c(2, 2, 2))
# Install the package "lawstat" if needed, then load it:
if (!requireNamespace("lawstat", quietly = TRUE)) install.packages("lawstat")
library(lawstat)
cmh.test(array1)
# (Base R provides the same test as mantelhaen.test(array1).)

## Dose-response (trend) test using base R ----
# Requires data from at least 3 groups to check for a linear change (trend)
# in the effect size.
# Generate a 5x2 table as a matrix:
matrix1 <- matrix(c(35, 82, 250, 293, 196, 190, 136, 71, 32, 13),
                  nrow = 5, byrow = TRUE)
colnames(matrix1) <- c("cases", "controls")
rownames(matrix1) <- c("0", "1", "2", "3", "4+")
matrix1  # Inspect the matrix with row and column names included

# prop.trend.test() needs x (event counts per group) and n (group totals):
sums <- rowSums(matrix1)
prop.trend.test(x = matrix1[, "cases"], n = sums)

# Another example of prop.trend.test() for a 4x2 table:
prop.trend.test(x = c(47, 31, 9, 5), n = c(49, 40, 18, 10))

## Dose-response (trend) test using the R package "DescTools" ----
# https://www.rdocumentation.org/packages/DescTools/versions/0.99.19/topics/MHChisqTest
if (!requireNamespace("DescTools", quietly = TRUE)) install.packages("DescTools")
library(DescTools)

# A r x c (4x4) table for Job Satisfaction (from Agresti 2002, p. 57):
job <- matrix(
  c( 1,  2,  1,  0,
     3,  3,  6,  1,
    10, 10, 14,  9,
     6,  7, 12, 11),
  4, 4,
  dimnames = list(
    income       = c("< 15k", "15-25k", "25-40k", "> 40k"),
    satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")
  )
)
job  # Inspect the 4x4 table
# MH chi-squared test for trend (degrees of freedom = 1), with row scores:
MHChisqTest(job, srow = c(7.5, 20, 32.5, 60))

# Another example of MHChisqTest(), reusing matrix1 created above;
# it yields a very similar result to prop.trend.test() for matrix1:
MHChisqTest(matrix1)

## Chi-squared test as a goodness-of-fit test (using Mendel's results) ----
# The first four numbers are the observed counts; 0.25 is the expected
# proportion for each of the four categories.
chisq.test(c(26, 31, 26, 27), p = c(0.25, 0.25, 0.25, 0.25))

## McNemar test for matched data analysis ----
# Assuming the data in ct1 come from a matched design (e.g. controls matched
# to cases by age and gender):
mcnemar.test(ct1)

## Z-test for proportions (cf. chi-squared or Fisher's exact test) ----
# One-sample: 76 observed out of a sample of 100, expected proportion 0.50:
prop.test(76, 100, 0.50)
# Two-sample: the proportions compared are 76/100 and 50/100:
prop.test(c(76, 50), c(100, 100))
# If the continuity correction is not wanted, add the argument correct = FALSE.

### For a complete analysis of a 2x2 table, run the script "contingency.R" ###