#####     Introduction to R: basic commands and objects     #####
#
#
# From menu do the following
#   Session --> Set Working Directory --> To Source File Location
# 
# R may be used as a simple calculator:
5 + 9 # +, -, *, /, ^ , sqrt() and log() are the most common operators...
# Set a scalar
x <- 10
# Look at the object
x
print(x)
# We may use "=" instead of the assignment "<-".
x = 10
x
#
#### Set a vector
v <- c(3,4,5)
# c stands for concatenate
#
## Assignment 1
## Create the following vector a: a = (4, 7, 6, 8)
#
# Creating sequences of numbers
1:7
vec.1 <- 1:7
vec.1
?seq
b <- seq(from = 1, to = 7, by = 0.5)
length(b)
class(b)
b
# 
# back to vector v
v
class(v)
v <- as.integer(v)
class(v)
#
# Vectors & logical values
l <- c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE)
l
class(l)
summary(l)
# Logical values may be expressed as dummy variables (0, 1)
l1 <- as.numeric(l)
class(l1)
l1
summary(l1)
# Transformation to "character" class is also possible
l2 <- as.character(l)
class(l2)
l2
summary(l2)
#
# Factors
#
Gender <- c("Male", "Female", "Female")
Gender
class(Gender)
# Factors are used to represent categorical data.
# Factors are useful in modelling, e.g. using lm() and glm()
genderFactors <- as.factor(Gender)
genderFactors
class(genderFactors)
summary(genderFactors)
#
# A vector may only contain objects of the same class
# One exception is a list - special type of vector that
# may contain elements of different classes:
vect <- c(1L, "a", TRUE, 20.5)
vect
class(vect)
#
list1 <- list(1L, "a", TRUE, 20.5)
class(list1)
list1
str(list1)
#
# Subsetting: Access to different elements of a vector
b
# Vector elements are indexed starting from 1 (not from 0!)
b[1]
b[1:3]
b[3:5]
b[5:length(b)]
b[c(1, 4, 5)]
# Elements not-to-be-returned may be excluded using the minus sign:
b[-1]
b[-(2:4)]
# Elements returned in reverse order:
b[5:1]
#
# List and delete variables within the Global Environment
ls()
rm("b")
rm(l)
# The following command empties the whole Global Environment!
rm(list = ls())
#
##### Matrices
#
# Matrix: set matrix, assign names to columns/rows, access elements, cols/rows
?matrix
M1 <- matrix( c(1,2,3,4,5,6,7,8,9), ncol=3)
# By default, R creates matrices by filling in columns.
# This may be changed using the byrow=TRUE argument
M2 <- matrix( c(1,2,3,4,5,6,7,8,9), byrow=TRUE, ncol=3)
M1
M2
dim(M1)
summary(M1) # columns treated as variables, rows as observations
#
#
?colnames
colnames(M1) <- c("variable_1", "variable_2", "variable_3")
M1
colnames(M1)
summary(M1)
#
## Assignment 2
## Use the function rownames (see ?rownames) to name the rows in M1
## as "observation_1", ...
#
M1
#
# 
##### Summing, multiplying and other basic operations among scalars
#
?rnorm() # distributions and random number will be discussed separately
#
x <- 10 + rnorm(1, mean=0, sd=2)
x
x*x
x^3
x+x
sqrt(x)
log(x)
exp(x)
#
x1 <- -2:2
x1
sign(x1)
#
diff(x1)
#
# Special "values": NaN and Inf
0/0 # Not a Number, undefined value
1/0 # Infinity
-1/0
#
##### Summing and multiplying scalars and vectors
#
a <- c(3, 7, 11)
b <- 1:17
d <- 1:6 # named 'd' (not 'c') so the vector does not shadow the c() function
# Let's have a look at the data
x
a
d
b
x * a # each element of vector a is multiplied by scalar x
a * d # elements are multiplied pairwise; the shorter vector a is recycled twice
a * b # element-wise multiplication is performed even if the length
      # of the longer vector is not a multiple of the shorter one;
      # a warning message is displayed
#
#
##### Matrix multiplication operator is %*%
#
M3 <- M1%*%M2 
M3
M2%*%a # Multiply by a vector, vectors are "column" by default
t(M2) # Transpose
M1 - t(M2)
#
#
#
##### Subsetting a matrix
#
M2
M2[2,3] # the element in 2nd row and 3rd column is returned
# by default, this element is returned as a vector of length 1
# this may be turned off by using the argument drop=FALSE
M2[2,3, drop=FALSE]
# 
M2[,2] # missing indices: whole row/column is returned 
M2[, 2, drop=FALSE]
M2[1,]
M2[1, , drop=FALSE]
M2[1:2,]
#
#
##### Loops
# If you want to perform an operation multiple times, you will use
# a for loop or a while loop
#
# for loop   -> you know how many times you want to do the operation
# while loop -> you don't know how many times, but you have a condition
#               that will turn FALSE when the loop should stop

# Easy loop, prints 1, 5, 6 (one value per iteration)
for (j in c(1,5,6)) {
  print(j)
}
#
# The loop above is equivalent to running the following commands
j = 1
{print(j)}
j = 5
{print(j)}
j = 6
{print(j)}
#
# The iterating vector may hold any values, e.g. character strings
for (i in c("Hello","World!")) {
  print(i)
}
# When do you use a for loop? When you know how many times it will run!
# When do you use for loop? When you know how many times it will run!
#
#
# A while loop repeats its body for as long as its condition evaluates to TRUE
#
# First, we set i equal to 2
i <- 2 
while (i < 100000) { 
  i <- i^2  
  print(i) # Prints the value to Console
}
# The condition (i < 100000) is checked before every iteration: it is TRUE
# while i < 100000 and FALSE otherwise, so the loop stops as soon as
# i >= 100000. Here i takes the values 4, 16, 256, 65536 and 4294967296;
# the last value is still printed, because the check only happens afterwards.
# What if you change the code and set i = 1?
#
#
#
##### Conditionals (Control structure if-else)
#
# Evaluate and execute the code only if some condition is evaluated to TRUE
#
# i is 100 plus one random draw from N(0, sd = 10), so either branch may run
i <- 100 + rnorm(1, mean = 0, sd = 10)
if (i > 100) {
    # c() builds a character vector with three separate elements ...
    print(c("i =", i, "and i is bigger than 100."))
    # ... whereas paste() joins the pieces into one single string
    print(paste("i =", i, "and  i is bigger than 100."))
} else {
    print (c("i =", i, "and i is lower or equal to 100."))
    print(paste("i =", i, "and i is lower or equal to 100."))
}
# 
# Indentation should be observed... 
# 
#### c() vs. paste()
#
?paste
date()
Date <- paste("Today is", date())
Date
str(Date)
class(Date)
#
Date2 <- c("Today is", date())
Date2
str(Date2)
#
#
#
#### Functions - basic rules and examples
#
# Declare a simple function - no arguments
myFunction1 <- function() {
  # Draws 30 random values from a normal distribution with mean 10 and
  # sd 0.5 (instead of rnorm()'s default mean 0 and sd 1) and prints them.
  # print() hands the values back invisibly, so they are also the result.
  print(rnorm(n = 30, mean = 10, sd = 0.5))
}
# Run a function
myFunction1()
#
#
# Arguments are usually passed to (used in) functions:
# 
myFunction4 <- function(n) {
  # Prints n random values from a normal distribution (mean 10, sd 0.5).
  # The seed is reset on every call, so the same n always gives the same
  # numbers; note that this also resets the session's random number state.
  set.seed(1)
  print(rnorm(n = n, mean = 10, sd = 0.5))
}
# This function creates 'n' normally distr. random numbers with mean=10 and sd=0.5
# We have to provide 'n' as an argument to the function myFunction4()
myFunction4(4)
myFunction4(n = 4)
#
#
#
#### Selected Logical operators
#
#    <         less than
#    <=	       less than or equal to
#    >	       greater than
#    >=	       greater than or equal to
#    ==	       exactly equal to
#    !=	       not equal to
#    x | y	   x OR y
#    x & y	   x AND y
#    xor(x, y) evaluates to TRUE if exactly one of its two arguments
#                            is TRUE while the other one is FALSE
#    any()     TRUE if at least one element evaluates to TRUE
#    all()     TRUE if all elements evaluate to TRUE
#
# Comparison operators return a logical value
11 > 11   # FALSE
11 >= 11  # TRUE
8 == 9    # FALSE
8 != 9    # TRUE
# Combining logical values
(8 == 9) | TRUE     # TRUE: OR needs at least one TRUE
FALSE & TRUE        # FALSE: AND needs both to be TRUE
xor(8 > 7, 8 != 8)  # TRUE: exactly one of the two is TRUE
# Always spell out TRUE/FALSE: T and F are ordinary variables
# that can be overwritten, TRUE and FALSE cannot
any(TRUE, FALSE, FALSE)       # TRUE
all(TRUE, FALSE, TRUE, TRUE)  # FALSE
A <- -10:10
A
any(A >= 12)  # FALSE: the largest element is 10
all(A <= 7)   # FALSE: 8, 9 and 10 are larger than 7
#
## which() command is used to find indices of elements
#          satisfying given condition
which(A > 5)
which(A == 0)
# 
?length
# Counts and percentages using length(which())
# Count
length(which(A < 0))
# %
per.cent <- round(100 * length(which( A < 0))/length(A), 2)
paste("(A < 0) values make", per.cent, "% of the data.")
# sum() may also be used with T/F evaluation as an argument
sum(A !=0)
#
#
# remove everything
rm(list=ls())
#
#
# Reading data from a tab-separated text file
# (header = TRUE: the first line of the file holds the column names)
LungCapacity <- read.table("LungCapacity.txt", sep = "\t", header = TRUE)
#
str(LungCapacity)
#
# Write the LungCapacity data frame as a .csv file into the working directory
write.csv(LungCapacity, file = "Lung.csv", row.names = FALSE)
#
## Assignment 2
##  1) Check your Working directory: is the "Lung.csv" file there?
##  2) use "?write.csv" and find out what the argument "row.names = FALSE" means and does.
#
# Reading data from a .csv file
Lung2 <- read.csv("Lung.csv")
#
# As we won't need the Lung2 dataframe, we may delete it from the Global Environment
rm(Lung2)
#
#
### Basic operations with a dataframe, i.e. looking at data
summary(LungCapacity)
head(LungCapacity, 10)
dim(LungCapacity)
names(LungCapacity)
#
attach(LungCapacity) # Variables may be accessed directly, 
# without the LungCapacity$ 'prefix'
object.size(LungCapacity)
#
table(Gender) # LungCapacity is 'attached', no need to use LungCapacity$Gender
table(Smoke) # frequencies
table(Smoke)/length(Smoke) # proportions
#
myTable <- table(Gender, Smoke)
myTable
class(myTable)
str(myTable)
#
#
# Basic statistics
# (LungCapacity is attached, so its columns are referred to by name)
#
summary(LungCap)
mean(LungCap)
median(LungCap)
sd(LungCap) # standard deviation
var(LungCap) # variance
?cor
cor(LungCap, Height) # pair-wise correlation
# The next call is expected to stop with an error: cor() needs numeric data,
# and the data frame presumably also holds non-numeric columns (e.g. Smoke,
# which is compared with "no"/"yes" further below) - check with str(LungCapacity)
cor(LungCapacity) # ! May not be used for non-numeric variables
cor(LungCapacity[,1:3]) # Correlation matrix of the first three columns is returned
covMatrix <- cov(LungCapacity[,1:3]) # covariance matrix is returned
covMatrix
# Variances are on the diagonal of covMatrix
diag(covMatrix)
#
#
range(LungCap) # minimum and maximum in one vector
min(LungCap)
max(LungCap)
?quantile
# Selected quantiles: minimum, 2.5 %, quartiles, 97.5 % and maximum
quantile(LungCap, probs = c(0, 0.025, 0.25, 0.5, 0.75, 0.975, 1))
#
#
### Plots ###
#
#
### Bar plots ###
#
# One bar per observation of LungCap
barplot(LungCap)
# Same data with horizontal bars (TRUE spelled out: T can be overwritten)
barplot(LungCap, horiz = TRUE)
# Draw a frame around the current plot
box()
#
### Boxplots ###
#
boxplot(LungCap)
#
# Circle: outlier, Box-bottom: 25th percentile, Thick-line: median
# Box-top: 75th percentile
# "Whiskers": defined as a fraction of inter-quartile range
# See ?boxplot
#
# Next, we may observe how arguments to "boxplot" work.
boxplot(LungCap[Smoke == "no"], LungCap[Smoke == "yes"])
boxplot(LungCap ~ Smoke)
boxplot(LungCap ~ Smoke, main = "Two-Group Boxplot")
#
boxplot(LungCap ~ Smoke, main = "Two-Group Boxplot",
        ylab = "Lung Capacity")
#
boxplot(LungCap ~ Smoke, main = "Two-Group Boxplot",
        ylab = "Lung Capacity", las=1)
#
boxplot(LungCap ~ Smoke, main = "Two-Group Boxplot",
        ylab = "Lung Capacity", las=1, col = 3)
#
#
## Assignment 3
## Create a two-group boxplot,
## describing Height for each Gender.
#
### Histograms ###
#
hist(LungCap)
hist(LungCap, las = 1, col = 2)
hist(LungCap, las = 1, col = 2, border = 3)
#
hist(LungCap, las = 1, col = 2, border = 3,
     density = 10, angle = 60)
#
#
?density
### Density ###
plot(density(LungCap, na.rm = TRUE))
# Computes and shows density estimates
#
#
### Stacked barcharts ###
#
#
# Stacked barcharts are based on contingency tables
#
# Contingency table: Smoke categories in rows, Gender categories in columns
Table1 <- table(Smoke, Gender)
Table1
# Default: one stacked bar per column of the table
barplot(Table1)
# beside = TRUE: bars are drawn side by side instead of being stacked
barplot(Table1, beside = TRUE)
# legend.text = TRUE: the legend is built from the row names of the table
barplot(Table1, beside = TRUE, legend.text = TRUE)
# Custom legend labels, given in the order of the table rows
# (assumes the rows are ordered "no", "yes" - check the printed Table1)
barplot(Table1, beside = TRUE, legend.text = c("Non smoker", "Smoker"))
box()
#
#
### Mosaic plots ###
#
# Mosaic plots are used to display proportions for tables with 2 or more 
# conditional distributions
table(Smoke, Caesarean)
mosaicplot(Caesarean ~ Smoke, main="Smoking vs Caesarean", col = c(3,2) )
#
#
### Scatterplots ###
#
#
plot(LungCapacity) # Good data overview when the number of variables is small
#
cor(LungCapacity[ , 1:3]) 
# plot(x, y, ... )
plot(Age, Height)
# Plot(y ~ x) - y as a function of x gives the same result
plot(Height ~ Age)
plot(Age, Height, main = "Age-Height Scatterplot", pch = 8, col = 4)
abline(lm(Height~Age))
abline(lm(Height~Age), col = 2, lwd = 3)
#
# By creating any "new" graph, we remove the abline
#
plot(Age, Height, main = "Age-Height Scatterplot", pch = 4, col = 4)
abline(lm(Height~Age), col = 2, lwd = 2, lty = 3)
#
#
### A simple 3D graph ###
#
# Suppose a LRM has been estimated as y = 5 + 3x1 - 4x1^2 + 10x2 - 5x2^2
x1 <- seq(-300, 300, by = 10)
x2 <- x1
y <- outer(x1, x2, function(x1,x2) 5 + 3*x1 - 4*(x1^2) + 10*x2 - 5*(x2^2))
persp(x1,x2,y, theta = -30, phi=20)
persp(x1,x2,y, theta = 180, phi=20)
persp(x1,x2,y, theta = 40, phi=20)
image(x1, x2, y, main = "This looks like a heatmap")
#
# We may want to display multiple 2D or 3D graphs at once:
par(mfrow = c(2,2)) # Prepares a 2x2 'matrix' to accommodate 4 graphs, filled row by row
persp(x1,x2,y, theta = -30, phi=20)
persp(x1,x2,y, theta = 180, phi=20)
persp(x1,x2,y, theta = 40, phi=20)
image(x1, x2, y, main = "This looks like a heatmap")
#
# When we need to get back to a single graph per plotting window
par(mfrow = c(1,1))
persp(x1,x2,y, theta = 180, phi=20)
#
#
# Three simple ways to save graphs for subsequent use
#
# Export to pdf file
dev.copy(pdf, "myplot1.pdf") 
# saves the LAST graph to R Working directory (RWD)
# in pdf format.
dev.off() 
# Shuts down the 'pdf device'. Practically: you cannot open the pdf
# file, unless you run dev.off() or close R (RStudio).
#
# A more flexible way to save any graph:
#
pdf("myplot2.pdf")
plot(LungCapacity, main = "Data Overview")
dev.off()
#
# Export to Windows Metafile - i.e. for use in MS Word and MS PowerPoint
#win.metafile("myplot3.wmf")
#persp(x1,x2,y, theta = -30, phi=30)
#dev.off()
# Export to PNG image
png("myplot3.png")
persp(x1,x2,y, theta = -30, phi=30)
dev.off()
#
#
# To save graphs, you may use RStudio Menu on the Plot tab - interactive mode only.
#
#
# Additional options/graphs: http://www.cyclismo.org/tutorial/R/intermediatePlotting.html
# 
# For nice looking plots, you may install the "ggplot2" package
# install.packages("ggplot2")
# library("ggplot2")
# You may want to Google some ggplot2 tutorial  
#
#
#
#
#####  LRM with cross-sectional data  ##### 
#
#
# rm(list = ls()) only empties the Global Environment; a data frame that was
# attach()-ed stays on the search path. Detach LungCapacity (if attached) so
# that no stale copy of its columns remains visible in the regressions below.
if ("LungCapacity" %in% search()) {
  detach("LungCapacity", character.only = TRUE)
}
rm(list = ls())
# header = TRUE: the first line of the file holds the column names
lungCapacity <- read.table("LungCapacity.txt", header = TRUE, sep = "\t")
#
#
#
#
#### Dummy variables for use in LRMs
#
#
# In R, factors (male/female), ("USA"/"CAN"/"MEX") and logical values (TRUE/FALSE)
# may be used in many regressions without conversion to dummy variables,
# yet dummies (vectors with 1s or 0s) are necessary for many 'advanced'
# regression specifications.
#
# Let's use "Age" variable and make a dummy for persons aged 12 and more
#
# First, age is evaluated against the 12-year threshold and we obtain a vector
# containing logical TRUE/FALSE values.
#
head(lungCapacity$Age >= 12, 20)
#
# Logical values may be handled by the lm() function directly,
# however, sometimes we need a dummy variable (0,1), e.g.
# when using interaction elements.
# as.numeric() converts logical vectors, TRUE -> 1, FALSE -> 0
#
over12 <- as.numeric(lungCapacity$Age >= 12)
# We may want to append the new variable to the data frame:
lungCapacity <- cbind(lungCapacity, over12)
# 
# Many valid alternative ways can lead to the same over12 variable!!
#
#
#
#### Linear regression models
#
#
lrm1 <- lm(LungCap ~ Age, data=lungCapacity)
summary(lrm1)
#
lrm2 <- lm(LungCap ~ Age + Height, data=lungCapacity)
summary(lrm2)
#
lrm3 <- lm(LungCap ~ Age + Height + Smoke , data=lungCapacity)
summary(lrm3)
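# Smoke is a categorical variable (lm() treats it as a "factor"). Notice that
# it enters the LRM through dummies: for r levels, r-1 dummies are included.
# The first factor level - by default the first one in alphabetical order,
# here "no" - is chosen as the 'reference' category.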
# 
# 
# What happens if we remove the intercept from lrm3 ?
#
# lrm3 without the intercept:
lrm4 <- lm(LungCap ~ Age + Height + Smoke -1, data=lungCapacity)
summary(lrm4)
#
# Note that Intercept gets excluded using "- 1"
# 
summary(lrm3)$coefficients
summary(lrm4)$coefficients
# Compare & interpret the coefficients of lrm3 and lrm4.
#
# Re-level may be used for interpretation/presentation purposes:
# First define Smoke as factor
lungCapacity$Smoke <- factor(lungCapacity$Smoke)
lungCapacity$Smoke <- relevel(lungCapacity$Smoke, ref= "yes")
summary(lm(LungCap ~ Age + Height + Smoke , data=lungCapacity))$coefficients
#
#
# Sometimes, we want to start the regression analysis by including
# all regressors: "~ ." uses all the variables except the dependent
# variable as regressors.
# Note: What if data frame contains variables with factors such as 
#       individual's names, addresses, etc.?
#
lrm5 <- lm(LungCap ~ ., data = lungCapacity)
summary(lrm5)
#
#
# Simple model comparison (requires identical dependent variable)
lrm1
lrm3
# F-test for linear parameter restrictions: 
# H0: expanding lrm1 to lrm3 is not statistically
# significant
anova(lrm1, lrm3)
str(anova(lrm1, lrm3))
# 
# 
#### More LRM specification topics, interaction terms
#
# Logarithms log(x) and square roots sqrt(x)
# are easy to include into a LRM:
summary(lm(LungCap ~ Age + log(Height) + sqrt(Height), data = lungCapacity))
# 
# Due to R's formula evaluation properties, most
# regressor transformations need to be included using identity I() operator:
#
summary(lm(LungCap ~ Height^2 + 1/Age, data = lungCapacity))
# vs
summary(lm(LungCap ~ I(Height^2) + I(1/Age), data = lungCapacity))
# 
#### Polynomials
#
# Polynomials may be conveniently passed to lm() using the poly() function:
# For Height, Height^2, Height^3 and Height^4, we use 
# 
summary(lm(LungCap ~ poly(Height, degree=4, raw=TRUE), data = lungCapacity))
# Use ?poly() find out what "raw=TRUE" does.  
# 
#
#### Main effects and interaction terms
#
summary(lm(LungCap ~ Age*Height, data = lungCapacity))
#
# R solves the ' Age*Height' argument by including
# both 'main effects' and the 'interaction term'.
# NOTE: If the interaction term's coefficient is statistically
# significant, we should leave the main effects in our LRM
# regardless of their p-values. 
#
# Should we choose otherwise, we do so by using the I() operator.
# Here, we leave Height (main effect) out of the previous specification
summary(lm(LungCap ~ Age + I(Age*Height), data = lungCapacity))
#
#
rm(list=ls())
#
#### Wage2 example
#
# header = TRUE: the first line of wage2.csv holds the column names
wageData <- read.csv("wage2.csv", header = TRUE)
#
#
#
#### Accessing elements of the LRM output, model verification,
#### selected econometric tests
#
#
#
wage.fit <- lm(wage~educ+exper+tenure+married+black, 
               data = wageData[1:100, ])
summary(wage.fit)
#
str(wage.fit)
str(summary(wage.fit))
?lm
#
## Coefficients
#
wage.fit$coefficients
class(wage.fit$coefficients)
#
summary(wage.fit)$coefficients
class(summary(wage.fit)$coefficients)
dim(summary(wage.fit)$coefficients)
#
?coef
coef(wage.fit)
#
## Residuals
# Residuals may be accessed in different ways...
head(wage.fit$residuals)
head(summary(wage.fit)$residuals)
?resid
head(resid(wage.fit))
#
## Sigma - "s", the square root of the estimated variance of the random error,
## s^2 = RSS / (n-k-1), s = sqrt(s^2)
#
summary(wage.fit)$sigma
#
## Fitted values
#
head(wage.fit$fitted.values, 10) # stored directly in the lm object
# NOTE: the object returned by summary() for an lm fit has no "fitted.values"
# element, so the next line returns NULL. Run the commented command below
# to see which elements the summary object actually contains.
head(summary(wage.fit)$fitted.values)
# attributes(summary(wage.fit)) 
head(fitted(wage.fit)) # generic extractor function
#
## R-squared
#
summary(wage.fit)$r.squared
summary(wage.fit)$adj.r.squared
#
## F-statistic & its p-value
#
summary(wage.fit)
summary(wage.fit)$fstatistic
length(summary(wage.fit)$fstatistic)
summary(wage.fit)$fstatistic[1]
paste("The F-statistics is", summary(wage.fit)$fstatistic[1])
# 
# The $fstatistic element does not contain p-value for the F-test.
?pf # random numbers and distributions shall be discussed individually
# p-value calculated 'individually' for wage.fit:
pf(summary(wage.fit)$fstatistic[1],
   summary(wage.fit)$fstatistic[2],
   summary(wage.fit)$fstatistic[3], lower.tail = FALSE)
#
# A generally defined function, returns
# p-value for any saved lm() object that
# is passed as an argument
#
p.value.F.test <- function(model) {
  # Prints the overall F-test of a fitted linear model: the F statistic,
  # its degrees of freedom and the corresponding p-value.
  #
  # Args:
  #   model: a fitted model (e.g. from lm()) whose summary() contains an
  #          'fstatistic' element (value, numerator df, denominator df).
  #
  # Returns (invisibly): the last printed string, "p-value: <value>".
  #
  # Stops with an error if the summary holds no F-statistic, which is the
  # case for an intercept-only lm() fit.
  #
  # summary() is computed once and reused for all three elements
  fstat <- summary(model)$fstatistic
  if (is.null(fstat)) {
    stop("the model summary contains no F-statistic ",
         "(does the model include any regressors?)", call. = FALSE)
  }
  ft <- fstat[1]   # value of the F statistic
  print(paste("F-test value:", ft))
  df1 <- fstat[2]  # numerator degrees of freedom
  df2 <- fstat[3]  # denominator degrees of freedom
  print(paste("Degrees of freedom: (k) =", df1, "and (n-k-1) =", df2))
  # p-value = upper-tail probability of the F(df1, df2) distribution
  print(paste("p-value:", pf(ft, df1, df2, lower.tail = FALSE)))
}
p.value.F.test(wage.fit) 
#
## Covariance matrix for regression coefficients
#
vcov(wage.fit)
dim(vcov(wage.fit))
# The estimated coefficients' variances are on the diagonal
# line of the vcov matrix:
diag(vcov(wage.fit))
round(diag(vcov(wage.fit)),2)
# And the standard errors of the estimates may be calculated as:
round(sqrt(diag(vcov(wage.fit))),2)
#
#
## Confidence intervals for regression coefficients
#
confint(wage.fit)
#
#
#
#### Heteroskedasticity, autocorrelation, multicollinearity
#### Basic econometric tests (BP, BG, DW, vif)
#
# 
# Install the packages only when they are missing, so that re-running the
# script does not download and reinstall them every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car") # Companion to Applied Regression
}
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}
#
# help(package = car)
# help(package = lmtest)
#
# library() stops with an error when a package cannot be loaded, whereas
# require() would only return FALSE and let the tests below fail later on.
library("car")
library("lmtest")
#
#
## Breusch - Pagan Test (heteroskedasticity)
#  - from the {lmtest} package
#
bptest(wage.fit)
#
# NOTE: In general, autocorrelation tests are not relevant for cross-sectional data,
#       the tests below are used to illustrate the use of the tests in R.
# 
## Breusch - Godfrey Test (autocorrelation)
#  - from the {lmtest} package
#
bgtest(wage.fit)
bgtest(wage.fit, order = 1, data = wageData[1:100, ])
#
#
## Durbin - Watson Test
#  - from the lmtest package
#  dwt() is also included in the {car} package... 
#
dwtest(wage.fit)
#
#
#
#
## VIF
#
vif(wage.fit)
# 
# Interpretation example
# vif[Education] = 2 -> the variance of coeff[Education]
# is 2-times higher than it would be if Education were
# not correlated with other regressors.
#
#
#
##### Plot-based LRM diagnostics
#
par(mfrow = c(2,2))
plot(wage.fit)
par(mfrow = c(1,1))
#
#
## Residuals vs. Fitted
#  - may show non-constant variance and/or non-linearity in the model
plot(wage.fit, which=1) 
# no clear implication towards model change
#
#
## Normal Q-Q
#  - standardized empirical residuals' quantiles are compared against
#    their theoretical distribution (Normal with constant variance) 
plot(wage.fit, which=2) 
# Overall satisfactory, "strange" observations identified (row number).
#
#
## Scale-Location plot 
#  - shows the square root of the absolute standardized residuals against fitted values
#    (theoretically, all observations have the same variance)
#  - may show outliers and heteroskedasticity
plot(wage.fit, which=3) 
# Outliers shown, mild positive heteroskedasticity
#
#
## Cook's distances
#  - a normalized indicator of influence for individual observations:
#    How far the predicted values would move if the model were fit 
#    without the data point in question?
#  - Leverage (and Outliers) will be discussed separately.
plot(wage.fit, which=4)
#
#
## Residuals vs. Leverage
#  - Leverage (and Outliers) will be discussed separately.
plot(wage.fit, which=5)
#
#
#
## Cook's distance vs. Leverage
#  - Leverage (and Outliers) will be discussed separately.
plot(wage.fit, which=6)
#
#
#
#
#
##### Predictions from a LRM
#
?predict()
#
#
predict(wage.fit, wageData[101:110, ])
#
## Interval predictions
#
predict(wage.fit, wageData[101:110, ], interval = "prediction")
# we predict "individual" wages, coefficient estimation uncertainty
# and the effect of random errors is taken into account
# when calculating intervals
#
predict(wage.fit, wageData[101:110, ], interval = "confidence")
# Reflects the uncertainty (confidence interval) related to
# "average" predictions - as if we had many datasets to estimate
# the LRM and make predictions. Here, we ignore the variance of
# residuals.
# 
# "confidence" intervals are usually narrower than "prediction" intervals.
#
#
##### Mean squared error of prediction
#
# may be calculated using "Metrics" package,
# using the mse function:  mse(actual, predicted)
#
# Manual prediction evaluation, for wageData[101:110, ]
actual <- wageData[101:110, "wage"] # "wage" column of the wageData frame is selected
predicted <- predict(wage.fit, wageData[101:110, ])
difference <- actual - predicted # prediction errors
MSE <- mean(difference^2) # Mean squared error
MSE
RMSE <- sqrt(MSE) # Root mean squared error
RMSE
# Let's finish with an ad-hoc Actual/Predicted/Difference overview.
# cbind() builds the whole matrix in one call and names the columns after
# the vectors, so no empty placeholder matrix has to be created first.
summary.matrix <- cbind(actual, predicted, difference)
summary.matrix
#
#
#
# Quit R (and RStudio)
q()
