install.packages("np")

# Load the data
library(np) # library has the data set
data(cps71)
attach(cps71)
str(cps71)  # gives information about the variables
cps71       # print the data
plot(age, logwage)
# ------------------------------------------------------------------------




# ---- Fixing nonlinearity by using polynomila regression ----
plot(age, logwage, xlab="Age", ylab="log(wage)", main="Scatter plot and fitted line")
linear.model=lm(logwage~age)
lines(age,fitted(linear.model), lty=1, lwd=2, col=1)
plot(linear.model) # check the assumptions

plot(age, logwage, xlab="Age", ylab="log(wage)", main="Scatter plot and fitted line")
Quad.model <- lm(logwage ~ age + I(age^2))
lines(age,fitted(Quad.model), lty=1, lwd=2, col=1)
plot(Quad.model)

cubic.model <- lm(logwage ~ age + I(age^2)+ I(age^3))
plot(age, logwage, xlab="Age", ylab="log(wage)", main="Fixing nonlinearity by adding x^2 and x^3 to the model")
lines(age,fitted(cubic.model), lty=1, lwd=2, col=1)
plot(cubic.model)

Quart.model <- lm(logwage ~ age + I(age^2)+ I(age^3)+ I(age^4))
plot(age, logwage, xlab="Age", ylab="log(wage)", main="Fixing nonlinearity by adding x^2, x^3 and x^4 to the model")
lines(age,fitted(Quart.model), lty=1, lwd=2, col=1)
plot(Quart.model)

# --------------------------------------------------------------
##########         GO TO PRESENTATION SLIDES        ############
# --------------------------------------------------------------








### --------------------- WELCOME Back -------------------------
### ------------- First method: Kernel regression --------------
### ------------------------------------------------------------
library(np) # non parametric library
data(cps71)
attach(cps71)
# help(npreg)
Kernel.smooth <- npreg(logwage~age)
plot(Kernel.smooth, plot.errors.method="asymptotic", plot.errors.style="band"
     , ylim=c(11,15.2), main="Estimated unknown function using Kernel Regression and its 95% confidence band")
points(age,logwage)
summary(Kernel.smooth)

# --------------- Another R function can be used to estimate the ----
### ------------- the unknown function using kernel regression-------
library(stats) 
# help(ksmooth)
require(graphics)
# Effect of bandwidth on smoothing
plot(age, logwage)
lines(ksmooth(age, logwage, "normal", bandwidth =.5),  col = 1, lwd=2)
lines(ksmooth(age, logwage, "normal", bandwidth = 2),  col = 2, lwd=2)
lines(ksmooth(age, logwage, "normal", bandwidth = 5),  col = 3, lwd=2)
lines(ksmooth(age, logwage, "normal", bandwidth = 10), col = 4, lwd=2)
lines(ksmooth(age, logwage, "normal", bandwidth = 100),col = 14, lwd=3,lty=2)

legend(20,15, c("h=0.5", "h=2"),lwd =2, col = 1:2, bty="n")
legend(30,12, c("h=5", "h=10"),lwd =2, col = 3:4, bty="n")
legend(50,12, c("h=100"),lwd =2, col = 14, bty="n")

# -----------------------------------------------------------------------------------
# COMPARISION between Kernel smooth (npnparametric model) and polynomial regression second order (parametric model)
# in terms of standard error and the R2 of the parametric.
library(np) # non parametric library
data(cps71)
attach(cps71)
Kernel.smooth <- npreg(logwage~age)
plot(Kernel.smooth, plot.errors.method="asymptotic", plot.errors.style="band", ylim=c(11,15.2))
points(age,logwage)

# Fit Kernel regression
kernel.model.bw <- npregbw(logwage ~ age, regtype = "lc")
kernel.model <- npreg(kernel.model.bw)

# Comparision in terms of R2 and Residual Standard Error 
quad.model <- lm(logwage ~ age + I(age^2), x = TRUE, y = TRUE)
summary(quad.model)
summary(kernel.model)

# We observe that the nonparametric model performs much
# better (smaller regression standard error and larger R2).


# Comparison in terms of prediction
library(np) # non parametric library
data(cps71)
attach(cps71)

## Define n1 and n2
N <- nrow(cps71)
n1 <- 180     # Training data
n2 <- N - n1  # Evauating data
## Draw a sample w/o replacement from the indices 1 to n
## and use it to create the training data and evaluation data sets 
sampl <- sample(seq(1:N),replace=FALSE)
## Data.train uses 180 observations with random indices.
## Data.eval employs the remaining 25 observations.
data.train <- cps71[sampl[1:n1],]
data.eval <- cps71[sampl[(n1+1):N],]
## Compute the bandwidths using the training data set
bw <- npregbw(logwage~age,data=data.train)
## Calculate prediction mean square (pmse) for the evaluation data
kernel.model <- npreg(bws=bw,data=data.train)
pmse.kernel <- mean((data.eval$logwage-predict(kernel.model <- npreg(bws=bw,data=data.train)
                                               ,data=data.train,newdata=data.eval))^2)
## Similarly, do the same for the parametric model (quadratic model)
quad.model <- lm(logwage~age+I(age^2),data=data.train)
pmse.quad <- mean((data.eval$logwage-predict(quad.model,data=data.train,newdata=data.eval))^2)
print(pmse.kernel)
print(pmse.quad)
# ------------ Reapeat for 100 times -------
pmse.kernel <- c()
pmse.quadratic <- c()
for (i in 1:100) {
  sampl <- sample(seq(1:N), replace = FALSE)
  data.train <- cps71[sampl[1:n1], ]
  data.eval <- cps71[sampl[(n1 + 1):N], ]
  bw <- npregbw(logwage ~ age, data = data.train, regtype = "lc")
  mod.kernel <- npreg(bws = bw, data = data.train, regtype = "lc")
  mod.quad <- lm(logwage ~ age + I(age^2), data = data.train)
  pmse.kernel[i] <- mean((data.eval$logwage - predict(mod.kernel, data = data.train,newdata = data.eval))^2)
  pmse.quadratic[i] <- mean((data.eval$logwage - predict(mod.quad, data = data.train,newdata = data.eval))^2)
}
boxplot(pmse.kernel, pmse.quadratic, names = c("Kernel model", "quad. model"))
# t.test(pmse.kernel, pmse.quadratic, alternative = "two.sided")
# -------------------------------------------------------------
##########         GO TO PRESENTATION SLIDES        ###########
# -------------------------------------------------------------









### --------------------- WELCOME Back -------------------------
### ------------- Second method: Spline regression -------------
### ------------------------------------------------------------
################################################################
##########    Spline Smoothing for wage data
################################################################
# help(smooth.spline)
# Study the effect of lambda on smoothing
plot(smooth.spline(age,  logwage,  spar = .1), xlab="age", ylab="logwage",type="l", xlim=c(20,70), ylim=c(11.5, 14.5), lwd=2)
points(age, logwage)
lines(smooth.spline(age, logwage,  spar = .2), type="l", lwd=2, col=2)
lines(smooth.spline(age, logwage,  spar = .5), type="l", lwd=2, col=3)
lines(smooth.spline(age, logwage,  spar =  1), type="l", lwd=2, col=4)
lines(smooth.spline(age, logwage,  spar =  2), type="l", lwd=2, col=5)
legend(20,14.7, c("h=0.1", "h=.2"),lwd =2, col = 1:2, bty="n")
legend(30,12.1, c("h=.5", "h=1"),  lwd =2, col = 3:4, bty="n")
legend(50,12.1, c("h=2"),          lwd =2, col = 5, bty="n")

# Let R square fit the function using the optimal soothing value
plot(smooth.spline(age, logwage), col=2, type="l")
points(age, logwage)

spline=smooth.spline(age, logwage)
plot(spline, type="l", xlim=c(20,70), ylim=c(11, 15), lwd=2)
points(age, logwage)
spline

R2=cor(fitted(spline), logwage)
print(R2)
# ---------------------------------------------------------------
##########         GO TO PRESENTATION SLIDES        #############
# ---------------------------------------------------------------








### --------------------- WELCOME Back -------------------------
### ------------------------------------------------------------
################################################################
###########   More than one explanatory variable
################################################################
###################### 1. Kernel Regression  ###################
library(np)
data(wage1)
wage1
bw <- npregbw(lwage ~ factor(female) + factor(married) + educ+ 
                exper + tenure, regtype = "ll", bwmethod = "cv.ls", data = wage1,
              tol = 0.1, ftol = 0.1)
npplot(bw, plot.errors.method = "asymptotic", common.scale = FALSE)
summary(bw)
####################### 2. Smoothing Spline  ##################
library(car) # for data sets
data(Prestige)
attach(Prestige)
library(mgcv)
plot(income, prestige)
plot(education, prestige)

mod.gam <- gam(prestige ~ s(income) + s(education))
mod.gam
plot(mod.gam)
summary(mod.gam)