install.packages("np") # Load the data library(np) # library has the data set data(cps71) attach(cps71) str(cps71) # gives information about the variables cps71 # print the data plot(age, logwage) # ------------------------------------------------------------------------ # ---- Fixing nonlinearity by using polynomila regression ---- plot(age, logwage, xlab="Age", ylab="log(wage)", main="Scatter plot and fitted line") linear.model=lm(logwage~age) lines(age,fitted(linear.model), lty=1, lwd=2, col=1) plot(linear.model) # check the assumptions plot(age, logwage, xlab="Age", ylab="log(wage)", main="Scatter plot and fitted line") Quad.model <- lm(logwage ~ age + I(age^2)) lines(age,fitted(Quad.model), lty=1, lwd=2, col=1) plot(Quad.model) cubic.model <- lm(logwage ~ age + I(age^2)+ I(age^3)) plot(age, logwage, xlab="Age", ylab="log(wage)", main="Fixing nonlinearity by adding x^2 and x^3 to the model") lines(age,fitted(cubic.model), lty=1, lwd=2, col=1) plot(cubic.model) Quart.model <- lm(logwage ~ age + I(age^2)+ I(age^3)+ I(age^4)) plot(age, logwage, xlab="Age", ylab="log(wage)", main="Fixing nonlinearity by adding x^2, x^3 and x^4 to the model") lines(age,fitted(Quart.model), lty=1, lwd=2, col=1) plot(Quart.model) # -------------------------------------------------------------- ########## GO TO PRESENTATION SLIDES ############ # -------------------------------------------------------------- ### --------------------- WELCOME Back ------------------------- ### ------------- First method: Kernel regression -------------- ### ------------------------------------------------------------ library(np) # non parametric library data(cps71) attach(cps71) # help(npreg) Kernel.smooth <- npreg(logwage~age) plot(Kernel.smooth, plot.errors.method="asymptotic", plot.errors.style="band" , ylim=c(11,15.2), main="Estimated unknown function using Kernel Regression and its 95% confidence band") points(age,logwage) summary(Kernel.smooth) # --------------- Another R function can be used to estimate the ---- ### ------------- the unknown function using kernel regression------- library(stats) # help(ksmooth) require(graphics) # Effect of bandwidth on smoothing plot(age, logwage) lines(ksmooth(age, logwage, "normal", bandwidth =.5), col = 1, lwd=2) lines(ksmooth(age, logwage, "normal", bandwidth = 2), col = 2, lwd=2) lines(ksmooth(age, logwage, "normal", bandwidth = 5), col = 3, lwd=2) lines(ksmooth(age, logwage, "normal", bandwidth = 10), col = 4, lwd=2) lines(ksmooth(age, logwage, "normal", bandwidth = 100),col = 14, lwd=3,lty=2) legend(20,15, c("h=0.5", "h=2"),lwd =2, col = 1:2, bty="n") legend(30,12, c("h=5", "h=10"),lwd =2, col = 3:4, bty="n") legend(50,12, c("h=100"),lwd =2, col = 14, bty="n") # ----------------------------------------------------------------------------------- # COMPARISION between Kernel smooth (npnparametric model) and polynomial regression second order (parametric model) # in terms of standard error and the R2 of the parametric. library(np) # non parametric library data(cps71) attach(cps71) Kernel.smooth <- npreg(logwage~age) plot(Kernel.smooth, plot.errors.method="asymptotic", plot.errors.style="band", ylim=c(11,15.2)) points(age,logwage) # Fit Kernel regression kernel.model.bw <- npregbw(logwage ~ age, regtype = "lc") kernel.model <- npreg(kernel.model.bw) # Comparision in terms of R2 and Residual Standard Error quad.model <- lm(logwage ~ age + I(age^2), x = TRUE, y = TRUE) summary(quad.model) summary(kernel.model) # We observe that the nonparametric model performs much # better (smaller regression standard error and larger R2). # Comparison in terms of prediction library(np) # non parametric library data(cps71) attach(cps71) ## Define n1 and n2 N <- nrow(cps71) n1 <- 180 # Training data n2 <- N - n1 # Evauating data ## Draw a sample w/o replacement from the indices 1 to n ## and use it to create the training data and evaluation data sets sampl <- sample(seq(1:N),replace=FALSE) ## Data.train uses 180 observations with random indices. ## Data.eval employs the remaining 25 observations. data.train <- cps71[sampl[1:n1],] data.eval <- cps71[sampl[(n1+1):N],] ## Compute the bandwidths using the training data set bw <- npregbw(logwage~age,data=data.train) ## Calculate prediction mean square (pmse) for the evaluation data kernel.model <- npreg(bws=bw,data=data.train) pmse.kernel <- mean((data.eval$logwage-predict(kernel.model <- npreg(bws=bw,data=data.train) ,data=data.train,newdata=data.eval))^2) ## Similarly, do the same for the parametric model (quadratic model) quad.model <- lm(logwage~age+I(age^2),data=data.train) pmse.quad <- mean((data.eval$logwage-predict(quad.model,data=data.train,newdata=data.eval))^2) print(pmse.kernel) print(pmse.quad) # ------------ Reapeat for 100 times ------- pmse.kernel <- c() pmse.quadratic <- c() for (i in 1:100) { sampl <- sample(seq(1:N), replace = FALSE) data.train <- cps71[sampl[1:n1], ] data.eval <- cps71[sampl[(n1 + 1):N], ] bw <- npregbw(logwage ~ age, data = data.train, regtype = "lc") mod.kernel <- npreg(bws = bw, data = data.train, regtype = "lc") mod.quad <- lm(logwage ~ age + I(age^2), data = data.train) pmse.kernel[i] <- mean((data.eval$logwage - predict(mod.kernel, data = data.train,newdata = data.eval))^2) pmse.quadratic[i] <- mean((data.eval$logwage - predict(mod.quad, data = data.train,newdata = data.eval))^2) } boxplot(pmse.kernel, pmse.quadratic, names = c("Kernel model", "quad. model")) # t.test(pmse.kernel, pmse.quadratic, alternative = "two.sided") # ------------------------------------------------------------- ########## GO TO PRESENTATION SLIDES ########### # ------------------------------------------------------------- ### --------------------- WELCOME Back ------------------------- ### ------------- Second method: Spline regression ------------- ### ------------------------------------------------------------ ################################################################ ########## Spline Smoothing for wage data ################################################################ # help(smooth.spline) # Study the effect of lambda on smoothing plot(smooth.spline(age, logwage, spar = .1), xlab="age", ylab="logwage",type="l", xlim=c(20,70), ylim=c(11.5, 14.5), lwd=2) points(age, logwage) lines(smooth.spline(age, logwage, spar = .2), type="l", lwd=2, col=2) lines(smooth.spline(age, logwage, spar = .5), type="l", lwd=2, col=3) lines(smooth.spline(age, logwage, spar = 1), type="l", lwd=2, col=4) lines(smooth.spline(age, logwage, spar = 2), type="l", lwd=2, col=5) legend(20,14.7, c("h=0.1", "h=.2"),lwd =2, col = 1:2, bty="n") legend(30,12.1, c("h=.5", "h=1"), lwd =2, col = 3:4, bty="n") legend(50,12.1, c("h=2"), lwd =2, col = 5, bty="n") # Let R square fit the function using the optimal soothing value plot(smooth.spline(age, logwage), col=2, type="l") points(age, logwage) spline=smooth.spline(age, logwage) plot(spline, type="l", xlim=c(20,70), ylim=c(11, 15), lwd=2) points(age, logwage) spline R2=cor(fitted(spline), logwage) print(R2) # --------------------------------------------------------------- ########## GO TO PRESENTATION SLIDES ############# # --------------------------------------------------------------- ### --------------------- WELCOME Back ------------------------- ### ------------------------------------------------------------ ################################################################ ########### More than one explanatory variable ################################################################ ###################### 1. Kernel Regression ################### library(np) data(wage1) wage1 bw <- npregbw(lwage ~ factor(female) + factor(married) + educ+ exper + tenure, regtype = "ll", bwmethod = "cv.ls", data = wage1, tol = 0.1, ftol = 0.1) npplot(bw, plot.errors.method = "asymptotic", common.scale = FALSE) summary(bw) ####################### 2. Smoothing Spline ################## library(car) # for data sets data(Prestige) attach(Prestige) library(mgcv) plot(income, prestige) plot(education, prestige) mod.gam <- gam(prestige ~ s(income) + s(education)) mod.gam plot(mod.gam) summary(mod.gam)