# 1) LOAD DATA - SUMMARIZE DATA ph<-scan("yoghurt.txt") ph mean(ph) sd(ph) var(ph) # 2) HISTOGRAM hist(ph) help(hist) nclass.Sturges(ph) h<-0.1 bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h) hist(ph, breaks=bins) par(mfrow=c(2,2)) h<-0.01 bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h) hist(ph, breaks=bins) h<-0.1 bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h) hist(ph, breaks=bins) h<-0.2 bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h) hist(ph, breaks=bins) h<-0.3 bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h) hist(ph, breaks=bins) # 3) KERNEL DENSITY ESTIMATION help(density) density(ph) help(plot) plot(density(ph)) kernels <- eval(formals(density.default)$kernel) # show the kernels in the R parametrization plot (density(0, bw = 1), xlab = "", main="R's density() kernels with bw = 1") for(i in 2:length(kernels)) lines(density(0, bw = 1, kern = kernels[i]), col = i) legend(1.5,.4, legend = kernels, col = seq(kernels), lty = 1, cex = .8, y.int = 1) # same bandwidth (default - bw = "nrd0") different kernels par(mfrow=c(2,3)) plot(density(ph)) plot(density(ph, kernel="epanechnikov")) plot(density(ph, kernel="rectangular")) plot(density(ph, kernel="triangular")) plot(density(ph, kernel="biweight")) # same kernel (default - Normal) different bandwidth par(mfrow=c(2,2)) plot(density(ph)) plot(density(ph, bw=0.01)) plot(density(ph, bw=0.2)) plot(density(ph, bw=0.3)) # same kernel (default - Normal) different bandwidth in the same plot plot(density(ph)) lines(density(ph, bw=0.01)) plot(density(ph), ylim=c(0,3.2)) lines(density(ph, bw=0.01), col=2) # add title and legends # you can plot the points (inputs and outputs) of the density function ($x and $y) or # equivalently just type density(ph) inside plot plot(density(ph)$x, density(ph)$y, ylim=c(0,3.2), xlab="X", ylab="Density", type='l') title("Comparison of 2 density estimation with different width") lines(density(ph, bw=0.01), col=2) plot(density(ph), ylim=c(0,3.2), xlab="X", main="Comparison of 2 density estimation with different width") lines(density(ph, bw=0.01), col=2) plot(density(ph), ylim=c(0,3.2), xlab="X", main="Comparison of 2 density estimation with different width") lines(density(ph, bw=0.01), col=2) legend(4.7, 3.0, legend = c("h =0.07325", "h=0.01") , col = c(1,2), lty=1) # calculate bandwidth manually using the ideas in class summary(ph) iqr<-4.545-4.200 iqr/1.345 sd(ph) h<-1.059*sd(ph)*length(ph)^(-1/5) plot(density(ph, bw=h)) # Values of x we estimated the density, the estimated value of the density, the bandwidth we used, # the function we used in R and finally the name of the data we used to perform density estimation density(ph)$x density(ph)$y density(ph)$bw density(ph)$call density(ph)$data.name # 4) Non Parametric Regression - Nadaraya - Watson Estimator help("ksmooth") help("cars") ksmooth(cars$speed, cars$dist, "normal", bandwidth=2) with(cars, { plot(speed, dist) lines(ksmooth(speed, dist, "normal", bandwidth=2), col=2) lines(ksmooth(speed, dist, "normal", bandwidth=5), col=3) }) # find bandwidth manually using the ideas in class hist(cars$speed) sd(cars$speed) 1.059*sd(cars$speed)*length(cars$speed)^(-1/5) ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56) with(cars, { plot(speed, dist) lines(ksmooth(speed, dist, "normal", bandwidth=2.56), col=2) }) # scatter plot plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)") title(main = "cars data") # fit a simple linear regression fm1 <- lm(dist ~ speed, data = cars) help(lm) summary(fm1) # several diagnostics of the linear model plot(fm1) # install (have already done it) and load package car library(car) # plot regression line (simple linear regression) with data plot(cars$speed, cars$dist) regLine(lm(dist ~ speed, data = cars)) # or equivalently use plot(cars$speed, cars$dist) abline(fm1$coef[[1]], fm1$coef[[2]], col=2) # extract x.points used in ksmooth that are used to estimate the ksmooth function # as well as the coefficients of the linear model ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56) xx<-ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)$x fm1$coef length(xx) # for these x.points find predictions according to the linear model yy<-rep(1:100) for( i in 1:100) { yy[i]<-fm1$coef[[1]]+fm1$coef[[2]]*xx[i] } yy # and according to the smooth function ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)$y # plot both functions (linear and smooth) plot(cars$speed, cars$dist) regLine(lm(dist ~ speed, data = cars)) lines(ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56), col=3)