# 1) LOAD DATA - SUMMARIZE DATA

ph<-scan("yoghurt.txt")
ph
mean(ph)
sd(ph)
var(ph)

# 2) HISTOGRAM

hist(ph)
help(hist)
nclass.Sturges(ph)

h<-0.1
bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h)
hist(ph, breaks=bins)

par(mfrow=c(2,2))
h<-0.01
bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h)
hist(ph, breaks=bins)
h<-0.1
bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h)
hist(ph, breaks=bins)
h<-0.2
bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h)
hist(ph, breaks=bins)
h<-0.3
bins <- seq(min(ph)-h/2, max(ph)+h/2, by=h)
hist(ph, breaks=bins)

# 3) KERNEL DENSITY ESTIMATION

help(density)
density(ph)
help(plot)
plot(density(ph))


kernels <- eval(formals(density.default)$kernel)

# show the kernels in the R parametrization
plot (density(0, bw = 1), xlab = "",
      main="R's density() kernels with bw = 1")
for(i in 2:length(kernels))
   lines(density(0, bw = 1, kern =  kernels[i]), col = i)
legend(1.5,.4, legend = kernels, col = seq(kernels),
       lty = 1, cex = .8, y.int = 1)


# same bandwidth (default - bw = "nrd0") different kernels
par(mfrow=c(2,3))
plot(density(ph))
plot(density(ph, kernel="epanechnikov"))
plot(density(ph, kernel="rectangular"))
plot(density(ph, kernel="triangular"))
plot(density(ph, kernel="biweight"))

# same kernel (default - Normal) different bandwidth
par(mfrow=c(2,2))
plot(density(ph))
plot(density(ph, bw=0.01))
plot(density(ph, bw=0.2))
plot(density(ph, bw=0.3))

# same kernel (default - Normal) different bandwidth in the same plot
plot(density(ph))
lines(density(ph, bw=0.01))
plot(density(ph), ylim=c(0,3.2))
lines(density(ph, bw=0.01), col=2)

# add title and legends
# you can plot the points (inputs and outputs) of the density function ($x and $y) or 
# equivalently just type density(ph) inside plot
 
plot(density(ph)$x, density(ph)$y, ylim=c(0,3.2), xlab="X", ylab="Density", type='l')
title("Comparison of 2 density estimation with different width")
lines(density(ph, bw=0.01), col=2)

plot(density(ph), ylim=c(0,3.2), xlab="X", main="Comparison of 2 density estimation with different width")
lines(density(ph, bw=0.01), col=2)

plot(density(ph), ylim=c(0,3.2), xlab="X", main="Comparison of 2 density estimation with different width")
lines(density(ph, bw=0.01), col=2)
legend(4.7, 3.0, legend = c("h =0.07325", "h=0.01")  , col = c(1,2), lty=1)

# calculate bandwidth manually using the ideas in class
summary(ph)
iqr<-4.545-4.200
iqr/1.345
sd(ph)
h<-1.059*sd(ph)*length(ph)^(-1/5)
plot(density(ph, bw=h))

# Values of x we estimated the density, the estimated value of the density, the bandwidth we used, 
# the function we used in R and finally the name of the data we used to perform density estimation

density(ph)$x
density(ph)$y
density(ph)$bw
density(ph)$call
density(ph)$data.name


# 4) Non Parametric Regression - Nadaraya - Watson Estimator

help("ksmooth")
help("cars")
ksmooth(cars$speed, cars$dist, "normal", bandwidth=2)

with(cars, {
    plot(speed, dist)
    lines(ksmooth(speed, dist, "normal", bandwidth=2), col=2)
    lines(ksmooth(speed, dist, "normal", bandwidth=5), col=3)
})

# find bandwidth manually using the ideas in class
hist(cars$speed)
sd(cars$speed)
1.059*sd(cars$speed)*length(cars$speed)^(-1/5)
ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)
with(cars, {
    plot(speed, dist)
    lines(ksmooth(speed, dist, "normal", bandwidth=2.56), col=2)
})

# scatter plot
 plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)")
 title(main = "cars data")

# fit a simple linear regression
fm1 <- lm(dist ~ speed, data = cars)
help(lm)
summary(fm1)

# several diagnostics of the linear model
plot(fm1)


# install (have already done it) and load package car
library(car)

# plot regression line (simple linear regression) with data
plot(cars$speed, cars$dist)
regLine(lm(dist ~ speed, data = cars))

# or equivalently use
plot(cars$speed, cars$dist)
abline(fm1$coef[[1]], fm1$coef[[2]], col=2)

# extract x.points used in ksmooth that are used to estimate the ksmooth function
# as well as the coefficients of the linear model
ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)
xx<-ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)$x
fm1$coef
length(xx)

# for these x.points find predictions according to the linear model
yy<-rep(1:100)
for( i in 1:100)
{
yy[i]<-fm1$coef[[1]]+fm1$coef[[2]]*xx[i]
}
yy

# and according to the smooth function
ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56)$y

# plot both functions (linear and smooth)
plot(cars$speed, cars$dist)
regLine(lm(dist ~ speed, data = cars))
lines(ksmooth(cars$speed, cars$dist, "normal", bandwidth=2.56), col=3)