Selection

Mikis Stasinopoulos
Bob Rigby
Fernanda De Bastiani

Introduction

  • step-wise selection;
  • boosting; and
  • modelling interactions in general

Selection

flowchart TB
  %% Models split into two branches: automatic selection (B) and declared selection (C).
  A[Models] --> B(automatic \n selection) 
  A --> C(declared\n selection)
  %% Automatic selection splits by how interactions are handled.
  B --> G[automatic \n interaction]
  B --> H[set up \n interaction]
  G --> D(NN, RT)
  H --> E(LASSO, \n Ridge, \n Elastic Net, \n PCR )
  %% Node S is used on the next two lines; its label is only given on the last line.
  S --> M[step-wise]
  S --> N[boost]
  C --> S(LM, AM )
Figure 1: Different methods of selecting features.

Stepwise Selection

  • current model

  • lower model

    • which could be the Null model
  • upper model

    • which could be the saturated model

Stepwise Selection (continued)

Table 1: A general algorithmic stepwise GAIC procedure for GAMLSS.
steps Lower Direction Current Direction Upper Creates Given
1 (\(\mu\)) \(L_{\mu}\) \(\leftarrow\) \(C_{\mu}\) \(\rightarrow\) \(U_{\mu}\) \(F_{\mu}^{(1)}\) \(C_{\sigma}, C_{\nu}, C_{\tau}\)
2 (\(\sigma\)) \(L_{\sigma}\) \(\leftarrow\) \(C_{\sigma}\) \(\rightarrow\) \(U_{\sigma}\) \(F_{\sigma}^{(2)}\) \(F_{\mu}^{(1)}, C_{\nu}, C_{\tau}\)
3 (\(\nu\)) \(L_{\nu}\) \(\leftarrow\) \(C_{\nu}\) \(\rightarrow\) \(U_{\nu}\) \(F_{\nu}^{(3)}\) \(F_{\mu}^{(1)},F_{\sigma}^{(2)}, C_{\tau}\)
4 (\(\tau\)) \(L_{\tau}\) \(\leftarrow\) \(C_{\tau}\) \(\rightarrow\) \(U_{\tau}\) \(F_{\tau}^{(4)}\) \(F_{\mu}^{(1)}, F_{\sigma}^{(2)}, F_{\nu}^{(3)}\)
5 (\(\nu\)) \(L_{\nu}\) \(\leftarrow\) \(F_{\nu}^{(3)}\) \(\rightarrow\) \(U_{\nu}\) \(F_{\nu}^{(5)}\) \(F_{\mu}^{(1)}, F_{\sigma}^{(2)}, F_{\tau}^{(4)}\)
6 (\(\sigma\)) \(L_{\sigma}\) \(\leftarrow\) \(F_{\sigma}^{(2)}\) \(\rightarrow\) \(U_{\sigma}\) \(F_{\sigma}^{(6)}\) \(F_{\mu}^{(1)}, F_{\nu}^{(5)}, F_{\tau}^{(4)}\)
7 (\(\mu\)) \(L_{\mu}\) \(\leftarrow\) \(F_{\mu}^{(1)}\) \(\rightarrow\) \(U_{\mu}\) \(F_{\mu}^{(7)}\) \(F_{\sigma}^{(6)}, F_{\nu}^{(5)}, F_{\tau}^{(4)}\)

Stepwise Selection (continued)

 # Starting model for the stepwise search: each of the four BCTo parameters
# gets the same six main-effect terms.
m1 <- gamlss(
  rent ~ area + yearc + location + bath + kitchen + cheating,
  sigma.formula = ~ area + yearc + location + bath + kitchen + cheating,
  nu.formula = ~ area + yearc + location + bath + kitchen + cheating,
  tau.formula = ~ area + yearc + location + bath + kitchen + cheating,
  family = BCTo,
  data = da,
  trace = TRUE,
  n.cyc = 20,
  c.crit = 0.01
)

# Stepwise search started from m1. The scope runs from the intercept-only
# formula up to cubic polynomials in area and yearc plus all main effects and
# two-way interactions of the six covariates.
# NOTE(review): k = log(3032) looks like a log(n) (BIC-type) penalty with the
# sample size typed in by hand -- confirm that 3032 matches nrow(da).
# NOTE(review): parallel = "snow" with ncpus = 10 is hard-coded; adjust it to
# the machine running this.
mfA <- stepGAICAll.A(
  m1,
  scope = list(
    lower = ~ 1,
    upper = ~ poly(area, 3) + poly(yearc, 3) +
      (area + yearc + location + bath + kitchen + cheating)^2
  ),
  trace = FALSE,
  parallel = "snow",
  ncpus = 10,
  k = log(3032),
  direction = rep("both", 7)
)

model, LM

\[ \begin{split} \texttt{mfA:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ &\mu \sim \texttt{poly(area,3)}+ \texttt{poly(yearc,3)} \\ & \qquad +\texttt{location}+ \texttt{bath}+\texttt{cheating}+ \texttt{bath}\\ \log\,&\sigma \sim \texttt{yearc}+\texttt{kitchen}+\texttt{yearc*kitchen}\\ & \qquad +\texttt{poly(yearc,3)} \\ & \nu \sim \texttt{yearc} + \texttt{kitchen} \\ \log\,&\tau \sim \texttt{yearc} + \texttt{cheating}. \\ \end{split} \]

model, additive smoothers

\[ \begin{split} \texttt{mfA1:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ &\mu \sim \texttt{pb(area)}+ \texttt{pb(yearc)} \\ & \qquad +\texttt{location}+ \texttt{bath}+\texttt{cheating}+ \texttt{bath}\\ \log\,&\sigma \sim \texttt{yearc}+\texttt{kitchen}+\texttt{yearc*kitchen}\\ & \qquad +\texttt{pb(yearc)} \\ & \nu \sim \texttt{yearc} + \texttt{kitchen} \\ \log\,&\tau \sim \texttt{yearc} + \texttt{cheating}. \\ \end{split} \]

Boosting

library(gamboostLSS)
# Boosted fit for the BCTo family. All four parameters (mu, sigma, nu, tau)
# are given the same right-hand side: bbs() terms for area and yearc plus the
# six covariates entered by name.
mfboost <- gamboostLSS(
  list(
    mu = rent ~ bbs(area) + bbs(yearc) +
      (area + yearc + location + kitchen + bath + cheating),
    sigma = rent ~ bbs(area) + bbs(yearc) +
      (area + yearc + location + kitchen + bath + cheating),
    nu = rent ~ bbs(area) + bbs(yearc) +
      (area + yearc + location + kitchen + bath + cheating),
    tau = rent ~ bbs(area) + bbs(yearc) +
      (area + yearc + location + kitchen + bath + cheating)
  ),
  data = da,
  families = as.families("BCTo"),
  # Start with 1000 boosting iterations.
  control = boost_control(mstop = 1000, center = TRUE),
  method = "noncyclic"
)

Boosting (continued)

# Cross-validate the boosted fit; the unprefixed lines below are console
# output captured with the code, not statements.
# NOTE(review): no seed is set before cvrisk(). If its default folds are drawn
# at random (presumably so -- confirm), the 979 shown below may not reproduce.
cvr <- cvrisk(mfboost)
Starting cross-validation...
# Stopping iteration reported for the cross-validation result.
mstop(cvr)
[1] 979
# Set the fitted model's stopping iteration to that value.
mstop(mfboost) <- mstop(cvr)

model

\[ \begin{split} \texttt{mfboost:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ &\mu \sim s(\texttt{area})+ s(\texttt{yearc}) +\texttt{location} \\ & \qquad +\texttt{bath}+\texttt{kitchen}+\texttt{cheating}\\ \log\,&\sigma \sim s(\texttt{area})+s(\texttt{yearc})+\texttt{location}\\ & \qquad +\texttt{bath}+ \texttt{cheating} \\ & \nu \sim s(\texttt{area})+ s(\texttt{yearc}) +\texttt{location} \\ & \qquad +\texttt{kitchen}+ \texttt{cheating} \\ \log\,&\tau \sim s(\texttt{yearc}). \\ \end{split} \]

LASSO

library(gamlss.ggplots)
library(gamlss.add)
library(rlang)
library(ggplot2)
source("~/Dropbox/GAMLSS-development/glmnet/sumplementary_functions_for_gnet.R")
# One gnet_path() call per distribution parameter of the LASSO fit.
# NOTE(review): neither gnet_path() nor mfLASSO is defined in the visible
# code. gnet_path() presumably comes from the script sourced just before these
# calls, and the statement that creates mfLASSO is not shown in this section.
# No parameter is named in the first call; presumably it defaults to mu --
# confirm against the gnet_path() definition.
gnet_path(mfLASSO)
gnet_path(mfLASSO, "sigma")
gnet_path(mfLASSO, "nu")
gnet_path(mfLASSO, "tau")

model

\[ \begin{split} \texttt{mfLASSO:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ &\mu \sim \texttt{poly(area,2)}+ \texttt{yearc}+\texttt{cheating} \\ & \qquad +\texttt{area:location}+\texttt{area:kitchen}+\texttt{yearc}^2:\texttt{bath}\\ & \qquad +\texttt{yearc}^2:\texttt{cheating}+\texttt{location:kitchen}\\ \log\,&\sigma \sim \texttt{area}^3+\texttt{yearc}^3+\texttt{location}\\ & \qquad +\texttt{cheating}+ \texttt{area:location}+ \texttt{area:cheating} \\ & \qquad +\texttt{area}^2:\texttt{yearc}^2+ \texttt{yearc:cheating}+ \texttt{location:bath} \\ & \nu \sim 1 \\ \log\,&\tau \sim 1. \\ \end{split} \]

PCR

source("~/Dropbox/GAMLSS-development/PCR/GAMLSS-pcr.R")
source("~/Dropbox/GAMLSS-development/PCR/fitPCR.R")
# Build X from quadratic polynomials in area and yearc plus all main effects
# and two-way interactions of the six covariates.
X <- formula2X(
  formula = rent ~ poly(area, 2) + poly(yearc, 2) +
    (area + yearc + location + bath + kitchen + cheating)^2,
  data = da
)

# Fit the BCTo model with the same pcr() term on X for every parameter.
mfPCR <- gamlss(
  rent ~ pcr(x = X),
  sigma.formula = ~ pcr(x = X),
  nu.formula = ~ pcr(x = X),
  tau.formula = ~ pcr(x = X),
  data = da,
  family = BCTo,
  bf.cyc = 1,
  c.crit = 0.1,
  n.cyc = 100,
  trace = TRUE
)

model

\[ \begin{split} \texttt{mfPCR:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ & \boldsymbol{\mu} = \textbf{T}_{\mu, 17} \boldsymbol{\gamma}_{\mu} \\ \log\,&\boldsymbol{\sigma} = \textbf{T}_{\sigma, 6} \boldsymbol{\gamma}_{\sigma} \\ & \boldsymbol{\nu} = \textbf{T}_{\nu, 6} \boldsymbol{\gamma}_{\nu} \\ \log\,&\boldsymbol{\tau} = \textbf{T}_{\tau, 1} \boldsymbol{\gamma}_{\tau} \\ \end{split} \]

Neural Network

source("~/Dropbox/github/gamlss-ggplots/R/data_stand.R")
# Rescale the data with the "0to1" option, declaring rent as the response.
da01 <- data_scale(da, response = rent, scale = "0to1")
library(gamlss.add)
# Fix the random seed before the nn() fits.
set.seed(213)
# One nn() term per BCTo parameter, each on the same six covariates, fitted to
# the rescaled data.
# NOTE(review): only the mu network sets size = 5; the other three rely on
# nn()'s default size -- confirm this difference is intended.
mfNN <- gamlss(
  rent ~ nn(
    ~ area + yearc + location + bath + kitchen + cheating,
    size = 5
  ),
  sigma.formula = ~ nn(~ area + yearc + location + bath + kitchen + cheating),
  nu.formula = ~ nn(~ area + yearc + location + bath + kitchen + cheating),
  tau.formula = ~ nn(~ area + yearc + location + bath + kitchen + cheating),
  family = BCTo,
  data = da01
)

Model

\[ \begin{split} \texttt{mfNN:} \qquad &\texttt{rent} \sim \text{BCTo}(\mu, \sigma, \nu, \tau ) \\ & \boldsymbol{\mu} = NN_{\mu}(\textbf{X}) \\ \log\,&\boldsymbol{\sigma} = NN_{\sigma}(\textbf{X}) \\ & \boldsymbol{\nu} = NN_{\nu}(\textbf{X}) \\ \log\,&\boldsymbol{\tau} = NN_{\tau}(\textbf{X}) \\ \end{split} \]

end

back to the index

The Books