suppressWarnings(library(RODBC))
library(ISLR)
library(glmnet)
library(leaps)
library(pls)
library(boot)
library(splines)
library(tree)
library(randomForest)
library(gbm)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(corrplot)
library(knitr)
library(plyr)
library(Rmisc)
library(e1071)
library(robustbase)
library(lmtest)
library(sandwich)
library(VIF)
set.seed(1)


infant=read.csv("infant.csv",header=T,na.strings="?")
sum(is.na(infant))


#change the fitst column
fix(infant)
rownames(infant) = infant[,1]
infant = infant[,-1]
#dimention of the data
dim(infant)


head(infant)


#convert to categorical variables
infant$SEX <- factor(infant$SEX, levels = c(1,2),labels = c("Boy", "Girl"))
infant$MARITAL <- factor(infant$MARITAL, levels = c(1,2),labels = c("Married", "Unmarried"))
infant$DIABETES<- factor(infant$DIABETES, levels = c(0,1),labels = c("No", "Yes"))
infant$HYDRAM<- factor(infant$HYDRAM, levels = c(0,1),labels = c("No", "Yes"))
infant$HYPERCH<- factor(infant$HYPERCH, levels = c(0,1),labels = c("No", "Yes"))
infant$HYPERPR<- factor(infant$HYPERPR, levels = c(0,1),labels = c("No", "Yes"))
infant$ECLAMP<- factor(infant$ECLAMP, levels = c(0,1),labels = c("No", "Yes"))
infant$CERVIX<- factor(infant$CERVIX, levels = c(0,1),labels = c("No", "Yes"))
infant$PINFANT<- factor(infant$PINFANT, levels = c(0,1),labels = c("No", "Yes"))
infant$PRETERM<- factor(infant$PRETERM, levels = c(0,1),labels = c("No", "Yes"))
infant$UTERINE<- factor(infant$UTERINE, levels = c(0,1),labels = c("No", "Yes"))
infant$LOUTCOME<- factor(infant$LOUTCOME, levels=sort(unique(infant$LOUTCOME)))
summary(infant)

   SEX             MARITAL           FAGE           GAINED          VISITS     
 Boy :52160   Married  :70592   Min.   :14.00   Min.   : 0.00   Min.   : 0.00  
 Girl:49239   Unmarried:30807   1st Qu.:25.00   1st Qu.:21.00   1st Qu.:10.00  
                                Median :30.00   Median :30.00   Median :12.00  
                                Mean   :30.17   Mean   :30.28   Mean   :12.44  
                                3rd Qu.:35.00   3rd Qu.:39.00   3rd Qu.:15.00  
                                Max.   :74.00   Max.   :98.00   Max.   :49.00  
      MAGE           FEDUC           MEDUC           TOTALP      
 Min.   :11.00   Min.   : 0.00   Min.   : 0.00   Min.   : 1.000  
 1st Qu.:23.00   1st Qu.:12.00   1st Qu.:12.00   1st Qu.: 1.000  
 Median :28.00   Median :12.00   Median :13.00   Median : 2.000  
 Mean   :27.74   Mean   :12.93   Mean   :13.26   Mean   : 2.378  
 3rd Qu.:32.00   3rd Qu.:16.00   3rd Qu.:16.00   3rd Qu.: 3.000  
 Max.   :53.00   Max.   :17.00   Max.   :17.00   Max.   :20.000  
     BDEAD           TERMS         LOUTCOME      WEEKS           CIGNUM      
 Min.   :0.000   Min.   : 0.0000   1:53133   Min.   :18.00   Min.   : 0.000  
 1st Qu.:0.000   1st Qu.: 0.0000   2:13750   1st Qu.:38.00   1st Qu.: 0.000  
 Median :0.000   Median : 0.0000   9:34516   Median :39.00   Median : 0.000  
 Mean   :0.015   Mean   : 0.3716             Mean   :38.74   Mean   : 0.726  
 3rd Qu.:0.000   3rd Qu.: 0.0000             3rd Qu.:40.00   3rd Qu.: 0.000  
 Max.   :6.000   Max.   :16.0000             Max.   :45.00   Max.   :60.000  
    DRINKNUM        DIABETES    HYDRAM      HYPERCH      HYPERPR    
 Min.   : 0.00000   No :98046   No :99899   No :100003   No :96325  
 1st Qu.: 0.00000   Yes: 3353   Yes: 1500   Yes:  1396   Yes: 5074  
 Median : 0.00000                                                   
 Mean   : 0.00555                                                   
 3rd Qu.: 0.00000                                                   
 Max.   :40.00000                                                   
 ECLAMP       CERVIX       PINFANT      PRETERM      UTERINE     
 No :101013   No :101020   No :100823   No :100501   No :101050  
 Yes:   386   Yes:   379   Yes:   576   Yes:   898   Yes:   349  
                                                                 
                                                                 
                                                                 
                                                                 
    BWEIGHT       
 Min.   : 0.1875  
 1st Qu.: 6.6250  
 Median : 7.3750  
 Mean   : 7.2581  
 3rd Qu.: 8.0625  
 Max.   :13.0625


#distribution of response variable (BWEIGHT)
library(repr)
options(repr.plot.width=4, repr.plot.height=4)
ggplot(data=infant, aes(x=BWEIGHT)) +
  geom_histogram(fill="blue", binwidth = 1) +
  scale_x_continuous(breaks= seq(0, 15, by=1))


#Correlation between response variable and all numeric varibales, set threshold to 0.1
options(repr.plot.width=5, repr.plot.height=5)
numericVars <- which(sapply(infant, is.numeric)) #index vector numeric variables
numericVarNames <- names(numericVars) #saving names vector for use later on
cat('There are', length(numericVars), 'numeric variables')
infant_numVar <- infant[, numericVars]
cor_numVar <- cor(infant_numVar, use="pairwise.complete.obs") #correlations of infant numeric variables
#sort on decreasing correlations with infant birth weight
cor_sorted <- as.matrix(sort(cor_numVar[,'BWEIGHT'], decreasing = TRUE))
#select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.1)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")

There are 13 numeric variables


#Plot WEEKS,GAINED,VISITS vs BWEIGHT
q1<-ggplot(data=infant, aes(x=WEEKS, y=BWEIGHT))+
  geom_point(color= "steelblue") + geom_smooth(method = "lm", se=FALSE, color = "tomato", aes(group=5))

q2<-ggplot(data=infant, aes(x=GAINED, y=BWEIGHT))+
  geom_point(color= "steelblue") + geom_smooth(method = "lm", se=FALSE, color = "tomato", aes(group=5))

q3<-ggplot(data=infant, aes(x=VISITS, y=BWEIGHT))+
  geom_point(color= "steelblue") + geom_smooth(method = "lm", se=FALSE, color = "tomato", aes(group=5))
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(q1,q2,q3, layout=layout)

`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'


#Infant birth weight between boys and girls amount same weeks
plotdata<-ddply(infant,c("WEEKS","SEX"),summarise,Median_infant_birth_weight=median(BWEIGHT))
ggplot(plotdata,aes(x = WEEKS, y = Median_infant_birth_weight, color = SEX)) +
  geom_line() + xlab("Weeks") + ylab("Median infant birth weight")
#boys are heavier than girls


#Numeric variables histogram plot
options(repr.plot.width=5, repr.plot.height=5)
l1<-ggplot(infant, aes(x=FAGE))+
  geom_histogram(color="darkblue", fill="lightblue")
l2<-ggplot(infant, aes(x=MAGE))+
  geom_histogram(color="darkblue", fill="lightblue")
l3<-ggplot(infant, aes(x=FEDUC))+
  geom_histogram(color="darkblue", fill="lightblue")
l4<-ggplot(infant, aes(x=MEDUC))+
  geom_histogram(color="darkblue", fill="lightblue")
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(l1,l2,l3,l4, layout=layout)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


options(repr.plot.width=8, repr.plot.height=8)
l5<-ggplot(infant, aes(x=VISITS))+
  geom_histogram(color="darkblue", fill="lightblue")
l6<-ggplot(infant, aes(x=GAINED))+
  geom_histogram(color="darkblue", fill="lightblue")
l7<-ggplot(infant, aes(x=TOTALP))+
  geom_histogram(color="darkblue", fill="lightblue")
l8<-ggplot(infant, aes(x=BDEAD))+
  geom_histogram(color="darkblue", fill="lightblue")
l9<-ggplot(infant, aes(x=TERMS))+
  geom_histogram(color="darkblue", fill="lightblue")
l10<-ggplot(infant, aes(x=WEEKS))+
  geom_histogram(color="darkblue", fill="lightblue")
l11<-ggplot(infant, aes(x=CIGNUM))+
  geom_histogram(color="darkblue", fill="lightblue")
l12<-ggplot(infant, aes(x=DRINKNUM))+
  geom_histogram(color="darkblue", fill="lightblue")
layout <- matrix(c(1,2,3,4,5,6,7,8),4,2,byrow=TRUE)
multiplot(l5,l6,l7,l8,l9,l10,l11,l12, layout=layout)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


#Explore categorical variables with infant birth weight
p1<-ggplot(infant, aes(x=MARITAL, y=BWEIGHT, color=MARITAL)) +
  geom_boxplot()
p2<-ggplot(infant, aes(x=DIABETES, y=BWEIGHT, color=DIABETES)) +
  geom_boxplot()
p3<-ggplot(infant, aes(x=PINFANT, y=BWEIGHT, color=PINFANT)) +
  geom_boxplot()
p4<-ggplot(infant, aes(x=PRETERM, y=BWEIGHT, color=PRETERM)) +
  geom_boxplot()
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(p1,p2,p3,p4, layout=layout)


p5<-ggplot(infant, aes(x=LOUTCOME, y=BWEIGHT, color=LOUTCOME)) +
  geom_boxplot()
p6<-ggplot(infant, aes(x=HYDRAM, y=BWEIGHT, color=HYDRAM)) +
  geom_boxplot()
p7<-ggplot(infant, aes(x=HYPERCH, y=BWEIGHT, color=HYPERCH)) +
  geom_boxplot()
p8<-ggplot(infant, aes(x=HYPERPR, y=BWEIGHT, color=HYPERPR)) +
  geom_boxplot()
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(p5,p6,p7,p8, layout=layout)


p9<-ggplot(infant, aes(x=ECLAMP, y=BWEIGHT, color=ECLAMP)) +
  geom_boxplot()
p10<-ggplot(infant, aes(x=CERVIX, y=BWEIGHT, color=CERVIX)) +
  geom_boxplot()
p11<-ggplot(infant, aes(x=UTERINE, y=BWEIGHT, color=UTERINE )) +
  geom_boxplot()
layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)
multiplot(p9,p10,p11, layout=layout)


#weight type and weeks
options(repr.plot.width=8, repr.plot.height=6)
weight_range_data<- infant %>%
  mutate(weight_type = case_when(
    BWEIGHT< 5.5 ~ "low birth weight",
    BWEIGHT > 8.8 ~ "overweight",
    TRUE ~ "normal birth weight"
  ))
#histogram for 3 birth weight types
t1<-ggplot(weight_range_data, aes(x=weight_type))+
  geom_bar(color="darkblue", fill="lightblue")
t2<-ggplot(weight_range_data, aes(x = weight_type, fill = SEX)) + 
  geom_bar(position = "dodge")
t3<-ggplot(weight_range_data, aes(x = weight_type,fill = HYPERPR)) + 
  geom_bar(position = "dodge")
t4<-ggplot(weight_range_data, aes(x=weight_type, y=WEEKS, color=weight_type)) +
  geom_boxplot()
t5<-ggplot(weight_range_data, aes(x=weight_type, y=GAINED, color=weight_type)) +
  geom_boxplot()
t6<-ggplot(weight_range_data, aes(x=weight_type, y=VISITS, color=weight_type)) +
  geom_boxplot()
layout <- matrix(c(1,2,3,4,5,6),3,2,byrow=TRUE)
multiplot(t1,t2,t3,t4,t5,t6, layout=layout)


#Linear model
par(mfrow=c(2,2))
lm.fit=lm(BWEIGHT~.,data=infant)
summary(lm.fit)
plot(lm.fit)

Call:
lm(formula = BWEIGHT ~ ., data = infant)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.5478 -0.6696  0.0001  0.6658  5.7927 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -4.4419204  0.0588512 -75.477  < 2e-16 ***
SEXGirl          -0.2655177  0.0065758 -40.378  < 2e-16 ***
MARITALUnmarried -0.1716246  0.0082035 -20.921  < 2e-16 ***
FAGE             -0.0008530  0.0007367  -1.158 0.246930    
GAINED            0.0130282  0.0002484  52.450  < 2e-16 ***
VISITS            0.0107707  0.0009277  11.610  < 2e-16 ***
MAGE              0.0111721  0.0009309  12.002  < 2e-16 ***
FEDUC            -0.0063596  0.0017052  -3.730 0.000192 ***
MEDUC             0.0033036  0.0017729   1.863 0.062417 .  
TOTALP            0.0379519  0.0042667   8.895  < 2e-16 ***
BDEAD            -0.1437499  0.0240176  -5.985 2.17e-09 ***
TERMS            -0.0659972  0.0070287  -9.390  < 2e-16 ***
LOUTCOME2        -0.0452999  0.0125396  -3.613 0.000303 ***
LOUTCOME9        -0.1596704  0.0098804 -16.160  < 2e-16 ***
WEEKS             0.2877879  0.0013566 212.137  < 2e-16 ***
CIGNUM           -0.0326816  0.0011209 -29.157  < 2e-16 ***
DRINKNUM         -0.0018217  0.0158210  -0.115 0.908333    
DIABETESYes       0.2759147  0.0186167  14.821  < 2e-16 ***
HYDRAMYes        -0.3413281  0.0272522 -12.525  < 2e-16 ***
HYPERCHYes       -0.2186825  0.0284270  -7.693 1.45e-14 ***
HYPERPRYes       -0.3439736  0.0152971 -22.486  < 2e-16 ***
ECLAMPYes        -0.6501162  0.0536382 -12.120  < 2e-16 ***
CERVIXYes        -0.4365996  0.0542580  -8.047 8.59e-16 ***
PINFANTYes        0.9330495  0.0438495  21.278  < 2e-16 ***
PRETERMYes       -0.4623945  0.0354035 -13.061  < 2e-16 ***
UTERINEYes       -0.3124886  0.0561667  -5.564 2.65e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.046 on 101373 degrees of freedom
Multiple R-squared:  0.3815,	Adjusted R-squared:  0.3813 
F-statistic:  2501 on 25 and 101373 DF,  p-value: < 2.2e-16


#check heteroscedasticity
lmtest::bptest(lm.fit)  # Breusch-Pagan test

	studentized Breusch-Pagan test

data:  lm.fit
BP = 1208.6, df = 25, p-value < 2.2e-16


#fix heteroscedasticity
coeftest(lm.fit, vcov = vcovHC(lm.fit, type="HC1"))

t test of coefficients:

                    Estimate  Std. Error  t value  Pr(>|t|)    
(Intercept)      -4.44192036  0.07067827 -62.8470 < 2.2e-16 ***
SEXGirl          -0.26551768  0.00656804 -40.4257 < 2.2e-16 ***
MARITALUnmarried -0.17162459  0.00826256 -20.7714 < 2.2e-16 ***
FAGE             -0.00085303  0.00074915  -1.1387 0.2548508    
GAINED            0.01302818  0.00026333  49.4743 < 2.2e-16 ***
VISITS            0.01077068  0.00100405  10.7272 < 2.2e-16 ***
MAGE              0.01117207  0.00095624  11.6833 < 2.2e-16 ***
FEDUC            -0.00635964  0.00170921  -3.7208 0.0001987 ***
MEDUC             0.00330358  0.00181812   1.8170 0.0692154 .  
TOTALP            0.03795190  0.00455564   8.3308 < 2.2e-16 ***
BDEAD            -0.14374987  0.02739267  -5.2477 1.543e-07 ***
TERMS            -0.06599722  0.00741603  -8.8993 < 2.2e-16 ***
LOUTCOME2        -0.04529993  0.01289348  -3.5134 0.0004426 ***
LOUTCOME9        -0.15967045  0.01000096 -15.9655 < 2.2e-16 ***
WEEKS             0.28778792  0.00169589 169.6968 < 2.2e-16 ***
CIGNUM           -0.03268163  0.00117058 -27.9191 < 2.2e-16 ***
DRINKNUM         -0.00182165  0.01925654  -0.0946 0.9246334    
DIABETESYes       0.27591466  0.02160766  12.7693 < 2.2e-16 ***
HYDRAMYes        -0.34132814  0.03058735 -11.1591 < 2.2e-16 ***
HYPERCHYes       -0.21868250  0.03273970  -6.6794 2.411e-11 ***
HYPERPRYes       -0.34397364  0.01750205 -19.6533 < 2.2e-16 ***
ECLAMPYes        -0.65011622  0.06713452  -9.6838 < 2.2e-16 ***
CERVIXYes        -0.43659965  0.05663396  -7.7091 1.278e-14 ***
PINFANTYes        0.93304953  0.04942341  18.8787 < 2.2e-16 ***
PRETERMYes       -0.46239454  0.03642231 -12.6954 < 2.2e-16 ***
UTERINEYes       -0.31248858  0.05879647  -5.3148 1.070e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1


skewness(infant$BWEIGHT)


attach(infant)
train=sample(1:nrow(infant),nrow(infant)*0.8)
lm.fit=lm(BWEIGHT~.,data=infant,subset=train)
#test MSE using linear regression
mean((BWEIGHT-predict(lm.fit,infant))[-train]^2)


#####################################################
#Best Subset,Forward and Backward Stepwise Selection 
#####################################################
regfit.full=regsubsets(BWEIGHT~.,data=infant) #best subset selection
summary(regfit.full)
regfit.fwd=regsubsets(BWEIGHT~.,data=infant,nvmax=8,method="forward")
summary(regfit.fwd)
regfit.bwd=regsubsets(BWEIGHT~.,data=infant,nvmax=8,method="backward")
summary(regfit.bwd)

Subset selection object
Call: regsubsets.formula(BWEIGHT ~ ., data = infant)
25 Variables  (and intercept)
                 Forced in Forced out
SEXGirl              FALSE      FALSE
MARITALUnmarried     FALSE      FALSE
FAGE                 FALSE      FALSE
GAINED               FALSE      FALSE
VISITS               FALSE      FALSE
MAGE                 FALSE      FALSE
FEDUC                FALSE      FALSE
MEDUC                FALSE      FALSE
TOTALP               FALSE      FALSE
BDEAD                FALSE      FALSE
TERMS                FALSE      FALSE
LOUTCOME2            FALSE      FALSE
LOUTCOME9            FALSE      FALSE
WEEKS                FALSE      FALSE
CIGNUM               FALSE      FALSE
DRINKNUM             FALSE      FALSE
DIABETESYes          FALSE      FALSE
HYDRAMYes            FALSE      FALSE
HYPERCHYes           FALSE      FALSE
HYPERPRYes           FALSE      FALSE
ECLAMPYes            FALSE      FALSE
CERVIXYes            FALSE      FALSE
PINFANTYes           FALSE      FALSE
PRETERMYes           FALSE      FALSE
UTERINEYes           FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
         SEXGirl MARITALUnmarried FAGE GAINED VISITS MAGE FEDUC MEDUC TOTALP
1  ( 1 ) " "     " "              " "  " "    " "    " "  " "   " "   " "   
2  ( 1 ) " "     " "              " "  "*"    " "    " "  " "   " "   " "   
3  ( 1 ) " "     " "              " "  "*"    " "    "*"  " "   " "   " "   
4  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
5  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
6  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
7  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
8  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
         BDEAD TERMS LOUTCOME2 LOUTCOME9 WEEKS CIGNUM DRINKNUM DIABETESYes
1  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
2  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
3  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
4  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
5  ( 1 ) " "   " "   " "       " "       "*"   "*"    " "      " "        
6  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
7  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
8  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
         HYDRAMYes HYPERCHYes HYPERPRYes ECLAMPYes CERVIXYes PINFANTYes
1  ( 1 ) " "       " "        " "        " "       " "       " "       
2  ( 1 ) " "       " "        " "        " "       " "       " "       
3  ( 1 ) " "       " "        " "        " "       " "       " "       
4  ( 1 ) " "       " "        " "        " "       " "       " "       
5  ( 1 ) " "       " "        " "        " "       " "       " "       
6  ( 1 ) " "       " "        " "        " "       " "       " "       
7  ( 1 ) " "       " "        "*"        " "       " "       " "       
8  ( 1 ) " "       " "        "*"        " "       " "       "*"       
         PRETERMYes UTERINEYes
1  ( 1 ) " "        " "       
2  ( 1 ) " "        " "       
3  ( 1 ) " "        " "       
4  ( 1 ) " "        " "       
5  ( 1 ) " "        " "       
6  ( 1 ) " "        " "       
7  ( 1 ) " "        " "       
8  ( 1 ) " "        " "

Subset selection object
Call: regsubsets.formula(BWEIGHT ~ ., data = infant, nvmax = 8, method = "forward")
25 Variables  (and intercept)
                 Forced in Forced out
SEXGirl              FALSE      FALSE
MARITALUnmarried     FALSE      FALSE
FAGE                 FALSE      FALSE
GAINED               FALSE      FALSE
VISITS               FALSE      FALSE
MAGE                 FALSE      FALSE
FEDUC                FALSE      FALSE
MEDUC                FALSE      FALSE
TOTALP               FALSE      FALSE
BDEAD                FALSE      FALSE
TERMS                FALSE      FALSE
LOUTCOME2            FALSE      FALSE
LOUTCOME9            FALSE      FALSE
WEEKS                FALSE      FALSE
CIGNUM               FALSE      FALSE
DRINKNUM             FALSE      FALSE
DIABETESYes          FALSE      FALSE
HYDRAMYes            FALSE      FALSE
HYPERCHYes           FALSE      FALSE
HYPERPRYes           FALSE      FALSE
ECLAMPYes            FALSE      FALSE
CERVIXYes            FALSE      FALSE
PINFANTYes           FALSE      FALSE
PRETERMYes           FALSE      FALSE
UTERINEYes           FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: forward
         SEXGirl MARITALUnmarried FAGE GAINED VISITS MAGE FEDUC MEDUC TOTALP
1  ( 1 ) " "     " "              " "  " "    " "    " "  " "   " "   " "   
2  ( 1 ) " "     " "              " "  "*"    " "    " "  " "   " "   " "   
3  ( 1 ) " "     " "              " "  "*"    " "    "*"  " "   " "   " "   
4  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
5  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
6  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
7  ( 1 ) "*"     " "              " "  "*"    " "    "*"  " "   " "   " "   
8  ( 1 ) "*"     "*"              " "  "*"    " "    "*"  " "   " "   " "   
         BDEAD TERMS LOUTCOME2 LOUTCOME9 WEEKS CIGNUM DRINKNUM DIABETESYes
1  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
2  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
3  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
4  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
5  ( 1 ) " "   " "   " "       " "       "*"   "*"    " "      " "        
6  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
7  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
8  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
         HYDRAMYes HYPERCHYes HYPERPRYes ECLAMPYes CERVIXYes PINFANTYes
1  ( 1 ) " "       " "        " "        " "       " "       " "       
2  ( 1 ) " "       " "        " "        " "       " "       " "       
3  ( 1 ) " "       " "        " "        " "       " "       " "       
4  ( 1 ) " "       " "        " "        " "       " "       " "       
5  ( 1 ) " "       " "        " "        " "       " "       " "       
6  ( 1 ) " "       " "        " "        " "       " "       " "       
7  ( 1 ) " "       " "        "*"        " "       " "       " "       
8  ( 1 ) " "       " "        "*"        " "       " "       " "       
         PRETERMYes UTERINEYes
1  ( 1 ) " "        " "       
2  ( 1 ) " "        " "       
3  ( 1 ) " "        " "       
4  ( 1 ) " "        " "       
5  ( 1 ) " "        " "       
6  ( 1 ) " "        " "       
7  ( 1 ) " "        " "       
8  ( 1 ) " "        " "

Subset selection object
Call: regsubsets.formula(BWEIGHT ~ ., data = infant, nvmax = 8, method = "backward")
25 Variables  (and intercept)
                 Forced in Forced out
SEXGirl              FALSE      FALSE
MARITALUnmarried     FALSE      FALSE
FAGE                 FALSE      FALSE
GAINED               FALSE      FALSE
VISITS               FALSE      FALSE
MAGE                 FALSE      FALSE
FEDUC                FALSE      FALSE
MEDUC                FALSE      FALSE
TOTALP               FALSE      FALSE
BDEAD                FALSE      FALSE
TERMS                FALSE      FALSE
LOUTCOME2            FALSE      FALSE
LOUTCOME9            FALSE      FALSE
WEEKS                FALSE      FALSE
CIGNUM               FALSE      FALSE
DRINKNUM             FALSE      FALSE
DIABETESYes          FALSE      FALSE
HYDRAMYes            FALSE      FALSE
HYPERCHYes           FALSE      FALSE
HYPERPRYes           FALSE      FALSE
ECLAMPYes            FALSE      FALSE
CERVIXYes            FALSE      FALSE
PINFANTYes           FALSE      FALSE
PRETERMYes           FALSE      FALSE
UTERINEYes           FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: backward
         SEXGirl MARITALUnmarried FAGE GAINED VISITS MAGE FEDUC MEDUC TOTALP
1  ( 1 ) " "     " "              " "  " "    " "    " "  " "   " "   " "   
2  ( 1 ) " "     " "              " "  "*"    " "    " "  " "   " "   " "   
3  ( 1 ) "*"     " "              " "  "*"    " "    " "  " "   " "   " "   
4  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
5  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
6  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
7  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
8  ( 1 ) "*"     "*"              " "  "*"    " "    " "  " "   " "   " "   
         BDEAD TERMS LOUTCOME2 LOUTCOME9 WEEKS CIGNUM DRINKNUM DIABETESYes
1  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
2  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
3  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
4  ( 1 ) " "   " "   " "       " "       "*"   " "    " "      " "        
5  ( 1 ) " "   " "   " "       "*"       "*"   " "    " "      " "        
6  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
7  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
8  ( 1 ) " "   " "   " "       "*"       "*"   "*"    " "      " "        
         HYDRAMYes HYPERCHYes HYPERPRYes ECLAMPYes CERVIXYes PINFANTYes
1  ( 1 ) " "       " "        " "        " "       " "       " "       
2  ( 1 ) " "       " "        " "        " "       " "       " "       
3  ( 1 ) " "       " "        " "        " "       " "       " "       
4  ( 1 ) " "       " "        " "        " "       " "       " "       
5  ( 1 ) " "       " "        " "        " "       " "       " "       
6  ( 1 ) " "       " "        " "        " "       " "       " "       
7  ( 1 ) " "       " "        "*"        " "       " "       " "       
8  ( 1 ) " "       " "        "*"        " "       " "       "*"       
         PRETERMYes UTERINEYes
1  ( 1 ) " "        " "       
2  ( 1 ) " "        " "       
3  ( 1 ) " "        " "       
4  ( 1 ) " "        " "       
5  ( 1 ) " "        " "       
6  ( 1 ) " "        " "       
7  ( 1 ) " "        " "       
8  ( 1 ) " "        " "


#pick the best number of variables based on best subset model
par(mfrow=c(2,2))
options(repr.plot.width=6, repr.plot.height=6)
reg.summary=summary(regfit.full)
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
points(8,reg.summary$adjr2[8], col="red",cex=2,pch=20)
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
points(8,reg.summary$cp[8],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(8,reg.summary$bic[8],col="red",cex=2,pch=20)
#the best pick would be 8


#find coefficients for 3 variable selection models
Bestsubset=coef(regfit.full,8)
Bestsubset
Forward=coef(regfit.fwd,8)
Forward
Backward=coef(regfit.bwd,8)
Backward


##################################
##Ridge regression
##################################
par(mfrow=c(1,1))
attach(infant)
x=model.matrix(BWEIGHT~.,infant)[,-1]
y=infant$BWEIGHT
train=sample(1:nrow(infant),nrow(infant)*0.8)
test=(-train)
y.test=y[test]
#Ridge Regression
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid, thresh=1e-12)
#we can use CV to choose the tuning parameter lambda
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
bestlam=cv.out$lambda.min
bestlam

The following objects are masked from infant (pos = 3):

    BDEAD, BWEIGHT, CERVIX, CIGNUM, DIABETES, DRINKNUM, ECLAMP, FAGE,
    FEDUC, GAINED, HYDRAM, HYPERCH, HYPERPR, LOUTCOME, MAGE, MARITAL,
    MEDUC, PINFANT, PRETERM, SEX, TERMS, TOTALP, UTERINE, VISITS, WEEKS


ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean((ridge.pred-y.test)^2)#test MSE


#refit on teh full data set, using the "best" lambda
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:24,] 
#note:none of beta.hat are zero!


##################################
##Lasso regression
##################################
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda=grid)
#plot(lasso.mod)#coefficient plot
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
#plot(cv.out)#CV error plot
bestlam=cv.out$lambda.1se
bestlam


lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean((lasso.pred-y.test)^2) #test MSE


out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:24,]
#lasso.coef
lasso.coef[lasso.coef!=0]


###########################################################
#Principal Components Regression and Partial Least Squares
##########################################################
#Fitting on training, use CV found the best number of principal components
options(repr.plot.width=5, repr.plot.height=5)
pcr.fit=pcr(BWEIGHT~., data=infant,subset=train,scale=TRUE, validation="CV")
validationplot(pcr.fit,val.type="MSEP")
#the lowest CV error occurs when M=21
pcr.pred=predict(pcr.fit,x[test,],ncomp=21)
mean((pcr.pred-y.test)^2) #test MSE

pcr.fit=pcr(y~x,scale=TRUE,ncomp=21)#refit on the whole data
summary(pcr.fit)

Data: 	X dimension: 101399 25 
	Y dimension: 101399 1
Fit method: svdpc
Number of components considered: 21
TRAINING: % variance explained
   1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
X  12.1770   21.925   27.450    32.44    36.91    41.22    45.38    49.46
y   0.3362    2.113    6.625    19.32    20.37    22.57    27.02    27.45
   9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
X    53.49     57.49     61.43     65.30     69.14     72.91     76.60
y    27.49     27.97     28.20     28.55     29.35     29.79     29.94
   16 comps  17 comps  18 comps  19 comps  20 comps  21 comps
X     80.25     83.77     87.08     90.05     92.78     95.23
y     31.65     32.23     32.52     37.71     37.84     38.07


#Partial Least Squares
#fit pls using training data
pls.fit=plsr(BWEIGHT~., data=infant,subset=train,scale=TRUE, validation="CV")
pls.pred=predict(pls.fit,x[test,],ncomp=4)
mean((pls.pred-y.test)^2)

#the lowest CV error occurs around M=4
validationplot(pls.fit,val.type="MSEP")
pls.fit=plsr(BWEIGHT~., data=infant,scale=TRUE,ncomp=4)
summary(pls.fit)

Data: 	X dimension: 101399 25 
	Y dimension: 101399 1
Fit method: kernelpls
Number of components considered: 4
TRAINING: % variance explained
         1 comps  2 comps  3 comps  4 comps
X          5.993    14.94    22.22    30.07
BWEIGHT   34.352    37.58    38.03    38.12


########################
# Non-linear Modeling 
#######################
#log transformation
options(repr.plot.width=8, repr.plot.height=6)
par(mfrow=c(2,2))
lm.fit=lm(log(BWEIGHT)~.,data=infant)
summary(lm.fit)
plot(lm.fit)

Call:
lm(formula = log(BWEIGHT) ~ ., data = infant)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.71268 -0.08879  0.01324  0.10701  0.66115 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -4.277e-01  9.828e-03 -43.520  < 2e-16 ***
SEXGirl          -3.797e-02  1.098e-03 -34.575  < 2e-16 ***
MARITALUnmarried -2.453e-02  1.370e-03 -17.905  < 2e-16 ***
FAGE             -1.993e-04  1.230e-04  -1.620  0.10529    
GAINED            2.053e-03  4.148e-05  49.488  < 2e-16 ***
VISITS            2.600e-03  1.549e-04  16.782  < 2e-16 ***
MAGE              1.604e-03  1.554e-04  10.317  < 2e-16 ***
FEDUC            -8.941e-04  2.848e-04  -3.140  0.00169 ** 
MEDUC             2.186e-04  2.961e-04   0.738  0.46026    
TOTALP            6.993e-03  7.125e-04   9.815  < 2e-16 ***
BDEAD            -3.632e-02  4.011e-03  -9.055  < 2e-16 ***
TERMS            -1.139e-02  1.174e-03  -9.704  < 2e-16 ***
LOUTCOME2        -1.006e-02  2.094e-03  -4.805 1.55e-06 ***
LOUTCOME9        -2.849e-02  1.650e-03 -17.270  < 2e-16 ***
WEEKS             5.924e-02  2.265e-04 261.493  < 2e-16 ***
CIGNUM           -4.606e-03  1.872e-04 -24.608  < 2e-16 ***
DRINKNUM          1.350e-03  2.642e-03   0.511  0.60944    
DIABETESYes       4.618e-02  3.109e-03  14.855  < 2e-16 ***
HYDRAMYes        -5.074e-02  4.551e-03 -11.150  < 2e-16 ***
HYPERCHYes       -3.922e-02  4.747e-03  -8.262  < 2e-16 ***
HYPERPRYes       -4.974e-02  2.554e-03 -19.471  < 2e-16 ***
ECLAMPYes        -1.132e-01  8.957e-03 -12.643  < 2e-16 ***
CERVIXYes        -1.238e-01  9.061e-03 -13.666  < 2e-16 ***
PINFANTYes        1.133e-01  7.322e-03  15.475  < 2e-16 ***
PRETERMYes       -6.970e-02  5.912e-03 -11.790  < 2e-16 ***
UTERINEYes       -8.933e-02  9.379e-03  -9.524  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1746 on 101373 degrees of freedom
Multiple R-squared:  0.4636,	Adjusted R-squared:  0.4634 
F-statistic:  3504 on 25 and 101373 DF,  p-value: < 2.2e-16


lm.fit=lm(log(BWEIGHT)~.,data=infant,subset = train)
exp(mean((log(BWEIGHT)-predict(lm.fit,infant))[-train]^2))


#use k-fold to choose the optimal degree d for the polynomial and plot
options(repr.plot.width=5, repr.plot.height=5)
all.deltas = rep(NA, 10)
for (i in 1:10) {
  glm.fit = glm(BWEIGHT~poly(WEEKS, i), data=infant)
  all.deltas[i] = cv.glm(infant, glm.fit, K=10)$delta[2]
}
plot(1:10, all.deltas, xlab="Degree", ylab="CV error", type="l", pch=20, lwd=2, ylim=c(1,1.3))
min.point = min(all.deltas)
sd.points = sd(all.deltas)
abline(h=min.point + 0.2 * sd.points, col="red", lty="dashed")
abline(h=min.point - 0.2 * sd.points, col="red", lty="dashed")
legend("topright", "0.2-standard deviation lines", lty="dashed", col="red")
#Degree 3 appears to provide a reasonable fit to the data


#now plot the polynomial prediction on the data
plot(BWEIGHT~WEEKS, data=infant, col="darkgrey")
WEEKSlims = range(infant$WEEKS)
WEEKS.grid = seq(from=WEEKSlims[1], to=WEEKSlims[2])
lm.fit = lm(BWEIGHT~poly(WEEKS, 3), data=infant)
lm.pred = predict(lm.fit, data.frame(WEEKS=WEEKS.grid))
lines(WEEKS.grid, lm.pred, col="blue", lwd=2)


#find test MSE
lm.fit=lm(BWEIGHT~.+poly(WEEKS, 3),data=infant,subset = train)
summary(lm.fit)
mean((BWEIGHT-predict(lm.fit,infant))[-train]^2)

Call:
lm(formula = BWEIGHT ~ . + poly(WEEKS, 3), data = infant, subset = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.4117 -0.6488 -0.0222  0.6223  5.5469 

Coefficients: (1 not defined because of singularities)
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -4.436e+00  6.290e-02 -70.525  < 2e-16 ***
SEXGirl          -2.609e-01  7.031e-03 -37.108  < 2e-16 ***
MARITALUnmarried -1.557e-01  8.781e-03 -17.731  < 2e-16 ***
FAGE             -1.718e-03  7.858e-04  -2.186 0.028816 *  
GAINED            1.203e-02  2.661e-04  45.191  < 2e-16 ***
VISITS            5.373e-03  9.920e-04   5.416 6.11e-08 ***
MAGE              1.079e-02  9.942e-04  10.851  < 2e-16 ***
FEDUC            -6.384e-03  1.820e-03  -3.507 0.000453 ***
MEDUC             1.993e-03  1.898e-03   1.050 0.293754    
TOTALP            3.554e-02  4.557e-03   7.800 6.29e-15 ***
BDEAD            -9.588e-02  2.598e-02  -3.690 0.000224 ***
TERMS            -5.947e-02  7.494e-03  -7.936 2.11e-15 ***
LOUTCOME2        -3.295e-02  1.338e-02  -2.463 0.013797 *  
LOUTCOME9        -1.348e-01  1.056e-02 -12.770  < 2e-16 ***
WEEKS             2.910e-01  1.449e-03 200.806  < 2e-16 ***
CIGNUM           -3.111e-02  1.196e-03 -26.008  < 2e-16 ***
DRINKNUM          3.783e-03  1.553e-02   0.244 0.807489    
DIABETESYes       2.609e-01  1.977e-02  13.194  < 2e-16 ***
HYDRAMYes        -3.216e-01  2.896e-02 -11.105  < 2e-16 ***
HYPERCHYes       -1.882e-01  3.037e-02  -6.197 5.79e-10 ***
HYPERPRYes       -3.122e-01  1.638e-02 -19.053  < 2e-16 ***
ECLAMPYes        -5.488e-01  5.790e-02  -9.478  < 2e-16 ***
CERVIXYes        -2.547e-01  5.847e-02  -4.356 1.32e-05 ***
PINFANTYes        9.270e-01  4.733e-02  19.588  < 2e-16 ***
PRETERMYes       -3.579e-01  3.780e-02  -9.466  < 2e-16 ***
UTERINEYes       -1.430e-01  6.065e-02  -2.357 0.018420 *  
poly(WEEKS, 3)1          NA         NA      NA       NA    
poly(WEEKS, 3)2  -7.344e+01  1.128e+00 -65.122  < 2e-16 ***
poly(WEEKS, 3)3  -6.191e+01  1.122e+00 -55.185  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9998 on 81091 degrees of freedom
Multiple R-squared:  0.434,	Adjusted R-squared:  0.4338 
F-statistic:  2303 on 27 and 81091 DF,  p-value: < 2.2e-16

Warning message in predict.lm(lm.fit, infant):
"prediction from a rank-deficient fit may be misleading"


#Step function
#use k-fold cv to choose best cuts
all.cvs = rep(NA, 10)
for (i in 2:10) {
  infant$WEEKS.cut = cut(infant$WEEKS, i)
  lm.fit = glm(BWEIGHT~WEEKS.cut, data=infant)
  all.cvs[i] = cv.glm(infant, lm.fit, K=10)$delta[2]
}
plot(2:10, all.cvs[-1], xlab="Number of cuts", ylab="CV error", type="l", pch=20, lwd=2)


#now train the entire data with step function using 10 cuts and plot it.
lm.fit = glm(BWEIGHT~cut(WEEKS, 10), data=infant)
WEEKSlims = range(infant$WEEKS)
WEEKS.grid = seq(from=WEEKSlims[1], to=WEEKSlims[2])
lm.pred = predict(lm.fit, data.frame(WEEKS=WEEKS.grid))
plot(BWEIGHT~WEEKS, data=infant, col="darkgrey")
lines(WEEKS.grid, lm.pred, col="red", lwd=2)


#check the test error
lm.fit=lm(BWEIGHT~.+cut(WEEKS, 10), data=infant,subset = train)
mean((BWEIGHT-predict(lm.fit,infant))[-train]^2)

Warning message in predict.lm(lm.fit, infant):
"prediction from a rank-deficient fit may be misleading"


# Cubic Splines
#bs() function generates the matrix of basis functions for splines
#with the specified set of knots. By default, cubic splinse are produced
lm.fit=lm(BWEIGHT~bs(WEEKS,knots=c(15,25,35,45)),data=infant)
pred=predict(lm.fit,newdata=list(WEEKS=WEEKS.grid),se=T)
plot(WEEKS,BWEIGHT,col="gray")
lines(WEEKS.grid,pred$fit,lwd=2)
lines(WEEKS.grid,pred$fit+2*pred$se,lty="dashed")
lines(WEEKS.grid,pred$fit-2*pred$se,lty="dashed")

Warning message in predict.lm(lm.fit, newdata = list(WEEKS = WEEKS.grid), se = T):
"prediction from a rank-deficient fit may be misleading"


lm.fit=lm(BWEIGHT~bs(WEEKS,knots=c(15,20,25,30,35,40,45)),data=infant,subset = train)
mean((BWEIGHT-predict(lm.fit,infant))[-train]^2)

Warning message in predict.lm(lm.fit, infant):
"prediction from a rank-deficient fit may be misleading"


#################
#Decision trees
#################
#Regression Trees
tree.infant=tree(BWEIGHT~.,infant,subset=train)
summary(tree.infant)
plot(tree.infant)
text(tree.infant,pretty=0)

Regression tree:
tree(formula = BWEIGHT ~ ., data = infant, subset = train)
Variables actually used in tree construction:
[1] "WEEKS"
Number of terminal nodes:  5 
Residual mean deviance:  1.125 = 91230 / 81110 
Distribution of residuals:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-6.45800 -0.70820 -0.02074  0.00000  0.66680  5.65000


#in regression, deviance is the RSS
cv.infant=cv.tree(tree.infant)
plot(cv.infant$size,cv.infant$dev,type='b')


yhat=predict(tree.infant,newdata=infant[-train,])
infant.test=infant[-train,"BWEIGHT"]
plot(yhat,infant.test)
abline(0,1)
mean((yhat-infant.test)^2)


#Bagging 
bag.infant=randomForest(BWEIGHT~.,data=infant,subset=train,mtry=24,ntree=15,importance=TRUE)
bag.infant
yhat.bag = predict(bag.infant,newdata=infant[-train,])
#plot(yhat.bag, infant.test)
#abline(0,1)
mean((yhat.bag-infant.test)^2)

Call:
 randomForest(formula = BWEIGHT ~ ., data = infant, mtry = 24,      ntree = 15, importance = TRUE, subset = train) 
               Type of random forest: regression
                     Number of trees: 15
No. of variables tried at each split: 24

          Mean of squared residuals: 1.21612
                    % Var explained: 31.11


#Random Forest
rf.infant=randomForest(BWEIGHT~.,data=infant,subset=train,mtry=5,ntree=15,importance=TRUE)
yhat.rf = predict(rf.infant,newdata=infant[-train,])
mean((yhat.rf-infant.test)^2)

#view the importance of each variable
importance(rf.infant)
varImpPlot(rf.infant)


#Boosting
boost.infant=gbm(BWEIGHT~.,data=infant[train,],distribution="gaussian",n.trees=500,interaction.depth=4)
#produces influence plot and influence statistics
summary(boost.infant)

yhat.boost = predict(boost.infant,newdata=infant[-train,],n.trees=500)
mean((yhat.boost-infant.test)^2)


#summary of test MSE for all models
# Create data
options(repr.plot.width = 8, repr.plot.height = 5)
data <- data.frame(
  Model=c("Linear","Ridge","Lasso","PCR","PLS","Log","Poly","Step","Cubic Spline","RegTrees","Bagging","RF","Boosting") ,  
  Test_MSE=c(1.09,1.11,1.12,1.11,1.11,1.03,1.02,1.03,1.12,1.16,1.11,1.05,1)
)

# Barplot
ggplot(data, aes(x = reorder(Model, +Test_MSE), y = Test_MSE)) +
         geom_bar(stat = "identity",fill="steelblue")

	%IncMSE	IncNodePurity
SEX	11.3785731	1640.00062
MARITAL	12.2720592	1315.87367
FAGE	14.6220199	7740.37587
GAINED	33.3141228	10367.83347
VISITS	9.1154421	7468.46252
MAGE	22.1282790	7471.09635
FEDUC	26.1814351	4850.04693
MEDUC	21.0629529	5099.79180
TOTALP	8.8898635	3322.11780
BDEAD	1.1559141	344.34499
TERMS	5.9550367	1862.12689
LOUTCOME	10.1823037	1609.13709
WEEKS	10.6210870	35526.77746
CIGNUM	17.5684979	1915.45669
DRINKNUM	0.8395205	96.83607
DIABETES	7.3494876	792.92903
HYDRAM	2.7382119	543.79625
HYPERCH	3.8310557	399.29859
HYPERPR	8.5163815	1168.25648
ECLAMP	3.3727051	227.57158
CERVIX	0.4753805	129.96199
PINFANT	13.0091975	504.87354
PRETERM	2.9054686	329.84983
UTERINE	0.9705705	133.60566
WEEKS.cut	5.4041805	19052.22040

Infant Birth Weight Prediction

Linsi Lin

Introduction¶

Table of Contents¶

Variable Names and Descriptions:¶

Get the Data¶

Data Preprocessing¶

Exploratory Data Analysis¶

Linear Model¶

Linear Model Selection¶

Ridge and Lasso Regression¶

Principle Components Regression and Partial Least Squares¶

Non-linear Model¶

Logarithmic Transformation¶

Polynomial Regression¶

Step Function¶

Cubic Splines¶

Regression Trees¶

Bagging and Random Forest¶

Boosting¶

Conclusion¶

	SEX	MARITAL	FAGE	GAINED	VISITS	MAGE	FEDUC	MEDUC	TOTALP	...	HYPERPR	BWEIGHT
2001	2	1	33	26	10	34	12	4	2	...	0	4.3750
2002	2	2	19	40	10	18	11	12	1	...	0	6.9375
2003	2	1	33	16	14	31	16	16	2	...	0	8.5000
2004	1	1	25	40	15	28	12	12	3	...	0	8.5000
2005	1	2	21	60	13	20	12	14	2	...	1	9.0000
2006	1	1	21	30	15	21	12	13	1	...	0	8.0000

	var	rel.inf
WEEKS	WEEKS	71.27633568
WEEKS.cut	WEEKS.cut	9.39893458
GAINED	GAINED	4.55093255
SEX	SEX	2.18388214
CIGNUM	CIGNUM	2.00213074
MAGE	MAGE	1.61573543
MARITAL	MARITAL	1.07752527
VISITS	VISITS	1.06896761
LOUTCOME	LOUTCOME	1.04531812
FAGE	FAGE	0.87191711
PINFANT	PINFANT	0.73328658
HYPERPR	HYPERPR	0.69964973
MEDUC	MEDUC	0.65855236
FEDUC	FEDUC	0.59702004
TOTALP	TOTALP	0.56094523
DIABETES	DIABETES	0.42471666
HYDRAM	HYDRAM	0.25904084
ECLAMP	ECLAMP	0.20711348
TERMS	TERMS	0.20618812
PRETERM	PRETERM	0.18594217
HYPERCH	HYPERCH	0.14069017
BDEAD	BDEAD	0.11518760
CERVIX	CERVIX	0.06249128
DRINKNUM	DRINKNUM	0.03267230
UTERINE	UTERINE	0.02482421