Load North Carolina Births data

Use the data to predict whether a baby is carried to full term or born premature (premie)

In R

download.file("http://www.openintro.org/stat/data/nc.RData", destfile = "nc.RData")
load("nc.RData")

In Python

import numpy as np
import pandas as pd
data = pd.read_csv('http://photo.etangkk.com/python/NCbirths.txt', sep='\t')

Refer to Data Basics and Summary Statistics in R, Python Pandas, and PySpark DataFrame for summary statistics of this dataset
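
In Python, a quick first look at the dimensions, column types, and missing-value counts (a minimal sketch using the data frame loaded above):

# dimensions, column types, and count of missing values per column
print(data.shape)
print(data.dtypes)
print(data.isnull().sum())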

Impute missing values with the median for continuous variables

In R

num_var <- names(nc[, sapply(nc, is.numeric)])
for (i in num_var) {
    nc[is.na(nc[, i]), i] <- median(nc[, i], na.rm = TRUE)
}
summary(nc)
##       fage            mage            mature        weeks      
##  Min.   :14.00   Min.   :13   mature mom :133   Min.   :20.00  
##  1st Qu.:26.00   1st Qu.:22   younger mom:867   1st Qu.:37.00  
##  Median :30.00   Median :27                     Median :39.00  
##  Mean   :30.21   Mean   :27                     Mean   :38.34  
##  3rd Qu.:34.00   3rd Qu.:32                     3rd Qu.:40.00  
##  Max.   :55.00   Max.   :50                     Max.   :45.00  
##        premie        visits            marital        gained     
##  full term:846   Min.   : 0.0   married    :386   Min.   : 0.00  
##  premie   :152   1st Qu.:10.0   not married:613   1st Qu.:21.00  
##  NA's     :  2   Median :12.0   NA's       :  1   Median :30.00  
##                  Mean   :12.1                     Mean   :30.32  
##                  3rd Qu.:15.0                     3rd Qu.:38.00  
##                  Max.   :30.0                     Max.   :85.00  
##      weight       lowbirthweight    gender          habit    
##  Min.   : 1.000   low    :111    female:503   nonsmoker:873  
##  1st Qu.: 6.380   not low:889    male  :497   smoker   :126  
##  Median : 7.310                               NA's     :  1  
##  Mean   : 7.101                                              
##  3rd Qu.: 8.060                                              
##  Max.   :11.750                                              
##       whitemom  
##  not white:284  
##  white    :714  
##  NA's     :  2  
##                 
##                 
## 

In Python

# replace NAs in each numeric column with that column's median
num_var = list(data.select_dtypes(include=[np.number]).columns.values)
for i in num_var:
  data[i] = data[i].fillna(data[i].median())
print(data.describe())
##               fage         mage        weeks       visits       gained  \
## count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
## mean     30.212000    27.000000    38.336000    12.104000    30.317000   
## std       6.158489     6.213583     2.928768     3.937091    14.047628   
## min      14.000000    13.000000    20.000000     0.000000     0.000000   
## 25%      26.000000    22.000000    37.000000    10.000000    21.000000   
## 50%      30.000000    27.000000    39.000000    12.000000    30.000000   
## 75%      34.000000    32.000000    40.000000    15.000000    38.000000   
## max      55.000000    50.000000    45.000000    30.000000    85.000000   
## 
##            weight  
## count  1000.00000  
## mean      7.10100  
## std       1.50886  
## min       1.00000  
## 25%       6.38000  
## 50%       7.31000  
## 75%       8.06000  
## max      11.75000

Remove NAs in categorical variables

In R

nc <- nc[complete.cases(nc), ]

In Python

data.dropna(inplace=True)

Convert categorical variables

PCA works only on numeric variables, so convert the categorical variables into numeric (dummy) variables

In R

library(dummies)
factor_var <- names(nc[, sapply(nc, is.factor)])
new_nc <- dummy.data.frame(nc, names = c(factor_var))

names(new_nc) <- sub(" ", "_", names(new_nc))
# rename the target dummy to 'label' and drop its redundant complement
names(new_nc)[names(new_nc) == "premiepremie"] <- "label"
new_nc$premiefull_term <- NULL
str(new_nc)
## 'data.frame':    996 obs. of  19 variables:
##  $ fage                 : int  30 30 19 21 30 30 18 17 30 20 ...
##  $ mage                 : num  13 14 15 15 15 15 15 15 16 16 ...
##  $ maturemature_mom     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ matureyounger_mom    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ weeks                : num  39 42 37 41 39 38 37 35 38 37 ...
##  $ label                : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ visits               : int  10 15 11 6 9 19 12 5 9 13 ...
##  $ maritalmarried       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ maritalnot_married   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ gained               : int  38 20 38 34 27 22 76 15 30 52 ...
##  $ weight               : num  7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
##  $ lowbirthweightlow    : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ lowbirthweightnot_low: int  1 1 1 1 1 0 1 0 1 1 ...
##  $ genderfemale         : int  0 0 1 0 1 0 0 0 0 1 ...
##  $ gendermale           : int  1 1 0 1 0 1 1 1 1 0 ...
##  $ habitnonsmoker       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ habitsmoker          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ whitemomnot_white    : int  1 1 0 0 1 1 1 1 0 0 ...
##  $ whitemomwhite        : int  0 0 1 1 0 0 0 0 1 1 ...
##  - attr(*, "dummies")=List of 7
##   ..$ mature        : int  3 4
##   ..$ premie        : int  6 7
##   ..$ marital       : int  9 10
##   ..$ lowbirthweight: int  13 14
##   ..$ gender        : int  15 16
##   ..$ habit         : int  17 18
##   ..$ whitemom      : int  19 20

In Python

factor_var = list(set(data.columns.values)- set(num_var))

for i in factor_var:
  data = pd.get_dummies(data, columns=[i])

data = data.rename(columns={col: col.replace(' ', '_') for col in data.columns})
data = data.rename(columns = {'premie_premie': 'label'})
del data['premie_full_term']
print(data.head(2))
##    fage  mage  weeks  visits  gained  weight  habit_nonsmoker  habit_smoker  \
## 1    30    13     39      10      38    7.63                1             0   
## 2    30    14     42      15      20    7.88                1             0   
## 
##    gender_female  gender_male  label  marital_married  marital_not_married  \
## 1              0            1      0                1                    0   
## 2              0            1      0                1                    0   
## 
##    mature_mature_mom  mature_younger_mom  lowbirthweight_low  \
## 1                  0                   1                   0   
## 2                  0                   1                   0   
## 
##    lowbirthweight_not_low  whitemom_not_white  whitemom_white  
## 1                       1                   1               0  
## 2                       1                   1               0

Split the data into training and testing sets

In R

library(caret)
set.seed(100)

training_index <- createDataPartition(new_nc$label, p = 0.75, list = FALSE)
nc_train <- new_nc[training_index, ]
nc_test <- new_nc[-training_index, ]

In Python

from sklearn.model_selection import train_test_split as sklearn_split

train_df, test_df = sklearn_split(data, test_size=.25, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
print("train dim =", train_df.shape, "; test dim =", test_df.shape)
## train dim = (747, 19) ; test dim = (249, 19)

Perform PCA

In R                        In Python (sklearn)                       Output description
prcomp()                    sklearn.decomposition.PCA()               performs the PCA
center                      df.mean(axis=0)                           mean of the variables prior to PCA
scale                       df.std(axis=0)                            standard deviation of the variables prior to PCA
rotation                    components_                               the matrix of variable loadings
x                           fit_transform() / transform() output      the principal component scores
sdev                        sqrt of explained_variance_               the standard deviation of each principal component
sdev ^ 2 / sum(sdev ^ 2)    explained_variance_ratio_                 the proportion of variance explained by each component
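
As a minimal, self-contained sketch of this correspondence (hypothetical toy data, Python only):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).normal(size=(100, 4))  # toy data for illustration
p = PCA().fit(X)
loadings = p.components_.T                # analogous to prcomp(X)$rotation
scores = p.transform(X)                   # analogous to prcomp(X)$x
sdev = np.sqrt(p.explained_variance_)     # analogous to prcomp(X)$sdev
prop_var = p.explained_variance_ratio_    # analogous to sdev^2 / sum(sdev^2)
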
In R

# prcomp() centers variables to mean = 0; scale = T also normalizes them to standard deviation = 1
prin_comp <- prcomp(nc_train[, !(colnames(nc_train) %in% c('label'))], scale = T)

# Plot the resultant principal components
biplot(prin_comp, scale=0)

# compute variance of principal components (sdev ^ 2)
pc_var <- prin_comp$sdev ^ 2

# calculate proportion of variance explained by each PC
prop_varex <- pc_var / sum(pc_var)

In Python

# Convert pandas df to numpy arrays
train_np = train_df[train_df.columns.difference(['label'])].values
test_np = test_df[test_df.columns.difference(['label'])].values

# Scale the values: fit the scaler on the training data and apply it to both sets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scale = scaler.fit_transform(train_np)
test_scale = scaler.transform(test_np)

# train PCA
from sklearn.decomposition import PCA as sklearnPCA
pca = sklearnPCA(n_components=train_df.shape[1]-1)
pca.fit(train_scale)

# calculate proportion of variance explained by each PC
var = pd.DataFrame({'PC': np.arange(1,train_df.shape[1]), 'var': np.round(pca.explained_variance_ratio_, decimals=4)*100})

# calculate the cumulative variance explained by each PC
cumvar = pd.DataFrame({'PC': np.arange(1,train_df.shape[1]), 'cumvar': np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)})

The first PC explains 21% of the variance, the second PC explains 18%, and so on…
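
A quick check of these numbers, using the var and cumvar data frames built above:

# first few rows of the explained-variance and cumulative-variance tables
print(var.head())
print(cumvar.head())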

Determine the number of principal components to model

In R

par(mfrow = c(1, 2))

plot(prop_varex, xlab="Principal Component", ylab="Proportion of Variance Explained", type="b", main="Scree plot")

plot(cumsum(prop_varex), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", type="b", main="Cumulative Scree plot")

In Python

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(1,2,figsize=(10,5))

sns.pointplot(x="PC", y="var", data=var, ax=ax[0])
ax[0].title.set_text('Scree plot')

sns.pointplot(x="PC", y="cumvar", data=cumvar, ax=ax[1])
ax[1].title.set_text('Cumulative Scree plot')

plt.show()

The plot shows that the first 10 PCs explain close to 97% of the variance, so PCA reduces the 18 predictors to 10 without compromising much on explained variance
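
The same cut-off can also be picked programmatically; a sketch assuming the fitted pca object and the 97% threshold from above:

# smallest number of PCs whose cumulative explained variance reaches 97%
n_pc = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.97) + 1)
print(n_pc)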

Transform the train and test datasets with PCA

Apply the PCA transformation learned from the training set to the test dataset, including the same centering and scaling

In R

# build the training set from the label and the principal component scores
train_data <- data.frame(label = nc_train$label, prin_comp$x)

# keep the label plus the first 10 PCs
train_data <- train_data[, 1:11]

# transform test dataset into PC
test_data <- predict(prin_comp, newdata = nc_test[, !(colnames(nc_test) %in% 
    c("label"))])

# select the first 10 PCs
test_data <- as.data.frame(test_data[, 1:10])

In Python

# select the first 10 PCs
pca_reduced = sklearnPCA(n_components=10)
train_reduced_df = pd.DataFrame(pca_reduced.fit_transform(train_scale))

# add the label column to the training set of principal components
train_reduced_df['label'] = train_df['label']

# transform the test dataset with the PCA fitted on the training data
test_reduced_df = pd.DataFrame(pca_reduced.transform(test_scale))
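
As an alternative sketch, a scikit-learn Pipeline bundles the scaling and PCA steps so that both are fitted on the training data only and then applied unchanged to the test data (using train_np and test_np from above):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# scaler and 10-component PCA fitted together on the training data
pca_pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=10))])
train_pcs = pca_pipe.fit_transform(train_np)
test_pcs = pca_pipe.transform(test_np)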

Predictive model with PCA components

Use the PCA components to train a generalized linear model (binomial logistic regression)

In R

# run a logistic model
train_control <- trainControl(method = "repeatedcv", number = 5)
glm_model <- train(label ~ ., data = train_data, method = "glm", family = binomial, 
    trControl = train_control)

# make prediction
test_data$pred <- predict(glm_model, newdata = test_data, type = "raw")
test_data$pred <- ifelse(test_data$pred > 0.5, 1, 0)

confusionMatrix(test_data$pred, nc_test$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 212   3
##          1   3  31
##                                           
##                Accuracy : 0.9759          
##                  95% CI : (0.9483, 0.9911)
##     No Information Rate : 0.8635          
##     P-Value [Acc > NIR] : 7.627e-10       
##                                           
##                   Kappa : 0.8978          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9860          
##             Specificity : 0.9118          
##          Pos Pred Value : 0.9860          
##          Neg Pred Value : 0.9118          
##              Prevalence : 0.8635          
##          Detection Rate : 0.8514          
##    Detection Prevalence : 0.8635          
##       Balanced Accuracy : 0.9489          
##                                           
##        'Positive' Class : 0               
## 

In Python

# Run a logistic model
from sklearn.linear_model import LogisticRegression
glm = LogisticRegression()
glm.fit(train_reduced_df.iloc[:,:-1], train_reduced_df.iloc[:,-1])

# make prediction
test_reduced_df['pred'] = glm.predict(test_reduced_df)

from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_df['label'], test_reduced_df['pred']))
## [[175  32]
##  [ 23  19]]
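
To summarize the Python model in a form closer to R's confusionMatrix() output, a quick sketch using scikit-learn's metrics:

from sklearn.metrics import accuracy_score, classification_report

# overall accuracy and per-class precision/recall on the test set
print(accuracy_score(test_df['label'], test_reduced_df['pred']))
print(classification_report(test_df['label'], test_reduced_df['pred']))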