North Carolina births
In 2004, the state of North Carolina released a large data set containing information on births recorded in this state. This data set is useful to researchers studying the relation between habits and practices of expectant mothers and the birth of their children. We will work with a random sample of observations from this data set.
In R
# Download the North Carolina births sample from OpenIntro and load it into
# the session. Later snippets read the data through an object named `nc`
# (presumably restored from this .RData file -- confirm).
download.file(
  url = "http://www.openintro.org/stat/data/nc.RData",
  destfile = "nc.RData"
)
load(file = "nc.RData")
In Python
import pandas as pd
# Read the tab-separated NCbirths file straight from its URL into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='http://photo.etangkk.com/python/NCbirths.txt',
    sep='\t',
)
A histogram shows the frequency distribution of a continuous or discrete numeric variable. Its x-axis is a number line, so the order of the bars is fixed and cannot be rearranged. It is useful for describing the shape of the distribution.
In R
# Base-graphics histogram of father's age, showing counts per bin.
# The x-axis runs from 10 to the largest recorded age (NAs ignored) and the
# y-axis is fixed at 0-250.
hist(
  nc$fage,
  breaks = 10,
  freq = TRUE,
  xlim = c(10, max(nc$fage, na.rm = TRUE)),
  ylim = c(0, 250),
  main = "Histogram of father's age",
  xlab = "Age in years"
)
Using ggplot2
library(ggplot2)
# ggplot2 histogram of father's age with 15 bins (missing ages are dropped
# silently via na.rm = TRUE).
# The viewing window is set with coord_cartesian() instead of xlim()/ylim():
# scale limits discard everything that falls outside them, so histogram bars
# whose edges reach past the limits would be removed from the plot, whereas
# coord_cartesian() only zooms.
ggplot(data = nc, aes(x = fage)) +
  geom_histogram(bins = 15, na.rm = TRUE) +
  labs(title = "Histogram of father's age", x = "Age in years") +
  coord_cartesian(
    xlim = c(0, max(nc$fage, na.rm = TRUE)),
    ylim = c(0, 200)
  )
A histogram can easily be turned into a density plot: instead of counting the number of observations per bin, it shows the probability density.
# Same base-graphics histogram on the density scale (freq = FALSE), so the
# bar areas rather than the bar heights add up to one.
hist(
  nc$fage,
  breaks = 15,
  freq = FALSE,
  xlim = c(10, max(nc$fage, na.rm = TRUE)),
  ylim = c(0, .075),
  main = "Probability of father's age",
  xlab = "Age in years"
)
Using ggplot2
# ggplot2 histogram of father's age with the computed density on the y-axis,
# outlined in blue and filled in yellow. No bin count is given, so
# geom_histogram() falls back to its default binning.
ggplot(data = nc, mapping = aes(x = fage)) +
  geom_histogram(
    mapping = aes(y = (..density..)),
    colour = "blue",
    fill = "yellow"
  ) +
  labs(
    title = "Probability of father's age",
    x = "Age in years",
    y = "Probability"
  )
In Python
import matplotlib.pyplot as plt
%matplotlib inline
# Pandas/matplotlib histogram of father's age: 10 bins, x-axis from 10 up to
# the largest age in the column, y-axis fixed to 0-200.
ax = data['fage'].plot.hist(
    bins=10,
    xlim=[10, data['fage'].max()],
    ylim=[0, 200],
)
# Label the axes object returned by the plot call.
ax.set_title("Histogram of father's age")
ax.set_xlabel('Age in years')
ax.set_ylabel('Frequency')
Using seaborn
import seaborn as sns
# Seaborn version of the count histogram: bars only (no KDE curve, no rug),
# drawn on an explicitly created 6x4 figure after dropping missing ages.
f, ax = plt.subplots(figsize=(6, 4))
sns.distplot(
    data['fage'].dropna(),
    bins=10,
    hist=True,
    kde=False,
    rug=False,
    ax=ax,
);
ax.set(
    xlim=(0, data['fage'].max()),
    ylabel="Frequency",
    xlabel="Age in years",
    title="Histogram of father's age",
)
# Density-scaled histogram of father's age (15 bins, x-axis from 10 to the
# largest age in the column).
# `density=True` replaces the former `normed=True` keyword, which matplotlib
# deprecated and then removed; with `normed` the call fails on current
# matplotlib releases.
ax = data['fage'].plot(
    kind='hist',
    bins=15,
    density=True,
    xlim=[10, data['fage'].max()],
)
ax.set_title("Probability of father's age")
ax.set_xlabel('Age in years')
ax.set_ylabel('Density')
Using seaborn
import seaborn as sns
# Seaborn histogram of father's age with a KDE curve overlaid (no rug),
# drawn on an explicitly created 6x4 figure after dropping missing ages.
f, ax = plt.subplots(figsize=(6, 4))
sns.distplot(
    data['fage'].dropna(),
    bins=10,
    hist=True,
    kde=True,
    rug=False,
    ax=ax,
);
ax.set(
    xlim=(0, data['fage'].max()),
    ylabel="Density",
    xlabel="Age in years",
    title="Probability of father's age",
)
The choice of bin width can alter the story the histogram is telling.
Density plots are usually a more effective way to view the distribution of a continuous variable.
# Kernel density estimate of father's age (NAs removed), drawn as a new plot.
# The former `add = TRUE` argument was dropped: plot() has no such argument
# for density objects, so it was passed on as a graphical parameter and only
# triggered "not a graphical parameter" warnings.
plot(
  density(nc$fage, na.rm = TRUE),
  main = "Density of father's age",
  xlab = "Father age"
)
Using ggplot2
# ggplot2 kernel density curve of father's age with a translucent red fill.
ggplot(data = nc, mapping = aes(x = fage)) +
  geom_density(alpha = .2, fill = "red") +
  labs(
    title = "Probability of father's age",
    x = "Age in years",
    y = "Probability"
  )
In Python Using seaborn
import seaborn as sns
# Seaborn density curve only (histogram and rug switched off) for father's
# age, drawn on an explicitly created 6x4 figure after dropping missing ages.
f, ax = plt.subplots(figsize=(6, 4))
sns.distplot(
    data['fage'].dropna(),
    bins=10,
    hist=False,
    kde=True,
    rug=False,
    ax=ax,
);
ax.set(
    xlim=(0, data['fage'].max()),
    ylabel="Density",
    xlabel="Age in years",
    title="Density of father's age",
)
A boxplot is useful for highlighting outliers, the median, and the interquartile range (IQR).
In R
# Horizontal base-graphics boxplot of the `gained` column.
boxplot(
  nc$gained,
  horizontal = TRUE,
  xlab = "Weight gained by mother during pregnancy in pounds"
)
Using ggplot2
# ggplot2 boxplot of the `gained` column. A constant factor supplies the
# single x position, its axis breaks are suppressed, and coord_flip() lays
# the box out horizontally.
ggplot(nc, aes(x = factor(0), y = gained)) +
  geom_boxplot() +
  labs(x = "", y = "Weight gained by mother during pregnancy in pounds") +
  scale_x_discrete(breaks = NULL) +
  coord_flip()
In Python
# Horizontal pandas/matplotlib boxplot of the `gained` column.
ax = data['gained'].plot.box(vert=False)
ax.set_xlabel('Weight gained by mother during pregnancy in pounds')
Using seaborn
import seaborn as sns
# Seaborn boxplot of the `gained` column, matching the R and pandas boxplots
# in this section. The previous code here was a copy of the father's-age
# density plot and did not draw a boxplot at all.
f, ax = plt.subplots(figsize=(6, 4))
# Passing the values as `x` lays the box out horizontally; missing values are
# dropped first.
sns.boxplot(x=data['gained'].dropna(), ax=ax);
ax.set(xlabel="Weight gained by mother during pregnancy in pounds")
A frequency table is the easiest way to see what occurs most often in a set of data.
In R
# Count the values of `whitemom`. `exclude = NULL` keeps missing values as
# their own <NA> category instead of dropping them.
mytable <- table(
  nc$whitemom,
  exclude = NULL
)
print(mytable)
##
## not white white <NA>
## 284 714 2
Table of proportions displays the relative frequency.
# Convert the counts in `mytable` to shares of the grand total.
print(prop.table(mytable))
##
## not white white <NA>
## 0.284 0.714 0.002
In Python
# Count each distinct value of `whitemom`; missing values are left out
# (dropna=True is the default, spelled out here for clarity).
data['whitemom'].value_counts(dropna=True)
## white 714
## not white 284
## dtype: int64
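pandas has no single built-in call that reports counts and percentages side by side (`value_counts(normalize=True)` returns proportions only), so I’ve written the following function to generate a frequency table with percentages in Python.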
def freq_table(df, var, rmna=False):
    """Print and return a frequency table (counts and percentages) for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to tabulate. It is not modified.
    var : str
        Name of the column to tabulate.
    rmna : bool, default False
        Despite the name, ``True`` does not remove missing values: it
        relabels them as the string ``'NaN'`` so they show up as their own
        row. With ``False`` missing values are left out of the table.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``var`` (sorted), with columns
        ``'Counts'`` and ``'Frequencies'`` (percent of the tabulated total,
        rounded to two decimals).
    """
    # Work on the extracted column so the caller's DataFrame is never mutated.
    column = df[var]
    if rmna is True:
        column = column.fillna('NaN')

    # Grouping by the values themselves skips missing keys and sorts the rest.
    counts = column.groupby(column).count()
    total = counts.sum()
    if total:
        percents = (counts / float(total) * 100).round(2)
    else:
        # Nothing to tabulate: avoid dividing by zero, keep a float column.
        percents = counts.astype(float)

    table = pd.DataFrame({'Counts': counts, 'Frequencies': percents})
    table.index.name = var
    print(table)
    return table
# Table without missing values; kept in `t` for the bar charts further down.
t = freq_table(data, 'whitemom')
# Same table, with missing values counted as their own 'NaN' category.
freq_table(data, 'whitemom', rmna=True)
## Counts Frequencies
## whitemom
## not white 284 28.46
## white 714 71.54
## Counts Frequencies
## whitemom
## NaN 2 0.2
## not white 284 28.4
## white 714 71.4
A barplot is analogous to a histogram, but for categorical variables.
In R
# Two base-graphics panels side by side for `lowbirthweight`:
# raw counts as horizontal bars, then shares of the total as vertical bars.
par(mfrow = c(1, 2))
counts <- table(nc$lowbirthweight)
percent <- prop.table(counts)
barplot(
  counts,
  width = 3,
  horiz = TRUE,
  main = "Low birth weight distribution"
)
barplot(
  percent,
  width = 6,
  main = "Probability of low birth weight"
)
Using ggplot2
# p1: count of rows per `lowbirthweight` category.
p1 <- ggplot(data = nc, mapping = aes(x = lowbirthweight)) +
  geom_bar(colour = "blue", fill = "yellow", alpha = .2) +
  labs(title = "Low birth weight distribution")
library(scales)
# p2: the same categories as a percentage of all rows, with each bar labelled
# by its formatted percentage and the axes flipped.
p2 <- ggplot(data = nc, mapping = aes(x = lowbirthweight)) +
  geom_bar(
    mapping = aes(y = round((..count..) / sum(..count..) * 100, 2)),
    fill = "yellow"
  ) +
  geom_text(
    mapping = aes(
      y = ((..count..) / sum(..count..) * 100),
      label = scales::percent((..count..) / sum(..count..))
    ),
    stat = "count",
    vjust = -0.25
  ) +
  labs(
    title = "Probability of low birth weight",
    y = "Percent",
    x = "Birth Weight"
  ) +
  coord_flip()
library(gridExtra)
# Show both plots next to each other in a single row.
grid.arrange(p1, p2, ncol = 2, nrow = 1)
In Python
# Bar charts of the frequency table `t`: counts as horizontal bars, then
# percentages as vertical bars with unrotated tick labels.
# NOTE(review): both calls draw on the current axes; if they run in the same
# cell/script the two charts will overlay -- confirm they were meant to be
# separate cells.
ax = t['Counts'].plot.barh()
ax = t['Frequencies'].plot.bar()
ax.set_xticklabels(t.index, rotation=0)
Using seaborn
import seaborn as sns
# Count plot of `whitemom`. The column is passed explicitly as `x`: a bare
# positional argument is taken as `data` by current seaborn releases and is
# no longer treated as the variable to count.
sns.countplot(x=data['whitemom']);
A pie chart is an alternative to the barplot for categorical variables; it shows each category as a share of the whole.
In R
library(plotrix)
library(dplyr)
# Bucket `visits` into the intervals [0,10], (10,20], (20,30].
nc$visits_bucket <- cut(nc$visits, breaks = seq(0, 30, 10), include.lowest = TRUE)
# cut() returns a factor. Convert it to character before filling in the
# missing bucket: ifelse() on a factor returns the integer level codes
# ("1", "2", "3"), which would replace the interval labels on the charts.
nc$visits_bucket <- as.character(nc$visits_bucket)
nc$visits_bucket[is.na(nc$visits_bucket)] <- "NA"
# One row per bucket: number of rows and share of all rows in percent.
d1 <- nc %>%
  group_by(visits_bucket) %>%
  summarise(cnt = n(), pct = n() / nrow(nc) * 100)
par(mfrow = c(1, 2))
# counts <- table(nc$visits_bucket)
# percent <- prop.table(counts)
pie(d1$cnt, labels = d1$visits_bucket, main = "Pie chart of visits")
pie3D(
  d1$cnt,
  labels = d1$visits_bucket,
  explode = 0.1,
  main = "3D Pie chart of visits"
)
Using ggplot2
# Pie chart built from a single stacked bar of `d1` wrapped into polar
# coordinates, with the legend laid out in four columns.
g1 <- ggplot(d1, aes(x = "", y = cnt, fill = visits_bucket)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y", start = 0) +
  guides(fill = guide_legend(title = "Visits", ncol = 4))
# Label each slice with its percentage. position_stack() places the text with
# the same stacking the bars use, so every label lands in its own slice;
# hand-computed cumulative offsets do not follow ggplot2's stacking order.
# Columns are referenced by name rather than through `d1$` inside aes().
g1 <- g1 +
  geom_text(aes(label = pct), position = position_stack(vjust = 0.5)) +
  ggtitle("Pie chart of visits")
# Legend on top; hide axis text, titles and ticks, which are meaningless here.
g1 <- g1 +
  theme(
    legend.position = "top",
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )
print(g1)
In Python
# Number of rows for each distinct value of `visits`.
sums = data.groupby('visits').size()
# Use the pyplot namespace imported as `plt`: bare `axis`/`pie` are never
# imported in this walkthrough and raise NameError.
plt.axis('equal');  # equal aspect ratio so the pie is drawn as a circle
plt.pie(sums, labels=sums.index);