A data frame collects multiple vectors (R) or Series (Pandas) the same way that a spreadsheet collects multiple columns of data. Spark DataFrames are built on RDDs; RDDs are immutable structures and do not allow elements to be updated in place, and Spark DataFrame columns are allowed to share the same name.
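A quick PySpark sketch of those two points (it assumes a SparkSession named spark is available):
df = spark.createDataFrame([(1, 2)], ["a", "b"])
df2 = df.toDF("a", "a") # duplicate column names are allowed
print df2.columns # ['a', 'a']
print df.columns # df itself is unchanged - DataFrames are immutable: ['a', 'b']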
In R
df <- data.frame()
In Python
import pandas as pd
df = pd.DataFrame()
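In PySpark - a sketch; an empty DataFrame needs an explicit (here empty) schema, and a SparkSession named spark is assumed
from pyspark.sql.types import StructType
df = spark.createDataFrame([], StructType([]))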
In R
num <- c(1, 2, 3, 4, 5)
double <- c(1.1, 2.2, 3.3, 4.4, 5.5)
str <- c('one', 'two', 'three', 'four', 'five')
bool <- c(TRUE, FALSE, TRUE, FALSE, TRUE)
df <- data.frame(num, double, str, bool, stringsAsFactors = FALSE)
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 2 2.2 two FALSE
## 3 3 3.3 three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 five TRUE
In Python
import numpy as np
num = [1, 2, 3, 4, 5]
double = [1.1, 2.2, 3.3, 4.4, 5.5]
string = ['one', 'two', 'three', 'four', 'five']
boolean = [True, False, True, False, True]
df = pd.DataFrame(np.column_stack([num, double, string, boolean]), columns=['num', 'double', 'string', 'boolean'])
print df
## num double string boolean
## 0 1 1.1 one True
## 1 2 2.2 two False
## 2 3 3.3 three True
## 3 4 4.4 four False
## 4 5 5.5 five True
In PySpark
df = spark.createDataFrame([(1, 1.1, 'one', 'TRUE'), (2, 2.2, 'two', 'FALSE'), (3, 3.3, 'three', 'TRUE'), (4, 4.4, 'four', 'FALSE'), (5, 5.5, 'five', 'TRUE')], ["num", "double", "str", "bool"])
df.show()
In R
names(df)[names(df)=='double'] <- 'float'
print(names(df))
## [1] "num" "float" "str" "bool"
or
names(df) <- c('num', 'double', 'string', 'boolean')
print(names(df))
## [1] "num" "double" "string" "boolean"
In Python
df = df.rename(columns = {'double': 'float'})
print df.columns
## Index([u'num', u'float', u'string', u'boolean'], dtype='object')
or
df.columns = ['num', 'double', 'string', 'boolean']
print df.columns
## Index([u'num', u'double', u'string', u'boolean'], dtype='object')
In PySpark
print "1. Before rename:", df.schema.names
df = df.withColumnRenamed("double", "float").withColumnRenamed("bool", "boolean")
print "2. After rename:", df.schema.names
print "OR"
oldColumns = df.schema.names
print "3. Before rename:", oldColumns
newColumns = ["num", "double", "string", "bool"]
df = reduce(lambda df, idx: df.withColumnRenamed(oldColumns[idx], newColumns[idx]), xrange(len(oldColumns)), df)
print "4. After rename:", df.schema.names
In R
df <- df[c('boolean', 'double', 'num', 'string')]
print(df)
## boolean double num string
## 1 TRUE 1.1 1 one
## 2 FALSE 2.2 2 two
## 3 TRUE 3.3 3 three
## 4 FALSE 4.4 4 four
## 5 TRUE 5.5 5 five
In Python
df = df[['boolean', 'double', 'num', 'string']]
print df
## boolean double num string
## 0 True 1.1 1 one
## 1 False 2.2 2 two
## 2 True 3.3 3 three
## 3 False 4.4 4 four
## 4 True 5.5 5 five
In PySpark
from pyspark.sql.functions import *
df = df.select(col("num"), col("string"), col("double").alias("float"), col("bool").alias("str"))
print "After rename:", df.schema.names
When you print a data frame, the leftmost column without a header is the index; by default it is numeric and starts from 1 in R but 0 in Python. Indexes are not applicable to Spark DataFrames.
In R - index is also known as rownames
rownames(df)
## [1] "1" "2" "3" "4" "5"
In Python
print df.index
## Int64Index([0, 1, 2, 3, 4], dtype='int64')
In R
rownames(df) <- 0:(nrow(df)-1) # Make it like pandas, index starts from 0
print(rownames(df))
## [1] "0" "1" "2" "3" "4"
rownames(df) <- unlist(df['string'])
print(rownames(df))
## [1] "one" "two" "three" "four" "five"
In Python
df.index = list(range(1, len(df)+1)) # Make it like R, index starts from 1
print df.index
## Int64Index([1, 2, 3, 4, 5], dtype='int64')
df = df.set_index('string')
print df.index
## Index([u'one', u'two', u'three', u'four', u'five'], dtype='object')
or
df.set_index('string', inplace=True) # same as the assignment above, but modifies df in place
print df
## boolean double num
## string
## one True 1.1 1
## two False 2.2 2
## three True 3.3 3
## four False 4.4 4
## five True 5.5 5
In Python
reset_index() moves the index back into a regular column; with the parameter drop=True the old index is discarded and the index is reset to 0, 1, 2, 3, ...
df.reset_index(level=0, drop=False, inplace=True)
print df
## string boolean double num
## 0 one True 1.1 1
## 1 two False 2.2 2
## 2 three True 3.3 3
## 3 four False 4.4 4
## 4 five True 5.5 5
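In PySpark - there is no row index, but as a rough stand-in (a sketch) you can attach a generated id column; the ids are increasing and unique, not necessarily consecutive
from pyspark.sql.functions import monotonically_increasing_id
df.withColumn("index", monotonically_increasing_id()).show()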
There are three choices for indexing in Pandas: .loc (label-based), .iloc (integer position-based) and .ix (which tries labels first, then falls back to positions).
One confusing part about iloc and ix is that ix acts like R slicing, so the endpoint is inclusive. I’ve changed the columns from text to numeric to demonstrate the difference in the example below.
num = [1, 2, 3, 4, 5]
double = [1.1, 2.2, 3.3, 4.4, 5.5]
string = ['one', 'two', 'three', 'four', 'five']
boolean = [True, False, True, False, True]
df = pd.DataFrame(np.column_stack([num, double, string, boolean]), columns=[11, 22, 33, 44])
print df
## 11 22 33 44
## 0 1 1.1 one True
## 1 2 2.2 two False
## 2 3 3.3 three True
## 3 4 4.4 four False
## 4 5 5.5 five True
print df.iloc[1:3, 0:2]
## 11 22
## 1 2 2.2
## 2 3 3.3
vs.
print df.ix[1:3, 11:33]
## 11 22 33
## 1 2 2.2 two
## 2 3 3.3 three
## 3 4 4.4 four
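For completeness, the label-based loc accessor (a sketch against the same df): like ix, label slicing with loc is endpoint-inclusive, so the result matches the ix example above.
print df.loc[1:3, 11:33]
## 11 22 33
## 1 2 2.2 two
## 2 3 3.3 three
## 3 4 4.4 four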
In R
df$neg <- c(-1, -2, -3, -4, -5)
print(df)
## num double str bool neg
## 1 1 1.1 one TRUE -1
## 2 2 2.2 two FALSE -2
## 3 3 3.3 three TRUE -3
## 4 4 4.4 four FALSE -4
## 5 5 5.5 five TRUE -5
In Python
df['neg'] = [-1, -2, -3, -4, -5]
print df
## num double string boolean neg
## 0 1 1.1 one True -1
## 1 2 2.2 two False -2
## 2 3 3.3 three True -3
## 3 4 4.4 four False -4
## 4 5 5.5 five True -5
In PySpark
from pyspark.sql import functions as F
df = df.withColumn('neg', F.lit(-1))
df.show()
df.select('*', (df.num % 2 == 0).alias("IsEven"), F.round(F.sqrt(df.num), 2).alias("Sqrt")).show()
In R
df$neg <- NULL
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 2 2.2 two FALSE
## 3 3 3.3 three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 five TRUE
In Python
There are three ways to delete columns in Pandas:
del df['num']
print df
## double string boolean neg
## 0 1.1 one True -1
## 1 2.2 two False -2
## 2 3.3 three True -3
## 3 4.4 four False -4
## 4 5.5 five True -5
or
df.pop('neg')
print df
## double string boolean
## 0 1.1 one True
## 1 2.2 two False
## 2 3.3 three True
## 3 4.4 four False
## 4 5.5 five True
or
print df.drop(['string', 'boolean'], axis=1)
## double
## 0 1.1
## 1 2.2
## 2 3.3
## 3 4.4
## 4 5.5
In PySpark
df = df.drop(df.neg)
df.show()
Columns are selectable using column names or index slices.
In R
df['num'] # return data frame
## num
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
or
df$num # return double
## [1] 1 2 3 4 5
In Python
df['num'] # return pandas.core.series.Series
## 0 1
## 1 2
## 2 3
## 3 4
## 4 5
## Name: num, dtype: object
or
df.num # return pandas.core.series.Series
## 0 1
## 1 2
## 2 3
## 3 4
## 4 5
## Name: num, dtype: object
In PySpark
df.select("num").show()
df.select(df.float).show()
A slice in R includes the endpoint (start:end, end included), whereas in Python the slice endpoint is excluded (start:stop, stop excluded). 1:3 in R covers columns 1, 2 and 3, but a Python slice 1:3 covers only positions 1 and 2.
In R
df[c('double', 'str')]
## double str
## 1 1.1 one
## 2 2.2 two
## 3 3.3 three
## 4 4.4 four
## 5 5.5 five
or
df[2:3]
## double str
## 1 1.1 one
## 2 2.2 two
## 3 3.3 three
## 4 4.4 four
## 5 5.5 five
In Python
df[['double', 'string']]
## double string
## 0 1.1 one
## 1 2.2 two
## 2 3.3 three
## 3 4.4 four
## 4 5.5 five
or
df[df.columns[1:3]]
## double string
## 0 1.1 one
## 1 2.2 two
## 2 3.3 three
## 3 4.4 four
## 4 5.5 five
In PySpark
df.select(df.num, "float").show()
Rows are selectable using numerical slices or logical selection.
In R
df[2:4, ]
## num double str bool
## 2 2 2.2 two FALSE
## 3 3 3.3 three TRUE
## 4 4 4.4 four FALSE
or
df[df$double < 4, ]
## num double str bool
## 1 1 1.1 one TRUE
## 2 2 2.2 two FALSE
## 3 3 3.3 three TRUE
In Python
df[1:4]
## num double string boolean
## 1 2 2.2 two False
## 2 3 3.3 three True
## 3 4 4.4 four False
or
df.num = df['num'].astype(np.int)
df.double = df['double'].astype(np.float)
df[df['double'] < 4]
## num double string boolean
## 0 1 1.1 one True
## 1 2 2.2 two False
## 2 3 3.3 three True
A major difference between R and Pandas is what a single-bracket numeric slice selects: columns in R, rows in Pandas. df[1:3] returns columns 1, 2 and 3 in R, whereas df[1:3] returns rows 1 and 2 in Pandas.
In PySpark
df.filter(df.float < 4).show()
For data frame element slicing, columns are selected by name and rows by numeric or logical selectors; the selection notation is [row-selector, column-selector] in both R and Pandas. Indexing starts from 1 in R but 0 in Python!
In R
df[2, 'str']
## [1] "two"
In Python
df.ix[1, 'string']
## two
In PySpark - not applicable
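A loose workaround (a sketch, not an equivalent): collect the rows to the driver and index the result. This only makes sense here because the example data is tiny and its order happens to be preserved.
row = df.collect()[1]
print row['string']
## two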
In R
df[df$double < 4 & df$num < 3, c('str', 'bool')]
## str bool
## 1 one TRUE
## 2 two FALSE
In Python
df.ix[(df['double'] < 4) & (df['num'] < 3), ['string', 'boolean']]
## string boolean
## 0 one True
## 1 two False
In PySpark
df.filter((df.float < 4) & (df.num < 3)).select("string", "str").show()
In R
df[df$num==2,'num'] <- 4
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 4 2.2 two FALSE
## 3 3 3.3 three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 five TRUE
In Python
df.num = df['num'].astype(np.int)
df.ix[(df['num'] == 2), 'num'] = 4
print df
## num double string boolean
## 0 1 1.1 one True
## 1 4 2.2 two False
## 2 3 3.3 three True
## 3 4 4.4 four False
## 4 5 5.5 five True
In PySpark
from pyspark.sql import functions as F
df = df.withColumn("num", F.when(df.num==2, 4).otherwise(df.num))
df.show()
The is.na function in R and the pd.isnull function in Pandas work pretty much the same: both take a data frame, list or value and return a boolean data frame, list or value indicating whether each value is NA.
In R
num = c(1, NA, 3, 4, 5, NaN)
double = c(1.1, 2.2, NaN, 4.4, 5.5, NaN)
str = c('one', 'two', 'three', 'four', NA, NA)
bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, NA)
df <- data.frame(num, double, str, bool, stringsAsFactors=FALSE)
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 NA 2.2 two FALSE
## 3 3 NaN three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 <NA> TRUE
## 6 NaN NaN <NA> NA
is.na(df)
## num double str bool
## [1,] FALSE FALSE FALSE FALSE
## [2,] TRUE FALSE FALSE FALSE
## [3,] FALSE TRUE FALSE FALSE
## [4,] FALSE FALSE FALSE FALSE
## [5,] FALSE FALSE TRUE FALSE
## [6,] TRUE TRUE TRUE TRUE
is.na(df['num'])
## num
## [1,] FALSE
## [2,] TRUE
## [3,] FALSE
## [4,] FALSE
## [5,] FALSE
## [6,] TRUE
is.na(df[1, 'num'])
## [1] FALSE
In Python
num = [1, np.nan, 3, 4, 5, np.nan]
double = [1.1, 2.2, np.nan, 4.4, 5.5, np.nan]
string = ['one', 'two', 'three', 'four', None, np.nan]
boolean = [True, False, True, False, True, np.nan]
df = pd.DataFrame(np.column_stack([num, double, string, boolean]), columns=['num', 'double', 'string', 'boolean'])
print df
## num double string boolean
## 0 1 1.1 one 1
## 1 NaN 2.2 two 0
## 2 3 NaN three 1
## 3 4 4.4 four 0
## 4 5 5.5 None 1
## 5 NaN NaN NaN NaN
print pd.isnull(df)
## num double string boolean
## 0 False False False False
## 1 True False False False
## 2 False True False False
## 3 False False False False
## 4 False False True False
## 5 True True True True
print pd.isnull(df['num'])
## 0 False
## 1 True
## 2 False
## 3 False
## 4 False
## 5 True
## Name: num, dtype: bool
print pd.isnull(df.ix[0, 'num'])
## False
In PySpark
df = spark.createDataFrame([(1, 1.1, 'one', 'TRUE'), (None, 2.2, 'two', 'FALSE'), (3, None, 'three', 'TRUE'), (4, 4.4, None, 'FALSE'), (5, 5.5, 'five', 'TRUE'), (None, None, None, None)], ["num", "double", "str", "bool"])
from pyspark.sql.functions import when, col, lit, isnull
def replace(column, value):
    return when(column != value, column).otherwise(lit(None))
df = df.withColumn("bool", replace(col("bool"), 'FALSE'))
df.show()
df.select(isnull("num").alias("r1"), isnull(df.double).alias("r2")).collect()
In R
df[complete.cases(df), ]
## num double str bool
## 1 1 1.1 one TRUE
## 4 4 4.4 four FALSE
In Python
dropna() drops rows containing any NaN or null values. The parameter how='all' drops only rows in which every value is NaN or null; add the parameter axis=1 to drop columns with missing values instead of rows.
df = df.dropna(how='all')
print df
## num double string boolean
## 0 1 1.1 one 1
## 1 NaN 2.2 two 0
## 2 3 NaN three 1
## 3 4 4.4 four 0
## 4 5 5.5 None 1
or
print df.dropna()
## num double string boolean
## 0 1 1.1 one 1
## 3 4 4.4 four 0
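The axis=1 variant mentioned above is not shown; a quick sketch on the same df (after the how='all' drop, every column except boolean still contains a missing value, so only boolean survives):
print df.dropna(axis=1)
## boolean
## 0 1
## 1 0
## 2 1
## 3 0
## 4 1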
In PySpark
df.na.drop(how='all').show()
df.na.drop(how='any').show()
In R
num = c(1, NA, 3, 4, 5, NaN)
double = c(1.1, 2.2, NaN, 4.4, 5.5, NaN)
str = c('one', 'two', 'three', 'four', NA, NA)
bool = c(TRUE, FALSE, TRUE, FALSE, TRUE, NA)
df <- data.frame(num, double, str, bool, stringsAsFactors=FALSE)
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 NA 2.2 two FALSE
## 3 3 NaN three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 <NA> TRUE
## 6 NaN NaN <NA> NA
df[is.na(df)] <- 'new'
print(df)
## num double str bool
## 1 1 1.1 one TRUE
## 2 new 2.2 two FALSE
## 3 3 new three TRUE
## 4 4 4.4 four FALSE
## 5 5 5.5 new TRUE
## 6 new new new new
In Python
num = [1, np.nan, 3, 4, 5, np.nan]
double = [1.1, 2.2, np.nan, 4.4, 5.5, np.nan]
string = ['one', 'two', 'three', 'four', None, np.nan]
boolean = [True, False, True, False, True, np.nan]
df = pd.DataFrame(np.column_stack([num, double, string, boolean]), columns=['num', 'double', 'string', 'boolean'])
print df
## num double string boolean
## 0 1 1.1 one 1
## 1 NaN 2.2 two 0
## 2 3 NaN three 1
## 3 4 4.4 four 0
## 4 5 5.5 None 1
## 5 NaN NaN NaN NaN
print df.fillna('new')
## num double string boolean
## 0 1 1.1 one 1
## 1 new 2.2 two 0
## 2 3 new three 1
## 3 4 4.4 four 0
## 4 5 5.5 new 1
## 5 new new new new
In PySpark
df.na.fill(50).show()
df.na.fill({'num': 5, 'bool': 'TRUE'}).show()
In R
a <- c(rep("A", 3), rep("B", 3), rep("C",2))
b <- c(1, 1, 7, 4, 1, 1, 6, 6)
df <- data.frame(a,b)
df
## a b
## 1 A 1
## 2 A 1
## 3 A 7
## 4 B 4
## 5 B 1
## 6 B 1
## 7 C 6
## 8 C 6
duplicated(df$a)
## [1] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
df[duplicated(df), ]
## a b
## 2 A 1
## 6 B 1
## 8 C 6
In Python
import pandas as pd
import numpy as np
a = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C']
b = [1, 1, 7, 4, 1, 1, 6, 6]
df = pd.DataFrame(np.column_stack([a, b]), columns=['a', 'b'])
print df
## a b
## 0 A 1
## 1 A 1
## 2 A 7
## 3 B 4
## 4 B 1
## 5 B 1
## 6 C 6
## 7 C 6
print df.duplicated('a')
## 0 False
## 1 True
## 2 True
## 3 False
## 4 True
## 5 True
## 6 False
## 7 True
## dtype: bool
print df[df.duplicated()]
## a b
## 1 A 1
## 5 B 1
## 7 C 6
In PySpark - not applicable
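There is no direct counterpart of duplicated(), but a rough workaround (a sketch, using the same toy data as below) flags every row after the first within each group via a window function; note the within-group order comes from b, since Spark rows have no inherent order:
from pyspark.sql import Window
from pyspark.sql.functions import row_number
df = spark.createDataFrame([('A', 1), ('A', 1), ('A', 7), ('B', 4), ('B', 1), ('B', 1), ('C', 6), ('C', 6)], ["a", "b"])
w = Window.partitionBy("a").orderBy("b")
df.withColumn("duplicated", row_number().over(w) > 1).show()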
In R
unique(df)
## a b
## 1 A 1
## 3 A 7
## 4 B 4
## 5 B 1
## 7 C 6
In Python
print df.drop_duplicates()
## a b
## 0 A 1
## 2 A 7
## 3 B 4
## 4 B 1
## 6 C 6
In PySpark
df = spark.createDataFrame([('A', 1), ('A', 1), ('A', 7), ('B', 4), ('B', 1), ('B', 1), ('C', 6), ('C', 6)], ["a", "b"])
df.show()
df.dropDuplicates(['a']).show()
df.dropDuplicates().show()