Flow control

if… else

In R

i <- -3

# Branches are tested top to bottom; only the first matching branch runs.
if (i < 0) {
  print("Negative number")
} else if (i == 0) {
  print("Zero")
} else {
  print("Positive number")
}
## [1] "Negative number"
# `if` is an expression in R, so its value can be assigned directly.
# print() returns its argument invisibly, which is the value stored in y.
y <- if (i < 0) print("Negative number") else print("Positive number")
## [1] "Negative number"

In Python & PySpark

i = -3

# print(...) with a single argument behaves the same on Python 2 and Python 3.
if i < 0:
  print("Negative number")
elif i == 0:
  print("Zero")
else:
  print("Positive number")
## Negative number
# Conditional expression: the one-line counterpart of R's `y <- if ... else ...`.
y = "Negative number" if i < 0 else "Positive number"
print(y)
## Negative number

For loop

Iterate over an object.

In R

# Build the example data frame used by the loop examples.
a <- c(rep("A", 3), rep("B", 3), rep("C", 2))
b <- c(1, 1, 7, 4, 1, 1, 6, 6)
# R >= 4.0 no longer converts strings to factors by default; request it
# explicitly so column `a` is a factor with levels A, B and C.
df <- data.frame(a, b, stringsAsFactors = TRUE)
df
##   a b
## 1 A 1
## 2 A 1
## 3 A 7
## 4 B 4
## 5 B 1
## 6 B 1
## 7 C 6
## 8 C 6
# Iterating over a data frame visits one column per iteration.
for (i in df) {
  print(i)
}
## [1] A A A B B B C C
## Levels: A B C
## [1] 1 1 7 4 1 1 6 6
# Iterating over a single column visits its values one at a time.
for (i in df$a) {
  print(i)
}
## [1] "A"
## [1] "A"
## [1] "A"
## [1] "B"
## [1] "B"
## [1] "B"
## [1] "C"
## [1] "C"
# Add column `c` one row at a time: row number times the value in `b`.
# seq_len(nrow(df)) is safe for an empty data frame, where 1:nrow(df)
# would count down through c(1, 0).
for (i in seq_len(nrow(df))) {
  df[i, "c"] <- i * df[i, "b"]
}
print(df)
##   a b  c
## 1 A 1  1
## 2 A 1  2
## 3 A 7 21
## 4 B 4 16
## 5 B 1  5
## 6 B 1  6
## 7 C 6 42
## 8 C 6 48

In Python

import pandas as pd
import numpy as np

a = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C']
b = [1, 1, 7, 4, 1, 1, 6, 6]
# Build the frame from named columns so `b` stays an integer column; stacking
# strings and ints with np.column_stack would turn every value into a string.
# The default 0-based index is kept here.
df = pd.DataFrame({'a': a, 'b': b}, columns=['a', 'b'])
print(df)
##    a  b
## 0  A  1
## 1  A  1
## 2  A  7
## 3  B  4
## 4  B  1
## 5  B  1
## 6  C  6
## 7  C  6
# Iterating over a pandas DataFrame yields its column labels, not the columns.
for i in df:
  print(i)
## a
## b
# Iterating over a single column yields its values.
for i in df.a:
  print(i)
## A
## A
## A
## B
## B
## B
## C
## C
# Relabel the rows 1..n so that the loop below can address them by label.
df.index = list(range(1, len(df)+1))
for i in range(1, df.shape[0]+1):
  # .loc is label-based; the older .ix indexer was removed in pandas 1.0.
  df.loc[i, 'c'] = i * int(df.loc[i, 'b'])
print(df)
##    a  b   c
## 1  A  1   1
## 2  A  1   2
## 3  A  7  21
## 4  B  4  16
## 5  B  1   5
## 6  B  1   6
## 7  C  6  42
## 8  C  6  48

In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.

While loop

Loop until a specific condition is met.

In R

i <- 4

# Walk backwards from row 4 to row 1, printing column `b` of each row.
while(i > 0) {
  print(df[i, 'b'])
  i <- i - 1
}
## [1] 4
## [1] 7
## [1] 1
## [1] 1

In Python

df.index = list(range(1, len(df)+1))
i = 4

# Walk backwards from row label 4 down to 1, printing column `b`.
while i > 0:
  # .loc is label-based; the older .ix indexer was removed in pandas 1.0.
  print(df.loc[i, 'b'])
  i = i - 1
## 4
## 7
## 1
## 1

In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.

Break and next

In R

for (i in df$b) {
  if (i == 4) {
    next   # skip this value and continue with the following one
  } else if (i == 6) {
    break  # leave the loop entirely
  }
  print(i)
}
## [1] 1
## [1] 1
## [1] 7
## [1] 1
## [1] 1

In Python

for i in df.b:
  if i == 4:
    continue  # Python's counterpart of R's `next`
  elif i == 6:
    break
  print(i)
## 1
## 1
## 7
## 1
## 1

In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.

Repeat loop

Break condition is declared explicitly inside the body to exit the loop.

In R

i <- 6

# repeat has no condition of its own; the loop ends only at the explicit break.
repeat {
  print(df[i, 'b'])
  i <- i - 1
  if (i < 2) {
    break
  }
}
## [1] 1
## [1] 1
## [1] 4
## [1] 7
## [1] 1

In Python - there is no repeat loop; as a workaround, use `while True` with an explicit `if ...: break`.

i = 6

# `while True` only stops at the explicit break, so the body always runs at
# least once.
while True:
  # .loc is label-based; the older .ix indexer was removed in pandas 1.0.
  print(df.loc[i, 'b'])
  i = i - 1
  if i < 2:
    break
## 1
## 1
## 4
## 7
## 1

Functions

In R

# Print the maximum and minimum of one column of the data frame `df`.
#
# col: name of the column, as a string (e.g. "b").
# `df` is not an argument; it is looked up in the enclosing (global)
# environment when the function is called.
min_max <- function(col) {
  # df[[col]] extracts the column itself rather than a one-column data frame.
  values <- df[[col]]
  print(paste0("Max df$", col, " = ", max(values)))
  print(paste0("Min df$", col, " = ", min(values)))
}
  
min_max('b')
## [1] "Max df$b = 7"
## [1] "Min df$b = 1"

In Python

def min_max(col):
  """Print the maximum and minimum of column `col` of the global DataFrame `df`."""
  # Both lines use the Series methods and the same format string.
  print("Max df.{} = {}".format(col, df[col].max()))
  print("Min df.{} = {}".format(col, df[col].min()))
  
min_max('b')
## Max df.b = 7
## Min df.b = 1

Apply functions

| Function | R | Python | Input | Applies to | Output |
|---|---|---|---|---|---|
| apply(M, 1/2, fun) | X | X | matrix, df | row (1), col (2) | matrix |
| lapply(L, FUN) | X | | list, df | every element of a list | list |
| sapply(S, FUN) | X | | list, df | every element of a list | vector, or unlist(lapply(…)) |
| mapply(FUN, col1, col2…) | X | | matrix, df | selected columns | list |
| map(fun, seq, seq…) | | X | lists | every element of a list | list |
| filter(fun, list) | | X | list | every element of a list | list |
| reduce(fun, list) | | X | list | every element of a list | single value |

apply

In R

# rep(seq(3), 4) repeats 1, 2, 3 four times; matrix() fills the 12 values
# column by column into 3 columns, giving a 4 x 3 matrix.
M <- matrix(rep(seq(3), 4), ncol=3)
print(M)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    2    3    1
## [3,]    3    1    2
## [4,]    1    2    3
# MARGIN = 1: apply sum to each row.
apply(M, 1, sum)
## [1] 6 6 6 6
# MARGIN = 2: apply mean to each column.
apply(M, 2, mean)
## [1] 1.75 2.00 2.25
# Extra named arguments (here y = 1) are passed on to the function.
apply(M, 1, function(x, y) sum(x) + y, y=1)
## [1] 7 7 7 7
# Per column, count the entries smaller than 2.
apply(M, 2, function(x) length(x[x<2]))
## [1] 2 1 1

In Python

import pandas as pd
import numpy as np
a = [1,2,3,1]
b = [2,3,1,2]
c = [3,1,2,3]
df = pd.DataFrame(np.column_stack([a, b, c]), columns=['a', 'b', 'c'])
print(df)

# axis=1 applies the function to each row, axis=0 to each column.
# Extra keyword arguments (here y=1) are passed on to the function.
df.apply(np.sum, axis=1)
df.apply(np.mean, axis=0)
df.apply(lambda x, y : sum(x) + y, y=1, axis=1)
df.apply(lambda x: len(x[x<2]), axis=0)

lapply

In R

# A named list whose elements have different lengths (3, 6 and 10).
L <- list(a=1:3, b=5:10, c=seq(10, 100, 10))
print(L)
## $a
## [1] 1 2 3
## 
## $b
## [1]  5  6  7  8  9 10
## 
## $c
##  [1]  10  20  30  40  50  60  70  80  90 100
# lapply returns a list with one result per input element.
lapply(L, FUN=length)
## $a
## [1] 3
## 
## $b
## [1] 6
## 
## $c
## [1] 10
# Median of each list element.
lapply(L, FUN=median)
## $a
## [1] 2
## 
## $b
## [1] 7.5
## 
## $c
## [1] 55
# An anonymous function squares each element; every result keeps its own length.
lapply(L, function(x) x^2)
## $a
## [1] 1 4 9
## 
## $b
## [1]  25  36  49  64  81 100
## 
## $c
##  [1]   100   400   900  1600  2500  3600  4900  6400  8100 10000
# "[" is the subsetting function; the extra argument 3 picks the third value
# of each element.
lapply(L, "[", 3)
## $a
## [1] 3
## 
## $b
## [1] 7
## 
## $c
## [1] 30

sapply

In R

# Every result has length 1, so sapply simplifies the list to a named vector.
sapply(L, FUN=length)
##  a  b  c 
##  3  6 10
# Median of each element, simplified to a named numeric vector.
sapply(L, FUN=median)
##    a    b    c 
##  2.0  7.5 55.0
# The results have different lengths, so sapply cannot simplify and returns a list.
sapply(L, function(x) x^2)
## $a
## [1] 1 4 9
## 
## $b
## [1]  25  36  49  64  81 100
## 
## $c
##  [1]   100   400   900  1600  2500  3600  4900  6400  8100 10000
# Third value of each element, simplified to a named vector.
sapply(L, "[", 3)
##  a  b  c 
##  3  7 30

mapply

In R

# mapply walks df$b and df$c in parallel: x takes the values of b, y those of c.
df$d <- mapply(function(x, y) y/x, df$b, df$c)
print(df)
##   a b  c d
## 1 A 1  1 1
## 2 A 1  2 2
## 3 A 7 21 3
## 4 B 4 16 4
## 5 B 1  5 5
## 6 B 1  6 6
## 7 C 6 42 7
## 8 C 6 48 8
# Raise x to the power y.
# NOTE: this definition masks base R's exp() for the rest of the session.
exp <- function(x, y) {
  # Return the value directly rather than through a local assignment, so the
  # result is visible when the function is called interactively.
  x^y
}
# Element-wise power: x takes the values of df$b, y those of df$d.
df$e <- mapply(exp, df$b, df$d)
print(df)
##   a b  c d       e
## 1 A 1  1 1       1
## 2 A 1  2 2       1
## 3 A 7 21 3     343
## 4 B 4 16 4     256
## 5 B 1  5 5       1
## 6 B 1  6 6       1
## 7 C 6 42 7  279936
## 8 C 6 48 8 1679616

map

In Python

temp = (39.2, 36.5, 37.3, 37.8)
# Celsius to Fahrenheit. list(...) materialises the result: map returns a
# list on Python 2 but a lazy iterator on Python 3.
F = list(map(lambda x: (float(9)/5)*x + 32, temp))
print(F)
## [102.56, 97.7, 99.14, 100.03999999999999]

In Python

a = [1,2,3,4]
b = [5,6,7,8]
c = [-1,-2,-3,-4]
# With several sequences, map passes one element of each to the function.
# list(...) is needed on Python 3, where map returns a lazy iterator.
print(list(map(lambda x,y,z: x+y+z, a,b,c)))
## [5, 6, 7, 8]

filter

In Python

fib = [0,1,1,2,3,5,8,13,21,34,55]
# x % 2 is non-zero (truthy) for odd numbers, so only those are kept.
# list(...) is needed on Python 3, where filter returns a lazy iterator.
print(list(filter(lambda x: x % 2, fib)))
## [1, 1, 3, 5, 13, 21, 55]

reduce

In Python

# reduce is a built-in on Python 2 only; importing it from functools works on
# both Python 2.6+ and Python 3.
from functools import reduce

# Running sum: the lambda combines the accumulated value with the next item.
print(reduce(lambda x,y: x+y, [47,11,42,13]))
## 113
# Running maximum: keep whichever of the two values is larger.
print(reduce(lambda a,b: a if (a > b) else b, [47,11,42,102,13]))
## 102