In R
# Classify the sign of `i` with an if / else if / else chain.
i <- -3
if (i < 0) {
  print("Negative number")
} else if (i == 0) {  # the original tested `u`, which is never defined
  print("Zero")
} else {
  print("Positive number")
}
## [1] "Negative number"
# `if` is an expression in R: the value of the branch that runs is assigned
# to `y` (print() returns its argument, so `y` holds the printed string).
y <- if (i < 0) {
  print("Negative number")
} else {
  print("Positive number")
}
## [1] "Negative number"
In Python & PySpark
# Classify the sign of i with an if / elif / else chain.
i = -3
if i < 0:
    print("Negative number")
elif i == 0:  # the original tested `u`, which is never defined
    print("Zero")
else:
    print("Positive number")
## Negative number
# Conditional expression: picks one of the two strings based on the sign of i.
y = 'Negative number' if i < 0 else "Positive number"
print(y)  # print() with one argument behaves the same on Python 2 and 3
## Negative number
Iterate over an object.
In R
# Build the example data frame: a group label column `a` and a numeric
# column `b`, eight rows in total.
a <- rep(c("A", "B", "C"), times = c(3, 3, 2))
b <- c(1, 1, 7, 4, 1, 1, 6, 6)
df <- data.frame(a = a, b = b)
df
## a b
## 1 A 1
## 2 A 1
## 3 A 7
## 4 B 4
## 5 B 1
## 6 B 1
## 7 C 6
## 8 C 6
# Looping over a data frame visits its columns: each `i` is one whole column.
for (i in df) {
  print(i)
}
## [1] A A A B B B C C
## Levels: A B C
## [1] 1 1 7 4 1 1 6 6
# Looping over a single column visits its values one at a time.
for (i in df$a) {
  print(i)
}
## [1] "A"
## [1] "A"
## [1] "A"
## [1] "B"
## [1] "B"
## [1] "B"
## [1] "C"
## [1] "C"
# Fill a new column `c` row by row: c = row number * b.
# seq_len(nrow(df)) is used instead of 1:dim(df)[1] so that an empty data
# frame yields zero iterations (1:0 would iterate over 1 and 0).
# A vectorized equivalent is: df$c <- seq_len(nrow(df)) * df$b
for (i in seq_len(nrow(df))) {
  df[i, "c"] <- i * df[i, "b"]
}
print(df)
## a b c
## 1 A 1 1
## 2 A 1 2
## 3 A 7 21
## 4 B 4 16
## 5 B 1 5
## 6 B 1 6
## 7 C 6 42
## 8 C 6 48
In Python
import pandas as pd
import numpy as np
# Build the example frame: a label column 'a' and an integer column 'b'.
a = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C']
b = [1, 1, 7, 4, 1, 1, 6, 6]
# Constructing from a dict keeps 'b' as integers; np.column_stack on mixed
# lists turns every value into a string and needs a cast back afterwards.
# The default 0-based index is kept, matching the printed output.
df = pd.DataFrame({'a': a, 'b': b}, columns=['a', 'b'])
print(df)
## a b
## 0 A 1
## 1 A 1
## 2 A 7
## 3 B 4
## 4 B 1
## 5 B 1
## 6 C 6
## 7 C 6
# Looping over a DataFrame yields its column labels, not its rows.
for i in df:
    print(i)
## a
## b
# Looping over a single column yields its values one at a time.
for i in df.a:
    print(i)
## A
## A
## A
## B
## B
## B
## C
## C
# Relabel the rows 1..n so the row labels line up with the R example.
df.index = list(range(1, len(df)+1))
# Fill a new column 'c' row by row: c = row label * b.
# .loc does label-based access; the .ix indexer used originally was
# deprecated and later removed from pandas.
for i in range(1, df.shape[0]+1):
    df.loc[i, 'c'] = i * int(df.loc[i, 'b'])
print(df)
## a b c
## 1 A 1 1
## 2 A 1 2
## 3 A 7 21
## 4 B 4 16
## 5 B 1 5
## 6 B 1 6
## 7 C 6 42
## 8 C 6 48
In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.
Loop until a specific condition is met.
In R
# Walk from row 4 up to row 1, printing column `b` of each row.
# The loop ends as soon as the counter reaches 0.
i <- 4
while (i > 0) {
  print(df[i, "b"])
  i <- i - 1
}
## [1] 4
## [1] 7
## [1] 1
## [1] 1
In Python
# Relabel the rows 1..n so the row labels line up with the R example.
df.index = list(range(1, len(df)+1))
# Walk from row label 4 up to 1, printing column 'b' of each row.
# .loc replaces the .ix indexer, which was removed from pandas.
i = 4
while i > 0:
    print(df.loc[i, 'b'])
    i = i - 1
## 4
## 7
## 1
## 1
In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.
In R
# Print the values of column `b`, skipping every 4 and stopping at the
# first 6.
for (i in df$b) {
  if (i == 4) next   # skip this value, continue with the following one
  if (i == 6) break  # leave the loop entirely
  print(i)
}
## [1] 1
## [1] 1
## [1] 7
## [1] 1
## [1] 1
In Python
# Print the values of column 'b', skipping every 4 and stopping at the
# first 6.
for i in df.b:
    if i == 4:
        continue  # skip this value, continue with the following one
    elif i == 6:
        break  # leave the loop entirely
    print(i)
## 1
## 1
## 7
## 1
## 1
In PySpark - Not applicable. DataFrames, like other distributed data structures, are not iterable.
Break condition is declared explicitly inside the body to exit the loop.
In R
# `repeat` has no condition of its own: the body runs until `break` is hit.
# Prints column `b` for rows 6 down to 2.
i <- 6
repeat {
  print(df[i, "b"])
  i <- i - 1
  if (i < 2) break
}
## [1] 1
## [1] 1
## [1] 4
## [1] 7
## [1] 1
In Python - there is no repeat loop; as a workaround, use `while True:` with an `if` ... `break` inside the body
# Emulate R's `repeat`: loop forever and leave explicitly with `break`.
# Prints column 'b' for row labels 6 down to 2.
# .loc replaces the .ix indexer, which was removed from pandas.
i = 6
while True:
    print(df.loc[i, 'b'])
    i = i - 1
    if i < 2:
        break
## 1
## 1
## 4
## 7
## 1
In R
# Print the maximum and minimum of one column of the global data frame `df`.
#
# col: name of the column, given as a string (e.g. "b").
# The two messages are printed; nothing is shown for the return value
# because print() returns its argument invisibly.
min_max <- function(col) {
  values <- df[[col]]  # [[ ]] extracts the column vector itself
  print(paste0("Max df$", col, " = ", max(values)))
  print(paste0("Min df$", col, " = ", min(values)))
}
min_max('b')
## [1] "Max df$b = 7"
## [1] "Min df$b = 1"
In Python
def min_max(col):
    """Print the maximum and minimum of column ``col`` of the global ``df``.

    col: column label, e.g. 'b'.
    Both lines use the same format string and the Series' own
    ``max()`` / ``min()`` methods.
    """
    print("Max df.{} = {}".format(col, df[col].max()))
    print("Min df.{} = {}".format(col, df[col].min()))
min_max('b')
## Max df.b = 7
## Min df.b = 1
functions | R | Python | input | apply to | output |
---|---|---|---|---|---|
apply(M, 1/2, fun) | X | X | matrix, df | rows (1) or columns (2) | matrix |
lapply(L, FUN) | X | | list, df | every element of a list | list |
sapply(S, FUN) | X | | list, df | every element of a list | vector or unlist(lapply(…)) |
mapply(FUN, col1, col2…) | X | | matrix, df | selected columns | list |
map(fun, seq, seq…) | | X | lists | every element of list | list |
filter(fun, list) | | X | list | every element of a list | list |
reduce(fun, list) | | X | list | every element of a list | single value |
In R
M <- matrix(rep(seq(3), 4), ncol=3)
print(M)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 2 3 1
## [3,] 3 1 2
## [4,] 1 2 3
apply(M, 1, sum)
## [1] 6 6 6 6
apply(M, 2, mean)
## [1] 1.75 2.00 2.25
apply(M, 1, function(x, y) sum(x) + y, y=1)
## [1] 7 7 7 7
apply(M, 2, function(x) length(x[x<2]))
## [1] 2 1 1
In Python
import pandas as pd
import numpy as np
a = [1, 2, 3, 1]
b = [2, 3, 1, 2]
c = [3, 1, 2, 3]
# Stack the three lists as the columns of a 4 x 3 frame.
df = pd.DataFrame(np.column_stack([a, b, c]), columns=['a', 'b', 'c'])
print(df)
df.apply(np.sum, axis=1)  # axis=1: apply the function to each row
df.apply(np.mean, axis=0)  # axis=0: apply the function to each column
df.apply(lambda x, y: sum(x) + y, y=1, axis=1)  # y=1 is forwarded to the lambda
df.apply(lambda x: len(x[x < 2]), axis=0)  # per column: count of values below 2
In R
L <- list(a=1:3, b=5:10, c=seq(10, 100, 10))
print(L)
## $a
## [1] 1 2 3
##
## $b
## [1] 5 6 7 8 9 10
##
## $c
## [1] 10 20 30 40 50 60 70 80 90 100
lapply(L, FUN=length)
## $a
## [1] 3
##
## $b
## [1] 6
##
## $c
## [1] 10
lapply(L, FUN=median)
## $a
## [1] 2
##
## $b
## [1] 7.5
##
## $c
## [1] 55
lapply(L, function(x) x^2)
## $a
## [1] 1 4 9
##
## $b
## [1] 25 36 49 64 81 100
##
## $c
## [1] 100 400 900 1600 2500 3600 4900 6400 8100 10000
lapply(L, "[", 3)
## $a
## [1] 3
##
## $b
## [1] 7
##
## $c
## [1] 30
In R
sapply(L, FUN=length)
## a b c
## 3 6 10
sapply(L, FUN=median)
## a b c
## 2.0 7.5 55.0
sapply(L, function(x) x^2)
## $a
## [1] 1 4 9
##
## $b
## [1] 25 36 49 64 81 100
##
## $c
## [1] 100 400 900 1600 2500 3600 4900 6400 8100 10000
sapply(L, "[", 3)
## a b c
## 3 7 30
In R
df$d <- mapply(function(x, y) y/x, df$b, df$c)
print(df)
## a b c d
## 1 A 1 1 1
## 2 A 1 2 2
## 3 A 7 21 3
## 4 B 4 16 4
## 5 B 1 5 5
## 6 B 1 6 6
## 7 C 6 42 7
## 8 C 6 48 8
# Raise `x` to the power `y`.
# NOTE: this definition masks base::exp() for the rest of the session; the
# name is kept because the mapply() call that follows refers to it.
# The power is returned directly instead of being assigned to a local
# variable, so the result is visible when the function is called directly.
exp <- function(x, y) {
  x^y
}
df$e <- mapply(exp, df$b, df$d)
print(df)
## a b c d e
## 1 A 1 1 1 1
## 2 A 1 2 2 1
## 3 A 7 21 3 343
## 4 B 4 16 4 256
## 5 B 1 5 5 1
## 6 B 1 6 6 1
## 7 C 6 42 7 279936
## 8 C 6 48 8 1679616
In Python
temp = (39.2, 36.5, 37.3, 37.8)
F = map(lambda x: (float(9)/5)*x + 32, temp)
print F
## [102.56, 97.7, 99.14, 100.03999999999999]
In Python
# map() with several sequences passes one element of each to the function.
a = [1, 2, 3, 4]
b = [5, 6, 7, 8]
c = [-1, -2, -3, -4]
# list(...) materializes the result, since map() is lazy on Python 3.
print(list(map(lambda x, y, z: x + y + z, a, b, c)))
## [5, 6, 7, 8]
In Python
# filter() keeps the elements for which the function returns a truthy value;
# x % 2 is non-zero exactly for odd numbers.
fib = [0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55]
# list(...) materializes the result, since filter() is lazy on Python 3.
print(list(filter(lambda x: x % 2, fib)))
## [1, 1, 3, 5, 13, 21, 55]
In Python
# reduce() is a builtin only on Python 2; importing it from functools works
# on both Python 2.6+ and Python 3.
from functools import reduce

# Fold the list into one value by repeatedly adding pairs.
print(reduce(lambda x, y: x + y, [47, 11, 42, 13]))
## 113
# Fold the list into its maximum by keeping the larger of each pair.
print(reduce(lambda a, b: a if (a > b) else b, [47, 11, 42, 102, 13]))
## 102