String

number of characters

In R

x <- "1234567890"
nchar(x)
## [1] 10

In Python & PySpark

x = "1234567890"
len(x)

substring

In R

print (substr(x, 3, 3))
## [1] "3"
print (substring(x, 5, 7))
## [1] "567"

In Python & PySpark

print x[2:3]
print x[4:7]
## 3
## 567

substitute string

In R

sub('123', '567', x)      # Changes only the 1st pattern match per string
## [1] "5674567890"
x = gsub('567', 123, x)   # Changes every occurrence of a pattern match
print (x)
## [1] "1234123890"

In Python & PySpark

print x.replace('123', '567')  # Changes every occurrence of a pattern match
x = x.replace('567', '123')
print x
## 5674567890
## 1234123890

string assignment

In R

substr(x, 4, 4) <- "A"
print (x)
## [1] "123A123890"
substring(x, 9, 12) <- "ABCD"
print (x)
## [1] "123A1238AB"

In Python & PySpark - string cannot be mutated

locate string

In R

print (unlist(regexpr('123', x)[1]))    # Returns position of 1st match in a string
## [1] 1
print (unlist(gregexpr('123', x)))      # Returns positions of every match in a string
## [1] 1 5

In Python & PySpark

print x
print x.find('123')         # Returns position of 1st match in a string
print x.find('89', 0, 7)

import re
print [(a.start(), a.end()) for a in list(re.finditer('123', x))]    # Returns positions of every match in a string
## 1234567890
## 0
## -1
## [(0, 3)]

concatenate string

In R

print (paste("123", "456", sep=""))
## [1] "123456"
print (paste("123", "456", sep=","))
## [1] "123,456"

In Python & PySpark

print "123" + "456"
print "123" + "," + "456"
## 123456
## 123,456

split string

In R

unlist(strsplit(x, "A"))
## [1] "123"  "1238" "B"

In Python & PySpark

x = "123A1238AB"
print x.split("A")     # split all occurrences
print x.split("A", 1)  # limit to specific number of splits 
## ['123', '1238', 'B']
## ['123', '1238AB']

upper/lower case

In R

y = 'aBCDE'
toupper(substring(y, 1, 1))
## [1] "A"
tolower(substring(y, 2, nchar(y)))
## [1] "bcde"

In Python & PySpark

y = 'abCDE'
print y.upper()
print y.lower()
## ABCDE
## abcde

Apply string function to dataframe columns

In R

name <- c('Apple', 'Bill', 'Chris', 'Daniel', 'Eric')
birthyear <- c(2012, 2012, 2013, 2014, 2014)
df <- data.frame(name, birthyear, stringsAsFactors = FALSE)
print(df)
##     name birthyear
## 1  Apple      2012
## 2   Bill      2012
## 3  Chris      2013
## 4 Daniel      2014
## 5   Eric      2014
toupper(df$name)
## [1] "APPLE"  "BILL"   "CHRIS"  "DANIEL" "ERIC"

In Python

import pandas as pd
import numpy as np
data = {'name': ['Apple', 'Bill', 'Chris', 'Daniel', 'Eric'], 'birthyear': [2012, 2012, 2013, 2014, 2014]}
df = pd.DataFrame(data)
print (df.dtypes)
print (df)
df['name'] = df['name'].apply(lambda x: x.upper())
df['birthyear'] = df['birthyear'].astype('str')
print (df.dtypes)
print (df)
## name         object
## birthyear     int64
## dtype: object
##      name  birthyear
## 0   Apple       2012
## 1    Bill       2012
## 2   Chris       2013
## 3  Daniel       2014
## 4    Eric       2014
## name         object
## birthyear    object
## dtype: object
##      name birthyear
## 0   APPLE      2012
## 1    BILL      2012
## 2   CHRIS      2013
## 3  DANIEL      2014
## 4    ERIC      2014

Dates and Timestamps

Format

Format Definition Example
%d day of the month 01-31
%j day of the year 001-366
%a abbreviated weekday Mon
%A unabbreviated weekday Monday
%u weekday as a number 1 (Mon) - 7 (Sun)
%W week as a number 00-53
%m month as a number 00-12
%b abbreviated month Jan
%B unabbreviated month January
%y year 2-digit 17
%Y year 4-digit 2017
%H hour 24 0-24
%I hour 12 0-12
%p AM/PM indicator AM/PM
%M minute 0-60
%S second 0-60

Current date

In R

today <- Sys.Date()
print (format(today, format="%B %d %Y"))
## [1] "May 01 2017"
str(today)
##  Date[1:1], format: "2017-05-01"
current <- Sys.time()                 
print (format(current, "%Y-%m-%d %I:%M:%S%p"))
## [1] "2017-05-01 12:53:12AM"
str(current)
##  POSIXct[1:1], format: "2017-05-01 00:53:12"
current_str <- date()
print (current_str)
## [1] "Mon May 01 00:53:12 2017"
str(current_str)
##  chr "Mon May 01 00:53:12 2017"

In Python

import time
print "Current date %s" % time.strftime('%b %d, %Y')
print "Current date & time " + time.strftime("%c")

import datetime
i = datetime.datetime.now()
print i
print "Current date = %s/%s/%s" % (i.month, i.day, i.year)
## Current date May 01, 2017
## Current date & time 05/01/17 00:53:12
## 2017-05-01 00:53:12.593000
## Current date = 5/1/2017

String to datetime

In R

time <- strptime(date(), format="%a %b %d %H:%M:%S %Y")
str(time)
##  POSIXlt[1:1], format: "2017-05-01 00:53:12"

In Python

import datetime

dt = datetime.datetime.strptime("2016-09-28 20:30:55.78200", '%Y-%m-%d %H:%M:%S.%f')
print dt
print type(dt)
## 2016-09-28 20:30:55.782000
## <type 'datetime.datetime'>

Date difference

In R

dt1 <- as.Date('2017-03-15')
dt2 <- as.Date('2017-05-16')

print (difftime(dt2, dt1, units="days"))
## Time difference of 62 days
print (difftime(dt2, dt1, units="weeks"))
## Time difference of 8.857143 weeks

In Python

from datetime import datetime
 
dt1 = datetime(2017, 3, 15)
dt2 = datetime(2017, 5, 16)
print dt2 - dt1, ' days'
print (dt2 - dt1).days / 7.0, ' weeks'
## 62 days, 0:00:00  days
## 8.85714285714  weeks

Date add or subtract

In R

print (dt2 + 20)
## [1] "2017-06-05"
print (dt2 - 20)
## [1] "2017-04-26"

In Python

from datetime import datetime, timedelta

before = dt2 + timedelta(days=20)
print before
after = dt2 - timedelta(days=20)
print after 
## 2017-06-05 00:00:00
## 2017-04-26 00:00:00

Time difference

In R

tm1 <- as.POSIXct("2017-05-24 23:55:26")
tm2 <- as.POSIXct("25052017 08:32:07", format = "%d%m%Y %H:%M:%S")

print (difftime(tm2, tm1, units="hours"))
## Time difference of 8.611389 hours
print (difftime(tm2, tm1, units="mins"))
## Time difference of 516.6833 mins
print (difftime(tm2, tm1, units="secs"))
## Time difference of 31001 secs

In Python

from datetime import datetime, timedelta
 
tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )

diff = tm2 - tm1
secs = diff.total_seconds()
hours = float(secs / 3600)
minutes = float(secs / 60)

print hours, " hours"
print minutes, " minutes"
print secs, " seconds"
## 8.61138888889  hours
## 516.683333333  minutes
## 31001.0  seconds

Time add or subtract

In R

print (tm1 + 30)
## [1] "2017-05-24 23:55:56 PDT"
print (tm1 - 30)
## [1] "2017-05-24 23:54:56 PDT"

In Python

from datetime import datetime, timedelta

tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )

before = tm1 + timedelta(seconds=30)
print before
after = tm1 - timedelta(seconds=30)
print after
## 2017-05-24 23:55:56
## 2017-05-24 23:54:56

Data Frame

In Python

import numpy as np
import pandas as pd
Aug = ['2018-08-01', '2018-08-02', '2018-08-03', '2018-08-04']
Sept = ['2018-09-01 10:00:000', '2018-09-02 11:00:000', '2018-09-03 12:00:000', '2018-09-04 13:00:000']
df = pd.DataFrame(np.column_stack([Aug, Sept]), columns=['Aug', 'Sept'])
print(df.dtypes)
df['Aug'] = pd.to_datetime(df['Aug'])
df['Sept'] = pd.to_datetime(df['Sept'])
print(df.dtypes)
print(df)
## Aug     object
## Sept    object
## dtype: object
## Aug     datetime64[ns]
## Sept    datetime64[ns]
## dtype: object
##          Aug                Sept
## 0 2018-08-01 2018-09-01 10:00:00
## 1 2018-08-02 2018-09-02 11:00:00
## 2 2018-08-03 2018-09-03 12:00:00
## 3 2018-08-04 2018-09-04 13:00:00