Eva's Photo Album

String
Dates and Timestamps

String

number of characters

In R

x <- "1234567890"
nchar(x)

## [1] 10

In Python & PySpark

x = "1234567890"
len(x)

substring

In R

print (substr(x, 3, 3))

## [1] "3"

print (substring(x, 5, 7))

## [1] "567"

In Python & PySpark

print x[2:3]
print x[4:7]

## 3
## 567

substitute string

In R

sub('123', '567', x)      # Changes only the 1st pattern match per string

## [1] "5674567890"

x = gsub('567', 123, x)   # Changes every occurrence of a pattern match
print (x)

## [1] "1234123890"

In Python & PySpark

print x.replace('123', '567')  # Changes every occurrence of a pattern match
x = x.replace('567', '123')
print x

## 5674567890
## 1234123890

string assignment

In R

substr(x, 4, 4) <- "A"
print (x)

## [1] "123A123890"

substring(x, 9, 12) <- "ABCD"
print (x)

## [1] "123A1238AB"

In Python & PySpark - string cannot be mutated

locate string

In R

print (unlist(regexpr('123', x)[1]))    # Returns position of 1st match in a string

## [1] 1

print (unlist(gregexpr('123', x)))      # Returns positions of every match in a string

## [1] 1 5

In Python & PySpark

print x
print x.find('123')         # Returns position of 1st match in a string
print x.find('89', 0, 7)

import re
print [(a.start(), a.end()) for a in list(re.finditer('123', x))]    # Returns positions of every match in a string

## 1234567890
## 0
## -1
## [(0, 3)]

concatenate string

In R

print (paste("123", "456", sep=""))

## [1] "123456"

print (paste("123", "456", sep=","))

## [1] "123,456"

In Python & PySpark

print "123" + "456"
print "123" + "," + "456"

## 123456
## 123,456

split string

In R

unlist(strsplit(x, "A"))

## [1] "123"  "1238" "B"

In Python & PySpark

x = "123A1238AB"
print x.split("A")     # split all occurrences
print x.split("A", 1)  # limit to specific number of splits

## ['123', '1238', 'B']
## ['123', '1238AB']

upper/lower case

In R

y = 'aBCDE'
toupper(substring(y, 1, 1))

## [1] "A"

tolower(substring(y, 2, nchar(y)))

## [1] "bcde"

In Python & PySpark

y = 'abCDE'
print y.upper()
print y.lower()

## ABCDE
## abcde

Apply string function to dataframe columns

In R

name <- c('Apple', 'Bill', 'Chris', 'Daniel', 'Eric')
birthyear <- c(2012, 2012, 2013, 2014, 2014)
df <- data.frame(name, birthyear, stringsAsFactors = FALSE)
print(df)

##     name birthyear
## 1  Apple      2012
## 2   Bill      2012
## 3  Chris      2013
## 4 Daniel      2014
## 5   Eric      2014

toupper(df$name)

## [1] "APPLE"  "BILL"   "CHRIS"  "DANIEL" "ERIC"

In Python

import pandas as pd
import numpy as np
data = {'name': ['Apple', 'Bill', 'Chris', 'Daniel', 'Eric'], 'birthyear': [2012, 2012, 2013, 2014, 2014]}
df = pd.DataFrame(data)
print (df.dtypes)
print (df)
df['name'] = df['name'].apply(lambda x: x.upper())
df['birthyear'] = df['birthyear'].astype('str')
print (df.dtypes)
print (df)

## name         object
## birthyear     int64
## dtype: object

##      name  birthyear
## 0   Apple       2012
## 1    Bill       2012
## 2   Chris       2013
## 3  Daniel       2014
## 4    Eric       2014

## name         object
## birthyear    object
## dtype: object

##      name birthyear
## 0   APPLE      2012
## 1    BILL      2012
## 2   CHRIS      2013
## 3  DANIEL      2014
## 4    ERIC      2014

Dates and Timestamps

Format

Format	Definition	Example
%d	day of the month	01-31
%j	day of the year	001-366
%a	abbreviated weekday	Mon
%A	unabbreviated weekday	Monday
%u	weekday as a number	1 (Mon) - 7 (Sun)
%W	week as a number	00-53
%m	month as a number	00-12
%b	abbreviated month	Jan
%B	unabbreviated month	January
%y	year 2-digit	17
%Y	year 4-digit	2017
%H	hour 24	0-24
%I	hour 12	0-12
%p	AM/PM indicator	AM/PM
%M	minute	0-60
%S	second	0-60

Current date

In R

today <- Sys.Date()
print (format(today, format="%B %d %Y"))

## [1] "May 01 2017"

str(today)

##  Date[1:1], format: "2017-05-01"

current <- Sys.time()                 
print (format(current, "%Y-%m-%d %I:%M:%S%p"))

## [1] "2017-05-01 12:53:12AM"

str(current)

##  POSIXct[1:1], format: "2017-05-01 00:53:12"

current_str <- date()
print (current_str)

## [1] "Mon May 01 00:53:12 2017"

str(current_str)

##  chr "Mon May 01 00:53:12 2017"

In Python

import time
print "Current date %s" % time.strftime('%b %d, %Y')
print "Current date & time " + time.strftime("%c")

import datetime
i = datetime.datetime.now()
print i
print "Current date = %s/%s/%s" % (i.month, i.day, i.year)

## Current date May 01, 2017
## Current date & time 05/01/17 00:53:12
## 2017-05-01 00:53:12.593000
## Current date = 5/1/2017

String to datetime

In R

time <- strptime(date(), format="%a %b %d %H:%M:%S %Y")
str(time)

##  POSIXlt[1:1], format: "2017-05-01 00:53:12"

In Python

import datetime

dt = datetime.datetime.strptime("2016-09-28 20:30:55.78200", '%Y-%m-%d %H:%M:%S.%f')
print dt
print type(dt)

## 2016-09-28 20:30:55.782000
## <type 'datetime.datetime'>

Date difference

In R

dt1 <- as.Date('2017-03-15')
dt2 <- as.Date('2017-05-16')

print (difftime(dt2, dt1, units="days"))

## Time difference of 62 days

print (difftime(dt2, dt1, units="weeks"))

## Time difference of 8.857143 weeks

In Python

from datetime import datetime
 
dt1 = datetime(2017, 3, 15)
dt2 = datetime(2017, 5, 16)
print dt2 - dt1, ' days'
print (dt2 - dt1).days / 7.0, ' weeks'

## 62 days, 0:00:00  days
## 8.85714285714  weeks

Date add or subtract

In R

print (dt2 + 20)

## [1] "2017-06-05"

print (dt2 - 20)

## [1] "2017-04-26"

In Python

from datetime import datetime, timedelta

before = dt2 + timedelta(days=20)
print before
after = dt2 - timedelta(days=20)
print after

## 2017-06-05 00:00:00
## 2017-04-26 00:00:00

Time difference

In R

tm1 <- as.POSIXct("2017-05-24 23:55:26")
tm2 <- as.POSIXct("25052017 08:32:07", format = "%d%m%Y %H:%M:%S")

print (difftime(tm2, tm1, units="hours"))

## Time difference of 8.611389 hours

print (difftime(tm2, tm1, units="mins"))

## Time difference of 516.6833 mins

print (difftime(tm2, tm1, units="secs"))

## Time difference of 31001 secs

In Python

from datetime import datetime, timedelta
 
tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )

diff = tm2 - tm1
secs = diff.total_seconds()
hours = float(secs / 3600)
minutes = float(secs / 60)

print hours, " hours"
print minutes, " minutes"
print secs, " seconds"

## 8.61138888889  hours
## 516.683333333  minutes
## 31001.0  seconds

Time add or subtract

In R

print (tm1 + 30)

## [1] "2017-05-24 23:55:56 PDT"

print (tm1 - 30)

## [1] "2017-05-24 23:54:56 PDT"

In Python

from datetime import datetime, timedelta

tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )

before = tm1 + timedelta(seconds=30)
print before
after = tm1 - timedelta(seconds=30)
print after

## 2017-05-24 23:55:56
## 2017-05-24 23:54:56

Data Frame

In Python

import numpy as np
import pandas as pd
Aug = ['2018-08-01', '2018-08-02', '2018-08-03', '2018-08-04']
Sept = ['2018-09-01 10:00:000', '2018-09-02 11:00:000', '2018-09-03 12:00:000', '2018-09-04 13:00:000']
df = pd.DataFrame(np.column_stack([Aug, Sept]), columns=['Aug', 'Sept'])
print(df.dtypes)
df['Aug'] = pd.to_datetime(df['Aug'])
df['Sept'] = pd.to_datetime(df['Sept'])
print(df.dtypes)
print(df)

## Aug     object
## Sept    object
## dtype: object

## Aug     datetime64[ns]
## Sept    datetime64[ns]
## dtype: object

##          Aug                Sept
## 0 2018-08-01 2018-09-01 10:00:00
## 1 2018-08-02 2018-09-02 11:00:00
## 2 2018-08-03 2018-09-03 12:00:00
## 3 2018-08-04 2018-09-04 13:00:00

String, datetime, file directory handling in R, Python, PySpark

October 9, 2018

String

number of characters

substring

substitute string

string assignment

locate string

concatenate string

split string

upper/lower case

Apply string function to dataframe columns

Dates and Timestamps

Format

Current date

String to datetime

Date difference

Date add or subtract

Time difference

Time add or subtract

Data Frame