In R
x <- "1234567890"
nchar(x)
## [1] 10
In Python & PySpark
x = "1234567890"
len(x)
In R
print (substr(x, 3, 3))
## [1] "3"
print (substring(x, 5, 7))
## [1] "567"
In Python & PySpark
print x[2:3]
print x[4:7]
## 3
## 567
In R
sub('123', '567', x) # Changes only the 1st pattern match per string
## [1] "5674567890"
x = gsub('567', 123, x) # Changes every occurrence of a pattern match
print (x)
## [1] "1234123890"
In Python & PySpark
print x.replace('123', '567') # Changes every occurrence of a pattern match
x = x.replace('567', '123')
print x
## 5674567890
## 1234123890
In R
substr(x, 4, 4) <- "A"
print (x)
## [1] "123A123890"
substring(x, 9, 12) <- "ABCD"
print (x)
## [1] "123A1238AB"
In Python & PySpark - string cannot be mutated
In R
print (unlist(regexpr('123', x)[1])) # Returns position of 1st match in a string
## [1] 1
print (unlist(gregexpr('123', x))) # Returns positions of every match in a string
## [1] 1 5
In Python & PySpark
print x
print x.find('123') # Returns position of 1st match in a string
print x.find('89', 0, 7)
import re
print [(a.start(), a.end()) for a in list(re.finditer('123', x))] # Returns positions of every match in a string
## 1234567890
## 0
## -1
## [(0, 3)]
In R
print (paste("123", "456", sep=""))
## [1] "123456"
print (paste("123", "456", sep=","))
## [1] "123,456"
In Python & PySpark
print "123" + "456"
print "123" + "," + "456"
## 123456
## 123,456
In R
unlist(strsplit(x, "A"))
## [1] "123" "1238" "B"
In Python & PySpark
x = "123A1238AB"
print x.split("A") # split all occurrences
print x.split("A", 1) # limit to specific number of splits
## ['123', '1238', 'B']
## ['123', '1238AB']
In R
y = 'aBCDE'
toupper(substring(y, 1, 1))
## [1] "A"
tolower(substring(y, 2, nchar(y)))
## [1] "bcde"
In Python & PySpark
y = 'abCDE'
print y.upper()
print y.lower()
## ABCDE
## abcde
In R
name <- c('Apple', 'Bill', 'Chris', 'Daniel', 'Eric')
birthyear <- c(2012, 2012, 2013, 2014, 2014)
df <- data.frame(name, birthyear, stringsAsFactors = FALSE)
print(df)
## name birthyear
## 1 Apple 2012
## 2 Bill 2012
## 3 Chris 2013
## 4 Daniel 2014
## 5 Eric 2014
toupper(df$name)
## [1] "APPLE" "BILL" "CHRIS" "DANIEL" "ERIC"
In Python
import pandas as pd
import numpy as np
data = {'name': ['Apple', 'Bill', 'Chris', 'Daniel', 'Eric'], 'birthyear': [2012, 2012, 2013, 2014, 2014]}
df = pd.DataFrame(data)
print (df.dtypes)
print (df)
df['name'] = df['name'].apply(lambda x: x.upper())
df['birthyear'] = df['birthyear'].astype('str')
print (df.dtypes)
print (df)
## name object
## birthyear int64
## dtype: object
## name birthyear
## 0 Apple 2012
## 1 Bill 2012
## 2 Chris 2013
## 3 Daniel 2014
## 4 Eric 2014
## name object
## birthyear object
## dtype: object
## name birthyear
## 0 APPLE 2012
## 1 BILL 2012
## 2 CHRIS 2013
## 3 DANIEL 2014
## 4 ERIC 2014
Format | Definition | Example |
---|---|---|
%d | day of the month | 01-31 |
%j | day of the year | 001-366 |
%a | abbreviated weekday | Mon |
%A | unabbreviated weekday | Monday |
%u | weekday as a number | 1 (Mon) - 7 (Sun) |
%W | week as a number | 00-53 |
%m | month as a number | 00-12 |
%b | abbreviated month | Jan |
%B | unabbreviated month | January |
%y | year 2-digit | 17 |
%Y | year 4-digit | 2017 |
%H | hour 24 | 0-24 |
%I | hour 12 | 0-12 |
%p | AM/PM indicator | AM/PM |
%M | minute | 0-60 |
%S | second | 0-60 |
In R
today <- Sys.Date()
print (format(today, format="%B %d %Y"))
## [1] "May 01 2017"
str(today)
## Date[1:1], format: "2017-05-01"
current <- Sys.time()
print (format(current, "%Y-%m-%d %I:%M:%S%p"))
## [1] "2017-05-01 12:53:12AM"
str(current)
## POSIXct[1:1], format: "2017-05-01 00:53:12"
current_str <- date()
print (current_str)
## [1] "Mon May 01 00:53:12 2017"
str(current_str)
## chr "Mon May 01 00:53:12 2017"
In Python
import time
print "Current date %s" % time.strftime('%b %d, %Y')
print "Current date & time " + time.strftime("%c")
import datetime
i = datetime.datetime.now()
print i
print "Current date = %s/%s/%s" % (i.month, i.day, i.year)
## Current date May 01, 2017
## Current date & time 05/01/17 00:53:12
## 2017-05-01 00:53:12.593000
## Current date = 5/1/2017
In R
time <- strptime(date(), format="%a %b %d %H:%M:%S %Y")
str(time)
## POSIXlt[1:1], format: "2017-05-01 00:53:12"
In Python
import datetime
dt = datetime.datetime.strptime("2016-09-28 20:30:55.78200", '%Y-%m-%d %H:%M:%S.%f')
print dt
print type(dt)
## 2016-09-28 20:30:55.782000
## <type 'datetime.datetime'>
In R
dt1 <- as.Date('2017-03-15')
dt2 <- as.Date('2017-05-16')
print (difftime(dt2, dt1, units="days"))
## Time difference of 62 days
print (difftime(dt2, dt1, units="weeks"))
## Time difference of 8.857143 weeks
In Python
from datetime import datetime
dt1 = datetime(2017, 3, 15)
dt2 = datetime(2017, 5, 16)
print dt2 - dt1, ' days'
print (dt2 - dt1).days / 7.0, ' weeks'
## 62 days, 0:00:00 days
## 8.85714285714 weeks
In R
print (dt2 + 20)
## [1] "2017-06-05"
print (dt2 - 20)
## [1] "2017-04-26"
In Python
from datetime import datetime, timedelta
before = dt2 + timedelta(days=20)
print before
after = dt2 - timedelta(days=20)
print after
## 2017-06-05 00:00:00
## 2017-04-26 00:00:00
In R
tm1 <- as.POSIXct("2017-05-24 23:55:26")
tm2 <- as.POSIXct("25052017 08:32:07", format = "%d%m%Y %H:%M:%S")
print (difftime(tm2, tm1, units="hours"))
## Time difference of 8.611389 hours
print (difftime(tm2, tm1, units="mins"))
## Time difference of 516.6833 mins
print (difftime(tm2, tm1, units="secs"))
## Time difference of 31001 secs
In Python
from datetime import datetime, timedelta
tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )
diff = tm2 - tm1
secs = diff.total_seconds()
hours = float(secs / 3600)
minutes = float(secs / 60)
print hours, " hours"
print minutes, " minutes"
print secs, " seconds"
## 8.61138888889 hours
## 516.683333333 minutes
## 31001.0 seconds
In R
print (tm1 + 30)
## [1] "2017-05-24 23:55:56 PDT"
print (tm1 - 30)
## [1] "2017-05-24 23:54:56 PDT"
In Python
from datetime import datetime, timedelta
tm1 = datetime(2017, 5, 24, 23, 55, 26)
tm2 = datetime(2017, 5, 25, 8, 32, 7 )
before = tm1 + timedelta(seconds=30)
print before
after = tm1 - timedelta(seconds=30)
print after
## 2017-05-24 23:55:56
## 2017-05-24 23:54:56
In Python
import numpy as np
import pandas as pd
Aug = ['2018-08-01', '2018-08-02', '2018-08-03', '2018-08-04']
Sept = ['2018-09-01 10:00:000', '2018-09-02 11:00:000', '2018-09-03 12:00:000', '2018-09-04 13:00:000']
df = pd.DataFrame(np.column_stack([Aug, Sept]), columns=['Aug', 'Sept'])
print(df.dtypes)
df['Aug'] = pd.to_datetime(df['Aug'])
df['Sept'] = pd.to_datetime(df['Sept'])
print(df.dtypes)
print(df)
## Aug object
## Sept object
## dtype: object
## Aug datetime64[ns]
## Sept datetime64[ns]
## dtype: object
## Aug Sept
## 0 2018-08-01 2018-09-01 10:00:00
## 1 2018-08-02 2018-09-02 11:00:00
## 2 2018-08-03 2018-09-03 12:00:00
## 3 2018-08-04 2018-09-04 13:00:00