045-002. regular expression pattern for range, simplified notation(\d), special characters(\), whitespace(\s), compile()
@
# Decide whether a string is made up of digits
# Put a digit range inside square brackets and append * or + after the closing ]
# The digit range is written "0-9"
# * matches the preceding item zero or more times
# + matches the preceding item one or more times
# '1234' starts with zero or more digits, so pattern '[0-9]*' matches '1234'
re.match('[0-9]*', '1234')
# <_sre.SRE_Match object; span=(0, 4), match='1234'>
# '1234' starts with one or more digits, so pattern '[0-9]+' matches '1234'
re.match('[0-9]+', '1234')
# <_sre.SRE_Match object; span=(0, 4), match='1234'>
# 'abcd' does not start with at least one digit, so pattern '[0-9]+' does not match 'abcd' (re.match() returns None)
re.match('[0-9]+', 'abcd')
@
# The difference between * and +
# 'b' has zero or more 'a' in front of 'b', so pattern 'a*b' matches 'b'
re.match('a*b', 'b')
# <_sre.SRE_Match object; span=(0, 1), match='b'>
# 'b' has no 'a' in front of 'b' and + requires at least one, so pattern 'a+b' does not match 'b' (re.match() returns None)
re.match('a+b', 'b')
# 'aab' has zero or more 'a' in front of 'b', so pattern 'a*b' matches 'aab'
re.match('a*b', 'aab')
# <_sre.SRE_Match object; span=(0, 3), match='aab'>
# 'aab' has one or more 'a' in front of 'b', so pattern 'a+b' matches 'aab'
re.match('a+b', 'aab')
# <_sre.SRE_Match object; span=(0, 3), match='aab'>
# The 'b' in patterns "a*b" and "a+b" is a literal character that must be present in the target string
# a* allows zero occurrences of 'a', so 'a*b' can match 'b'
# a+ requires at least one 'a', so 'a+b' cannot match 'b'
# Both 'a*b' and 'a+b' match 'ab', 'aab', 'aaab'
@
# * and + quantify a repeated run of characters
# ? makes the preceding item optional (0 or 1 occurrences); . matches exactly one character (any character except a newline by default)
# 'H?' means zero or one 'H', so pattern 'H?' matches 'H'
re.match('H?', 'H')
# <_sre.SRE_Match object; span=(0, 1), match='H'>
# 'H?' matches only the leading 'H' of 'Hi'; the trailing 'i' is not part of the match
re.match('H?', 'Hi')
# <_sre.SRE_Match object; span=(0, 1), match='H'>
# 'H.' means 'H' followed by exactly one character, so pattern 'H.' matches 'Hi'
re.match('H.', 'Hi')
# <_sre.SRE_Match object; span=(0, 2), match='Hi'>
@
# To require an exact number of repetitions, write [range]{count}
# Pattern '[0-9]{3}-[0-9]{4}-[0-9]{4}' matches '010-1000-1000'
re.match('[0-9]{3}-[0-9]{4}-[0-9]{4}', '010-1000-1000')
# <_sre.SRE_Match object; span=(0, 13), match='010-1000-1000'>
# Pattern '[0-9]{3}-[0-9]{4}-[0-9]{4}' does not match '010-1000-100' (re.match() returns None)
# The last group [0-9]{4} needs four digits, but '100' has only three
re.match('[0-9]{3}-[0-9]{4}-[0-9]{4}', '010-1000-100')
@
# The {count} syntax from '[0-9]{3}-[0-9]{4}-[0-9]{4}' also accepts a range of repetition counts: {min,max}
# [0-9]{2,3}-[0-9]{3,4}-[0-9]{4}
# In {2,3}, 2 is the minimum number of repetitions
# and 3 is the maximum number of repetitions
# 2 to 3 digits, then 3 to 4 digits, then exactly 4 digits: the pattern matches '02-100-1000'
re.match('[0-9]{2,3}-[0-9]{3,4}-[0-9]{4}', '02-100-1000')
# <_sre.SRE_Match object; span=(0, 11), match='02-100-1000'>
# The middle group needs 3 to 4 digits but '10' has only two, so the pattern does not match '02-10-1000' (re.match() returns None)
re.match('[0-9]{2,3}-[0-9]{3,4}-[0-9]{4}', '02-10-1000')
@
# Next, ranges of letters
# Letter ranges are written a-z (lowercase) and A-Z (uppercase)
@
# 'Hello1234' consists of one or more (+) characters from a-z, A-Z, 0-9, so pattern '[a-zA-Z0-9]+' matches 'Hello1234'
re.match('[a-zA-Z0-9]+', 'Hello1234')
# <_sre.SRE_Match object; span=(0, 9), match='Hello1234'>
# 'hello' does not start with a character from A-Z or 0-9, so pattern '[A-Z0-9]+' does not match 'hello' (re.match() returns None)
re.match('[A-Z0-9]+', 'hello')
@
# A character range for Korean (Hangul syllables)
# '홍길동' consists of one or more (+) characters from 가-힣, so pattern '[가-힣]+' matches '홍길동'
re.match('[가-힣]+', '홍길동')
# <_sre.SRE_Match object; span=(0, 3), match='홍길동'>
@
# As the first character inside square brackets, ^ means "not" / "except"
# '[^A-Z]+' needs one or more non-uppercase characters at the start; 'Hello' starts with uppercase 'H', so the pattern does not match 'Hello' (re.match() returns None)
re.match('[^A-Z]+', 'Hello')
# 'hello' consists of one or more (+) characters outside A-Z, so pattern '[^A-Z]+' matches 'hello'
re.match('[^A-Z]+', 'hello')
# <_sre.SRE_Match object; span=(0, 5), match='hello'>
@
# Don't confuse '^[A-Z]+' (^ outside the brackets anchors the match at the start of the string) with '[^A-Z]+' (^ inside the brackets negates the range)
# 'Hello' starts with one uppercase letter, so pattern '^[A-Z]+' matches the leading 'H' of 'Hello'
re.search('^[A-Z]+', 'Hello')
# <_sre.SRE_Match object; span=(0, 1), match='H'>
@
# $ anchors the match at the end of the string
# 'Hello1234' ends with one or more (+) digits, so pattern '[0-9]+$' finds '1234' in 'Hello1234'
re.search('[0-9]+$', 'Hello1234')
# <_sre.SRE_Match object; span=(5, 9), match='1234'>
@
# To match a special character literally (*, +, ?, ., ^, $, (, ), [, ], -, etc.), put \ in front of it
# Inside [ ] most special characters (*, +, ?, ., $, (, ), etc.) are literal and need no \
# However, inside [ ] you still need \ (or careful positioning) for ], \, a leading ^ and a - between two characters
# Write patterns that contain \ as raw strings (r'...') so that Python does not treat the backslash as a string escape
# '1 ** 2' contains one or more (+) '*' characters, so pattern '\*+' finds '**' in '1 ** 2'
re.search(r'\*+', '1 ** 2')
# <_sre.SRE_Match object; span=(2, 4), match='**'>
# '$(document)' consists of one or more (+) characters from $, (, ), a-z, A-Z, 0-9, so pattern '[$()a-zA-Z0-9]+' matches '$(document)'
re.match('[$()a-zA-Z0-9]+', '$(document)')
# <_sre.SRE_Match object; span=(0, 11), match='$(document)'>
@
# To match digits only or word characters only, you can conveniently use the shorthand classes \d, \D, \w, \W
# \d: like [0-9]. Digits
# \D: like [^0-9]. All characters except digits
# \w: like [a-zA-Z0-9_]. Letters, digits and the underscore character
# \W: like [^a-zA-Z0-9_]. All characters except letters, digits and the underscore character
# Note: for str patterns in Python 3, \d and \w also accept non-ASCII digits and letters unless the re.ASCII flag is used
# Patterns are written as raw strings (r'...') so that Python does not treat the backslash as a string escape
# '1234' consists of one or more (+) characters from [0-9], so pattern '\d+' matches '1234'
re.match(r'\d+', '1234')
# <_sre.SRE_Match object; span=(0, 4), match='1234'>
# 'Hello' consists of one or more (+) characters from [^0-9], so pattern '\D+' matches 'Hello'
re.match(r'\D+', 'Hello')
# <_sre.SRE_Match object; span=(0, 5), match='Hello'>
# 'Hello_1234' consists of one or more (+) characters from [a-zA-Z0-9_], so pattern '\w+' matches 'Hello_1234'
re.match(r'\w+', 'Hello_1234')
# <_sre.SRE_Match object; span=(0, 10), match='Hello_1234'>
# '(:)' consists of one or more (+) characters from [^a-zA-Z0-9_], so pattern '\W+' matches '(:)'
re.match(r'\W+', '(:)')
# <_sre.SRE_Match object; span=(0, 3), match='(:)'>
@
# Whitespace
# Whitespace can be matched with a literal ' ' or with \s; \S is the opposite of \s
# \s is like [ \t\n\r\f\v]
# ' '(space), \t(tab), \n(new line), \r(carriage return), \f(form feed), \v(vertical tab)
# \S is like [^ \t\n\r\f\v]
# i.e. any character that is NOT ' ', \t, \n, \r, \f or \v
# 'Hello 1234' consists of one or more (+) characters from [a-zA-Z0-9 ], so pattern '[a-zA-Z0-9 ]+' matches 'Hello 1234'
# Here whitespace is written as a literal ' '
re.match('[a-zA-Z0-9 ]+', 'Hello 1234')
# <_sre.SRE_Match object; span=(0, 10), match='Hello 1234'>
# 'Hello 1234' consists of one or more (+) characters from [a-zA-Z0-9\s], so pattern '[a-zA-Z0-9\s]+' matches 'Hello 1234'
# Here whitespace is written as \s; the pattern is a raw string (r'...') so that Python does not treat the backslash as a string escape
re.match(r'[a-zA-Z0-9\s]+', 'Hello 1234')
# <_sre.SRE_Match object; span=(0, 10), match='Hello 1234'>
@
# Tip
# Repeating the same regular expression pattern string in separate match() and search() calls is redundant
# When the same pattern is used several times, compile it once into a regular expression object with compile() and call match() and search() on that object
# Compile the pattern into a regular expression object
p = re.compile('[0-9]+')
# Call match() on the compiled object p with the target string '1234'
p.match('1234')
# <_sre.SRE_Match object; span=(0, 4), match='1234'>
# Call search() on the compiled object p with the target string 'hello'; 'hello' contains no digit, so the result is None
p.search('hello')