{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1 align=\"center\">Re module</h1>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Syntax of regular expressions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DOTALL == 16\n",
"from re import search, DOTALL\n",
"\n",
"# Patterns containing a backslash are written as raw strings so that the\n",
"# backslash reaches the regex engine unchanged.\n",
"\n",
"# '.' matches each of these characters but not a newline, unless DOTALL is given\n",
"print(search('.', 'c') is not None and\n",
"      search('.', 'D') is not None and\n",
"      search('.', '_') is not None and\n",
"      search('.', '5') is not None and\n",
"      search('.', ' ') is not None and\n",
"      search('.', '@') is not None and\n",
"      search('.', '\\n') is None and\n",
"      search('.', '\\n', DOTALL) is not None)\n",
"print()\n",
"\n",
"# w for \"word\"\n",
"print(search(r'\\w', 'c') is not None and\n",
"      search(r'\\w', 'D') is not None and\n",
"      search(r'\\w', '_') is not None and\n",
"      search(r'\\w', '5') is not None and\n",
"      search(r'\\w', ' ') is None and\n",
"      search(r'\\w', '@') is None and\n",
"      search(r'\\w', '\\n') is None)\n",
"\n",
"# W is the complement of w\n",
"print(search(r'\\W', 'c') is None and\n",
"      search(r'\\W', 'D') is None and\n",
"      search(r'\\W', '_') is None and\n",
"      search(r'\\W', '5') is None and\n",
"      search(r'\\W', ' ') is not None and\n",
"      search(r'\\W', '@') is not None and\n",
"      search(r'\\W', '\\n') is not None)\n",
"print()\n",
"\n",
"# d for \"digit\"\n",
"print(search(r'\\d', 'c') is None and\n",
"      search(r'\\d', 'D') is None and\n",
"      search(r'\\d', '_') is None and\n",
"      search(r'\\d', '5') is not None and\n",
"      search(r'\\d', ' ') is None and\n",
"      search(r'\\d', '@') is None and\n",
"      search(r'\\d', '\\n') is None)\n",
"\n",
"# D is the complement of d\n",
"print(search(r'\\D', 'c') is not None and\n",
"      search(r'\\D', 'D') is not None and\n",
"      search(r'\\D', '_') is not None and\n",
"      search(r'\\D', '5') is None and\n",
"      search(r'\\D', ' ') is not None and\n",
"      search(r'\\D', '@') is not None and\n",
"      search(r'\\D', '\\n') is not None)\n",
"print()\n",
"\n",
"# s for \"space\"\n",
"print(search(r'\\s', 'c') is None and\n",
"      search(r'\\s', 'D') is None and\n",
"      search(r'\\s', '_') is None and\n",
"      search(r'\\s', '5') is None and\n",
"      search(r'\\s', ' ') is not None and\n",
"      search(r'\\s', '@') is None and\n",
"      search(r'\\s', '\\n') is not None)\n",
"\n",
"# S is the complement of s\n",
"print(search(r'\\S', 'c') is not None and\n",
"      search(r'\\S', 'D') is not None and\n",
"      search(r'\\S', '_') is not None and\n",
"      search(r'\\S', '5') is not None and\n",
"      search(r'\\S', ' ') is None and\n",
"      search(r'\\S', '@') is not None and\n",
"      search(r'\\S', '\\n') is None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Character classes\n",
"# Inside [...], ']' is escaped to be taken literally; '^' is literal when it is\n",
"# escaped or is not the first character of the class.\n",
"print(search(r'[2-5d-h[\\]^]', '3') is not None and\n",
"      search(r'[2-5d-h[\\]^]', 'e') is not None and\n",
"      search(r'[\\^2-5d-h[\\]]', '[') is not None and\n",
"      search(r'[\\^2-5d-h[\\]]', ']') is not None and\n",
"      search(r'[\\^2-5d-h[\\]]', '^') is not None)\n",
"print(search(r'[2-5d-h^]', '6') is None and search(r'[\\^2-5d-h]', 'c') is None)\n",
"print()\n",
"\n",
"# A leading '^' negates the class; the second '^' is then an ordinary member\n",
"print(search(r'[^^2-5d-h[\\]]', '3') is None and\n",
"      search(r'[^^2-5d-h[\\]]', 'e') is None and\n",
"      search(r'[^^2-5d-h[\\]]', '[') is None and\n",
"      search(r'[^^2-5d-h[\\]]', ']') is None and\n",
"      search(r'[^^2-5d-h[\\]]', '^') is None)\n",
"print(search(r'[^^2-5d-h[\\]]', '6') is not None and\n",
"      search(r'[^^2-5d-h[\\]]', 'c') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search, escape\n",
"\n",
"# escape() backslash-escapes the regex metacharacters, so the text is matched literally\n",
"print(search(escape('[^ab](*'), '0[^ab](*A') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Alternation\n",
"# '|' separates alternatives; a literal '|' or backslash must itself be escaped.\n",
"print(search(r'123|abc|\\|@', '123') is not None and\n",
"      search(r'123|abc|\\|@', 'abc') is not None and\n",
"      # The same regex twice: backslashes doubled in a plain literal, then as a raw string\n",
"      search('123|abc|\\\\|\\\\\\\\@', '|\\\\@') is not None and\n",
"      search(r'123|abc|\\|\\\\@', r'|\\@') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Quantifiers\n",
"# ?: zero or one, *: zero or more, +: one or more\n",
"print(search('a?', '') is not None and\n",
"      search('a?', 'a') is not None)\n",
"print(search('a*', '') is not None and\n",
"      search('a*', 'a') is not None and\n",
"      search('a*', 'aa') is not None and\n",
"      search('a*', 'aaa') is not None)\n",
"print(search('a+', 'a') is not None and\n",
"      search('a+', 'aa') is not None and\n",
"      search('a+', 'aaa') is not None)\n",
"print()\n",
"\n",
"# {m}: exactly m, {m,n}: from m to n, {,n}: at most n, {m,}: at least m\n",
"print(search('a{3}', 'aaa') is not None)\n",
"# No space before or after the comma!\n",
"print(search('a{2,4}', 'aa') is not None and\n",
"      search('a{2,4}', 'aaa') is not None and\n",
"      search('a{2,4}', 'aaaa') is not None)\n",
"print(search('a{,3}', '') is not None and\n",
"      search('a{,3}', 'a') is not None and\n",
"      search('a{,3}', 'aa') is not None and\n",
"      search('a{,3}', 'aaa') is not None)\n",
"print(search('a{3,}', 'aaa') is not None and\n",
"      search('a{3,}', 'aaaa') is not None and\n",
"      search('a{3,}', 'aaaaa') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Each greedy quantifier (the default) is paired with its reluctant form,\n",
"# obtained by appending '?'. The resulting tuple is the cell's displayed value.\n",
"(\n",
"    search('a?', 'aaaa'), search('a??', 'aaaa'),\n",
"    search('a*', 'aaaaa'), search('a*?', 'aaaaa'),\n",
"    search('a+', 'aaaaa'), search('a+?', 'aaaaa'),\n",
"    search('a{2,3}', 'aaaaa'), search('a{2,3}?', 'aaaaa'),\n",
"    search('a{2,}', 'aaaaa'), search('a{2,}?', 'aaaaa'),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Alternatively, replace MULTILINE by M\n",
"# MULTILINE == M == 8\n",
"from re import search, MULTILINE\n",
"\n",
"# \\b: word boundary, \\B: not a word boundary ('_' and digits are word characters)\n",
"print(search(r'\\bab', ' ab') is not None and search(r'ab\\b', 'ab!') is not None)\n",
"print(search(r'\\Bab', '_ab') is not None and search(r'ab\\B', 'ab2') is not None)\n",
"\n",
"# Without MULTILINE, '^' and '$' do not match at the inner line break of these\n",
"# subjects; with MULTILINE they do.\n",
"print(search('^ab', 'abc') is not None and\n",
"      search('^ab', 'cab') is None and\n",
"      search('^ab', 'c\\nab') is None and\n",
"      search('^ab', 'c\\nab', MULTILINE) is not None)\n",
"print(search('ab$', 'cab') is not None and\n",
"      search('ab$', 'abc') is None and\n",
"      search('ab$', 'ab\\nc') is None and\n",
"      search('ab$', 'ab\\nc', MULTILINE) is not None)\n",
"print()\n",
"\n",
"# \\A and \\Z anchor at the start and at the end of the whole string\n",
"print(search(r'\\Aab', 'abc') is not None and\n",
"      search(r'\\Aab', 'cab') is None and\n",
"      search(r'\\Aab', 'c\\nab') is None)\n",
"print(search(r'ab\\Z', 'cab') is not None and\n",
"      search(r'ab\\Z', 'abc') is None and\n",
"      search(r'ab\\Z', 'ab\\nc') is None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Alternatively, replace IGNORECASE by I, ASCII by A\n",
"# IGNORECASE == I == 2\n",
"# ASCII == A == 256\n",
"from re import search, IGNORECASE, ASCII\n",
"\n",
"# IGNORECASE makes letters match regardless of case\n",
"print(search('aBcD', 'AbcD') is None and search('aBcD', 'AbcD', IGNORECASE) is not None)\n",
"# With ASCII, accented letters no longer count as word characters, which also\n",
"# moves the word boundaries seen by \\b and \\B\n",
"print(search(r'\\w', 'î') is not None and search(r'\\w', 'î', ASCII) is None)\n",
"print(search(r'\\bab', 'éab') is None and search(r'\\bab', 'éab', ASCII) is not None)\n",
"print(search(r'ab\\B', 'abü') is not None and search(r'ab\\B', 'abü', ASCII) is None)\n",
"print()\n",
"\n",
"# Flags are bit masks: combine them with |\n",
"print(search(r'\\beF', 'àEf', IGNORECASE | ASCII) is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# \\N matches the same text as the N-th capturing group\n",
"print(search(r'(\\w+) (\\w+) \\1 (\\w+) \\1 \\3', 'abc def abc ghi abc ghi') is not None)\n",
"print(search(r'(\\d)\\1(2)', '332') is not None)\n",
"# Capturing and noncapturing parentheses\n",
"# (?:...) groups are not numbered, so \\1, \\2, \\3 count only the plain (...) groups\n",
"print(search(r'(?:\\w+) (\\w+) \\1 (\\w+) (?:\\w+) (\\w+) \\3 \\2',\n",
"             'abc def def ghi jkl mno mno ghi') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import findall\n",
"\n",
"# Matches do not overlap. When the pattern has capturing groups, findall\n",
"# returns the groups' contents rather than the whole match.\n",
"for pattern in ('aba', '(ab)a', '(ab)(a)', '((ab)a)'):\n",
"    print(findall(pattern, 'abababa'))\n",
"for pattern in ('(a|b)+', '(?:a|b)+', '((a|b)+)'):\n",
"    print(findall(pattern, 'abacaba'))\n",
"print()\n",
"\n",
"# 'a*' also matches the empty string left after the run of a's\n",
"for pattern in ('a+', 'a*'):\n",
"    print(findall(pattern, 'aaa'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import findall\n",
"\n",
"# Lookahead and lookbehind assertions test the context without consuming it\n",
"for pattern in (\n",
"        'a',        # every a\n",
"        'a(?=b)',   # a followed by b\n",
"        'a(?!b)',   # a not followed by b\n",
"        '(?<=b)a',  # a preceded by b\n",
"        '(?<!b)a',  # a not preceded by b\n",
"):\n",
"    print(findall(pattern, 'aaabaab'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# If group (here, 2) matches then match this, (optionally) else match that\n",
"# The group is referred to by number ...\n",
"print(search(r'_(abc)(\\d{3})?(?(2)aaa)', '_abc345aaa') is not None and\n",
"      search(r'_(abc)(\\d{3})?(?(2)aaa|BBB)', '_abc345aaa') is not None and\n",
"      search(r'_(abc)(\\d{3})?(?(2)aaa|BBB)', '_abcBBB') is not None)\n",
"# ... or by name\n",
"print(search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa)', '_abc345aaa') is not None and\n",
"      search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa|BBB)', '_abc345aaa') is not None and\n",
"      search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa|BBB)', '_abcBBB') is not None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Alternatively, replace VERBOSE by X\n",
"# VERBOSE == X == 64\n",
"# DEBUG == 128\n",
"from re import search, VERBOSE, DEBUG\n",
"\n",
"# With VERBOSE, whitespace and '#' comments inside the pattern are ignored.\n",
"# NOTE: this also drops the space inside ( |-), so that group matches '' or '-'.\n",
"print(search(r'''\n",
" \\d{3} # Comments\n",
" ( |-) # and spaces\n",
" [a-z] # are ignored\n",
" ''', '845-f', VERBOSE) is not None)\n",
"print()\n",
"\n",
"# DEBUG displays debug information about the compiled pattern\n",
"print(search(r'\\d{3}( |-)[a-z]', '845-f', DEBUG))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Functions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Every call `f(pattern, ...)` below can be rewritten as `compile(pattern).f(...)`.\n",
"\n",
"The converse does not hold: the calls `compile(pattern).f(...)` below pass `pos`/`endpos` arguments, which the module-level functions `f(pattern, ...)` do not accept."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Functions that return a Match object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import compile, search\n",
"\n",
"# The module-level function scans the whole string\n",
"print(search('cd', 'abcde'))\n",
"\n",
"# The compiled pattern's method also accepts pos and, optionally, endpos\n",
"cd_pattern = compile('cd')\n",
"for bounds in ((2,), (3,), (2, 5), (2, 3)):\n",
"    print(cd_pattern.search('abcde', *bounds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import compile, match\n",
"\n",
"# match only succeeds at the position where the scan starts\n",
"for text in ('cde', 'abcde'):\n",
"    print(match('cd', text))\n",
"\n",
"# With a compiled pattern that position can be moved with pos (and bounded by endpos)\n",
"cd_pattern = compile('cd')\n",
"for bounds in ((2,), (3,), (2, 5), (2, 3)):\n",
"    print(cd_pattern.match('abcde', *bounds))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import fullmatch\n",
"\n",
"# fullmatch succeeds only when the pattern matches the entire string\n",
"for pattern in ('cd', 'cde'):\n",
"    print(fullmatch(pattern, 'cde'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import compile, findall\n",
"\n",
"# Whole string, then from index 2, then from index 2 up to (not including) index 5\n",
"any_char = compile('.')\n",
"print(findall('.', 'abcdef'),\n",
"      any_char.findall('abcdef', 2),\n",
"      any_char.findall('abcdef', 2, 5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import compile, finditer\n",
"\n",
"# finditer yields one Match object per match instead of building a list.\n",
"# The loop variable is not called `match`, so that it does not rebind the\n",
"# `match` function imported from re in an earlier cell.\n",
"for found in finditer('.', 'abcdef'):\n",
"    print(found)\n",
"print()\n",
"\n",
"# From index 2 onwards\n",
"for found in compile('.').finditer('abcdef', 2):\n",
"    print(found)\n",
"print()\n",
"\n",
"# From index 2 up to (not including) index 5\n",
"for found in compile('.').finditer('abcdef', 2, 5):\n",
"    print(found)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Functions applied to a Match object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Search once and query the same Match object repeatedly.\n",
"# Group 0 is the whole match; groups are numbered by their opening parenthesis.\n",
"numbered = search(r'\\w+ (\\w+) (\\w+ (\\w+))', 'ab cd ef gh ij')\n",
"print(numbered.group())\n",
"print(numbered.group(0))\n",
"print(numbered.group(1))\n",
"print(numbered.group(2))\n",
"print(numbered.group(3))\n",
"# Several arguments give a tuple, in the order requested\n",
"print(numbered.group(1, 3))\n",
"print(numbered.group(0, 2, 1))\n",
"# groups() returns all groups from 1 upwards\n",
"print(numbered.groups())\n",
"print()\n",
"\n",
"# Named groups can be retrieved by name; groupdict() maps names to contents\n",
"named = search(r'\\w+ (?P<word_1>\\w+) (?P<word_2>\\w+ (?P<word_3>\\w+))',\n",
"               'ab cd ef gh ij')\n",
"print(named.group('word_2'))\n",
"print(named.group('word_1', 'word_3'))\n",
"print(named.groupdict())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import search\n",
"\n",
"# Search once and query the same Match object repeatedly.\n",
"# Without an argument, start/end/span refer to group 0, the whole match.\n",
"found = search(r'\\w+ (\\w+) (\\w+ (\\w+))', 'ab cd ef gh ij')\n",
"\n",
"# Index at which the group starts\n",
"print(found.start())\n",
"print(found.start(0))\n",
"print(found.start(1))\n",
"print(found.start(2))\n",
"print()\n",
"\n",
"# Index just past the end of the group\n",
"print(found.end())\n",
"print(found.end(0))\n",
"print(found.end(1))\n",
"print(found.end(2))\n",
"print()\n",
"\n",
"# (start, end) pair\n",
"print(found.span())\n",
"print(found.span(0))\n",
"print(found.span(1))\n",
"print(found.span(2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Functions that do not return a Match object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import split\n",
"\n",
"# With a capturing group in the pattern, the separators are kept in the result\n",
"print(split(':!', 'ab:!cd:ef!gh:!ij'))\n",
"print(split('(:!)', 'ab:!cd:ef!gh:!ij'))\n",
"print()\n",
"\n",
"# maxsplit limits the number of splits; 0 (the default) means no limit.\n",
"# It is passed by keyword: passing it positionally is deprecated since Python 3.13.\n",
"print(split(' ', 'ab cd ef gh ij'))\n",
"print(split(' ', 'ab cd ef gh ij', maxsplit=0))\n",
"print(split(' ', 'ab cd ef gh ij', maxsplit=6))\n",
"print(split(' ', 'ab cd ef gh ij', maxsplit=3))\n",
"print(split(' ', 'ab cd ef gh ij', maxsplit=2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import sub, subn\n",
"\n",
"# count limits the number of replacements; 0 (the default) means no limit.\n",
"# It is passed by keyword: passing it positionally is deprecated since Python 3.13.\n",
"print(sub('aba', '*', 'ababababababa'))\n",
"print(sub('aba', '*', 'ababababababa', count=0))\n",
"print(sub('aba', '*', 'ababababababa', count=4))\n",
"print(sub('aba', '*', 'ababababababa', count=2))\n",
"print(sub('aba', '*', 'ababababababa', count=1))\n",
"print()\n",
"\n",
"# subn returns the pair (new string, number of replacements made)\n",
"print(subn('aba', '*', 'ababababababa'))\n",
"print(subn('aba', '*', 'ababababababa', count=0))\n",
"print(subn('aba', '*', 'ababababababa', count=4))\n",
"print(subn('aba', '*', 'ababababababa', count=2))\n",
"print(subn('aba', '*', 'ababababababa', count=1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from re import sub, subn\n",
"\n",
"# In the replacement, \\N and \\g<N> insert group N and \\g<name> inserts a named group.\n",
"# \\g<1>2 is group 1 followed by a literal 2; \\12 would be read as group 12.\n",
"print(sub(r'(?:\\w+) (\\w+) \\1 (\\w+) (?:\\w+) (\\w+) \\3 \\2', r'*\\2--\\g<1>2!\\3',\n",
"          'abc def def ghi jkl mno mno ghi ABC DEF DEF GHI JKL MNO MNO GHI'))\n",
"print(sub(r'\\w+ (?P<word_1>\\w+) (?P<word_2>\\w+ (?P<word_3>\\w+))', r'\\g<word_2>--\\g<word_1>2\\3',\n",
"          'ab cd ef gh ij AB CD EF GH IJ'))\n",
"print()\n",
"\n",
"# Same substitutions with subn, which also reports how many replacements were made\n",
"print(subn(r'(?:\\w+) (\\w+) \\1 (\\w+) (?:\\w+) (\\w+) \\3 \\2', r'*\\2--\\g<1>2!\\3',\n",
"           'abc def def ghi jkl mno mno ghi ABC DEF DEF GHI JKL MNO MNO GHI'))\n",
"print(subn(r'\\w+ (?P<word_1>\\w+) (?P<word_2>\\w+ (?P<word_3>\\w+))', r'\\g<word_2>--\\g<word_1>2\\3',\n",
"           'ab cd ef gh ij AB CD EF GH IJ'))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Resource created 10 years ago.
file: re_module.ipynb