{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1 align=\"center\">Re module</h1>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Syntax of regular expressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Every print() below displays True.\n",
    "# Patterns that contain a backslash are raw strings, so that the backslash reaches\n",
    "# the regex engine unchanged; the '\\n' subjects are deliberately not raw: they\n",
    "# denote an actual newline character.\n",
    "# DOTALL == 16\n",
    "from re import search, DOTALL\n",
    "\n",
    "# . matches any character but a newline, unless DOTALL is set\n",
    "print(search('.', 'c') is not None and\n",
    "      search('.', 'D') is not None and\n",
    "      search('.', '_') is not None and\n",
    "      search('.', '5') is not None and\n",
    "      search('.', ' ') is not None and\n",
    "      search('.', '@') is not None and\n",
    "      search('.', '\\n') is None and\n",
    "      search('.', '\\n', DOTALL) is not None)\n",
    "print()\n",
    "\n",
    "# w for \"word\"\n",
    "print(search(r'\\w', 'c') is not None and\n",
    "      search(r'\\w', 'D') is not None and\n",
    "      search(r'\\w', '_') is not None and\n",
    "      search(r'\\w', '5') is not None and\n",
    "      search(r'\\w', ' ') is None and\n",
    "      search(r'\\w', '@') is None and\n",
    "      search(r'\\w', '\\n') is None)\n",
    "\n",
    "# An uppercase letter denotes the complement of the lowercase class\n",
    "print(search(r'\\W', 'c') is None and\n",
    "      search(r'\\W', 'D') is None and\n",
    "      search(r'\\W', '_') is None and\n",
    "      search(r'\\W', '5') is None and\n",
    "      search(r'\\W', ' ') is not None and\n",
    "      search(r'\\W', '@') is not None and\n",
    "      search(r'\\W', '\\n') is not None)\n",
    "print()\n",
    "\n",
    "# d for \"digit\"\n",
    "print(search(r'\\d', 'c') is None and\n",
    "      search(r'\\d', 'D') is None and\n",
    "      search(r'\\d', '_') is None and\n",
    "      search(r'\\d', '5') is not None and\n",
    "      search(r'\\d', ' ') is None and\n",
    "      search(r'\\d', '@') is None and\n",
    "      search(r'\\d', '\\n') is None)\n",
    "\n",
    "print(search(r'\\D', 'c') is not None and\n",
    "      search(r'\\D', 'D') is not None and\n",
    "      search(r'\\D', '_') is not None and\n",
    "      search(r'\\D', '5') is None and\n",
    "      search(r'\\D', ' ') is not None and\n",
    "      search(r'\\D', '@') is not None and\n",
    "      search(r'\\D', '\\n') is not None)\n",
    "print()\n",
    "\n",
    "# s for \"space\"\n",
    "print(search(r'\\s', 'c') is None and\n",
    "      search(r'\\s', 'D') is None and\n",
    "      search(r'\\s', '_') is None and\n",
    "      search(r'\\s', '5') is None and\n",
    "      search(r'\\s', ' ') is not None and\n",
    "      search(r'\\s', '@') is None and\n",
    "      search(r'\\s', '\\n') is not None)\n",
    "\n",
    "print(search(r'\\S', 'c') is not None and\n",
    "      search(r'\\S', 'D') is not None and\n",
    "      search(r'\\S', '_') is not None and\n",
    "      search(r'\\S', '5') is not None and\n",
    "      search(r'\\S', ' ') is None and\n",
    "      search(r'\\S', '@') is not None and\n",
    "      search(r'\\S', '\\n') is None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# Character classes\n",
    "# The patterns are raw strings so that \\] and \\^ reach the regex engine unchanged.\n",
    "# Inside [...], \\] is a literal ], and ^ is literal when escaped or not in first position.\n",
    "print(search(r'[2-5d-h[\\]^]', '3') is not None and\n",
    "      search(r'[2-5d-h[\\]^]', 'e') is not None and\n",
    "      search(r'[\\^2-5d-h[\\]]', '[') is not None and\n",
    "      search(r'[\\^2-5d-h[\\]]', ']') is not None and\n",
    "      search(r'[\\^2-5d-h[\\]]', '^') is not None)\n",
    "print(search('[2-5d-h^]', '6') is None and search(r'[\\^2-5d-h]', 'c') is None)\n",
    "print()\n",
    "\n",
    "# A leading ^ negates the class; the ^ that follows it is an ordinary member\n",
    "print(search(r'[^^2-5d-h[\\]]', '3') is None and\n",
    "      search(r'[^^2-5d-h[\\]]', 'e') is None and\n",
    "      search(r'[^^2-5d-h[\\]]', '[') is None and\n",
    "      search(r'[^^2-5d-h[\\]]', ']') is None and\n",
    "      search(r'[^^2-5d-h[\\]]', '^') is None)\n",
    "print(search(r'[^^2-5d-h[\\]]', '6') is not None and\n",
    "      search(r'[^^2-5d-h[\\]]', 'c') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search, escape\n",
    "\n",
    "# escape() backslash-escapes the special characters of its argument, so that the\n",
    "# resulting pattern matches the argument literally\n",
    "print(search(escape('[^ab](*'), '0[^ab](*A') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# Alternation; in a pattern, \\| denotes a literal | and \\\\ a literal backslash\n",
    "print(search(r'123|abc|\\|@', '123') is not None and\n",
    "      search(r'123|abc|\\|@', 'abc') is not None and\n",
    "      # The same pattern and subject written twice: first as plain strings, where\n",
    "      # every backslash has to be doubled, then as raw strings\n",
    "      search('123|abc|\\\\|\\\\\\\\@', '|\\\\@') is not None and\n",
    "      search(r'123|abc|\\|\\\\@', r'|\\@') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# Quantifiers\n",
    "# ?: at most one occurrence\n",
    "print(search('a?', '') is not None and\n",
    "      search('a?', 'a') is not None)\n",
    "# *: any number of occurrences, possibly none\n",
    "print(search('a*', '') is not None and\n",
    "      search('a*', 'a') is not None and\n",
    "      search('a*', 'aa') is not None and\n",
    "      search('a*', 'aaa') is not None)\n",
    "# +: at least one occurrence\n",
    "print(search('a+', 'a') is not None and\n",
    "      search('a+', 'aa') is not None and\n",
    "      search('a+', 'aaa') is not None)\n",
    "print()\n",
    "\n",
    "# {m}: exactly m occurrences\n",
    "print(search('a{3}', 'aaa') is not None)\n",
    "# {m,n}: between m and n occurrences; a missing bound means 0 or unbounded\n",
    "# No space before or after the comma!\n",
    "print(search('a{2,4}', 'aa') is not None and\n",
    "      search('a{2,4}', 'aaa') is not None and\n",
    "      search('a{2,4}', 'aaaa') is not None)\n",
    "print(search('a{,3}', '') is not None and\n",
    "      search('a{,3}', 'a') is not None and\n",
    "      search('a{,3}', 'aa') is not None and\n",
    "      search('a{,3}', 'aaa') is not None)\n",
    "print(search('a{3,}', 'aaa') is not None and\n",
    "      search('a{3,}', 'aaaa') is not None and\n",
    "      search('a{3,}', 'aaaaa') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# Quantifiers: (by default) greedy versus reluctant\n",
    "# Each row pairs a quantifier with its reluctant, ?-suffixed counterpart;\n",
    "# the tuple of Match objects is the value displayed by the cell\n",
    "(\n",
    "    search('a?', 'aaaa'), search('a??', 'aaaa'),\n",
    "    search('a*', 'aaaaa'), search('a*?', 'aaaaa'),\n",
    "    search('a+', 'aaaaa'), search('a+?', 'aaaaa'),\n",
    "    search('a{2,3}', 'aaaaa'), search('a{2,3}?', 'aaaaa'),\n",
    "    search('a{2,}', 'aaaaa'), search('a{2,}?', 'aaaaa'),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Alternatively, replace MULTILINE by M\n",
    "# MULTILINE == M == 8\n",
    "from re import search, MULTILINE\n",
    "\n",
    "# Patterns with a backslash are raw strings; the '\\n' in the subjects is an actual newline\n",
    "# \\b: at a word boundary; \\B: not at a word boundary\n",
    "print(search(r'\\bab', ' ab') is not None and search(r'ab\\b', 'ab!') is not None)\n",
    "print(search(r'\\Bab', '_ab') is not None and search(r'ab\\B', 'ab2') is not None)\n",
    "\n",
    "# With MULTILINE, ^ also matches right after a newline and $ right before one\n",
    "print(search('^ab', 'abc') is not None and\n",
    "      search('^ab', 'cab') is None and\n",
    "      search('^ab', 'c\\nab') is None and\n",
    "      search('^ab', 'c\\nab', MULTILINE) is not None)\n",
    "print(search('ab$', 'cab') is not None and\n",
    "      search('ab$', 'abc') is None and\n",
    "      search('ab$', 'ab\\nc') is None and\n",
    "      search('ab$', 'ab\\nc', MULTILINE) is not None)\n",
    "print()\n",
    "\n",
    "# \\A: start of the string; \\Z: end of the string\n",
    "print(search(r'\\Aab', 'abc') is not None and\n",
    "      search(r'\\Aab', 'cab') is None and\n",
    "      search(r'\\Aab', 'c\\nab') is None)\n",
    "print(search(r'ab\\Z', 'cab') is not None and\n",
    "      search(r'ab\\Z', 'abc') is None and\n",
    "      search(r'ab\\Z', 'ab\\nc') is None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Alternatively, replace IGNORECASE by I, ASCII by A\n",
    "# IGNORECASE == I == 2\n",
    "# ASCII == A == 256\n",
    "from re import search, IGNORECASE, ASCII\n",
    "\n",
    "print(search('aBcD', 'AbcD') is None and search('aBcD', 'AbcD', IGNORECASE) is not None)\n",
    "# With ASCII, accented letters are no longer word characters, which also moves\n",
    "# the word boundaries seen by \\b and \\B\n",
    "print(search(r'\\w', 'î') is not None and search(r'\\w', 'î', ASCII) is None)\n",
    "print(search(r'\\bab', 'éab') is None and search(r'\\bab', 'éab', ASCII) is not None)\n",
    "print(search(r'ab\\B', 'abü') is not None and search(r'ab\\B', 'abü', ASCII) is None)\n",
    "print()\n",
    "\n",
    "# Flags are bit masks: combine them with |\n",
    "print(search(r'\\beF', 'àEf', IGNORECASE | ASCII) is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# \\N has to match the same text as the Nth parenthesised group did\n",
    "print(search(r'(\\w+) (\\w+) \\1 (\\w+) \\1 \\3', 'abc def abc ghi abc ghi') is not None)\n",
    "print(search(r'(\\d)\\1(2)', '332') is not None)\n",
    "# Capturing and noncapturing parentheses\n",
    "# (?:...) groups are not numbered, so they are skipped when counting\n",
    "print(search(r'(?:\\w+) (\\w+) \\1 (\\w+) (?:\\w+) (\\w+) \\3 \\2',\n",
    "             'abc def def ghi jkl mno mno ghi') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import findall\n",
    "\n",
    "# One line of output per pattern, to compare what findall() returns\n",
    "# depending on the groups the pattern contains\n",
    "for pattern in 'aba', '(ab)a', '(ab)(a)', '((ab)a)':\n",
    "    print(findall(pattern, 'abababa'))\n",
    "for pattern in '(a|b)+', '(?:a|b)+', '((a|b)+)':\n",
    "    print(findall(pattern, 'abacaba'))\n",
    "print()\n",
    "\n",
    "for pattern in 'a+', 'a*':\n",
    "    print(findall(pattern, 'aaa'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import findall\n",
    "\n",
    "# The same subject is searched with and without lookaround assertions\n",
    "subject = 'aaabaab'\n",
    "\n",
    "print(findall('a', subject))\n",
    "# a followed by b\n",
    "print(findall('a(?=b)', subject))\n",
    "# a not followed by b\n",
    "print(findall('a(?!b)', subject))\n",
    "# a preceded by b\n",
    "print(findall('(?<=b)a', subject))\n",
    "# a not preceded by b\n",
    "print(findall('(?<!b)a', subject))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# If group (here, 2) matches then match this, (optionally) else match that\n",
    "# The patterns are raw strings so that \\d reaches the regex engine unchanged\n",
    "print(search(r'_(abc)(\\d{3})?(?(2)aaa)', '_abc345aaa') is not None and\n",
    "      search(r'_(abc)(\\d{3})?(?(2)aaa|BBB)', '_abc345aaa') is not None and\n",
    "      search(r'_(abc)(\\d{3})?(?(2)aaa|BBB)', '_abcBBB') is not None)\n",
    "# The group tested by the condition can also be referred to by name\n",
    "print(search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa)', '_abc345aaa') is not None and\n",
    "      search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa|BBB)', '_abc345aaa') is not None and\n",
    "      search(r'_(abc)(?P<word_2>\\d{3})?(?(word_2)aaa|BBB)', '_abcBBB') is not None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Alternatively, replace VERBOSE by X\n",
    "# VERBOSE == X == 64\n",
    "# DEBUG == 128\n",
    "from re import search, VERBOSE, DEBUG\n",
    "\n",
    "# With VERBOSE, whitespace in the pattern is ignored, including a space meant\n",
    "# to be matched: the literal space is therefore written [ ], as whitespace\n",
    "# inside a character class is preserved\n",
    "print(search(r'''\n",
    "             \\d{3}    # Comments\n",
    "             ([ ]|-)  # and spaces\n",
    "             [a-z]    # are ignored\n",
    "             ''', '845-f', VERBOSE) is not None)\n",
    "print()\n",
    "\n",
    "# The same pattern written compactly; DEBUG displays how it has been parsed\n",
    "print(search(r'\\d{3}( |-)[a-z]', '845-f', DEBUG))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functions"
   ]
  },
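  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Every call of the form f(pattern, ...) below can be replaced by compile(pattern).f(...).\n",
    "\n",
    "The converse does not hold: the calls of the form compile(pattern).f(...) below pass a start position, and possibly an end position, which only the methods of a compiled pattern accept. So none of them can be replaced by f(pattern, ...)."
   ]
  },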
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions that return a Match object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import compile, search\n",
    "\n",
    "print(search('cd', 'abcde'))\n",
    "# The search() method of a compiled pattern also accepts a start position,\n",
    "# optionally followed by an end position\n",
    "cd_pattern = compile('cd')\n",
    "for bounds in (2,), (3,), (2, 5), (2, 3):\n",
    "    print(cd_pattern.search('abcde', *bounds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import compile, match\n",
    "\n",
    "# match() only succeeds where the scan starts\n",
    "for subject in 'cde', 'abcde':\n",
    "    print(match('cd', subject))\n",
    "# The match() method of a compiled pattern also accepts a start position,\n",
    "# optionally followed by an end position\n",
    "cd_pattern = compile('cd')\n",
    "for bounds in (2,), (3,), (2, 5), (2, 3):\n",
    "    print(cd_pattern.match('abcde', *bounds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import fullmatch\n",
    "\n",
    "# fullmatch() only succeeds when the pattern matches the whole string\n",
    "for pattern in 'cd', 'cde':\n",
    "    print(fullmatch(pattern, 'cde'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import compile, findall\n",
    "\n",
    "# The findall() method of a compiled pattern also accepts a start position,\n",
    "# optionally followed by an end position\n",
    "any_character = compile('.')\n",
    "everywhere = findall('.', 'abcdef')\n",
    "from_2 = any_character.findall('abcdef', 2)\n",
    "from_2_to_5 = any_character.findall('abcdef', 2, 5)\n",
    "print(everywhere, from_2, from_2_to_5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import compile, finditer\n",
    "\n",
    "# finditer() yields one Match object per match.\n",
    "# The loop variable is deliberately not named match: all cells share one namespace,\n",
    "# and that name is the one under which the match() function is imported elsewhere.\n",
    "for found in finditer('.', 'abcdef'):\n",
    "    print(found)\n",
    "print()\n",
    "\n",
    "# The finditer() method of a compiled pattern also accepts a start position,\n",
    "# optionally followed by an end position\n",
    "for found in compile('.').finditer('abcdef', 2):\n",
    "    print(found)\n",
    "print()\n",
    "\n",
    "for found in compile('.').finditer('abcdef', 2, 5):\n",
    "    print(found)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions applied to a Match object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# The search is performed once; the Match object is then queried repeatedly.\n",
    "# Groups are numbered by the position of their opening parenthesis; group 0 is the whole match.\n",
    "found = search(r'\\w+ (\\w+) (\\w+ (\\w+))', 'ab cd ef gh ij')\n",
    "print(found.group())\n",
    "print(found.group(0))\n",
    "print(found.group(1))\n",
    "print(found.group(2))\n",
    "print(found.group(3))\n",
    "# With more than one argument, group() returns a tuple\n",
    "print(found.group(1, 3))\n",
    "print(found.group(0, 2, 1))\n",
    "print(found.groups())\n",
    "print()\n",
    "\n",
    "# Same pattern, with named groups\n",
    "found = search(r'\\w+ (?P<word_1>\\w+) (?P<word_2>\\w+ (?P<word_3>\\w+))', 'ab cd ef gh ij')\n",
    "print(found.group('word_2'))\n",
    "print(found.group('word_1', 'word_3'))\n",
    "print(found.groupdict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import search\n",
    "\n",
    "# The search is performed once; the Match object is then queried repeatedly.\n",
    "# Without argument, start(), end() and span() refer to group 0, the whole match.\n",
    "found = search(r'\\w+ (\\w+) (\\w+ (\\w+))', 'ab cd ef gh ij')\n",
    "\n",
    "# Index at which the group starts\n",
    "print(found.start())\n",
    "print(found.start(0))\n",
    "print(found.start(1))\n",
    "print(found.start(2))\n",
    "print()\n",
    "\n",
    "# Index that follows the last character of the group\n",
    "print(found.end())\n",
    "print(found.end(0))\n",
    "print(found.end(1))\n",
    "print(found.end(2))\n",
    "print()\n",
    "\n",
    "# The pair (start, end)\n",
    "print(found.span())\n",
    "print(found.span(0))\n",
    "print(found.span(1))\n",
    "print(found.span(2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Functions that do not return a Match object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import split\n",
    "\n",
    "print(split(':!', 'ab:!cd:ef!gh:!ij'))\n",
    "# With a capturing group around the separator, the separators are kept in the result\n",
    "print(split('(:!)', 'ab:!cd:ef!gh:!ij'))\n",
    "print()\n",
    "\n",
    "# maxsplit bounds the number of splits; 0, the default, means no bound.\n",
    "# It is passed by keyword: passing it positionally is deprecated since Python 3.13.\n",
    "print(split(' ', 'ab cd ef gh ij'))\n",
    "print(split(' ', 'ab cd ef gh ij', maxsplit=0))\n",
    "print(split(' ', 'ab cd ef gh ij', maxsplit=6))\n",
    "print(split(' ', 'ab cd ef gh ij', maxsplit=3))\n",
    "print(split(' ', 'ab cd ef gh ij', maxsplit=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import sub, subn\n",
    "\n",
    "# count bounds the number of (nonoverlapping) replacements; 0, the default, means no bound.\n",
    "# It is passed by keyword: passing it positionally is deprecated since Python 3.13.\n",
    "print(sub('aba', '*', 'ababababababa'))\n",
    "print(sub('aba', '*', 'ababababababa', count=0))\n",
    "print(sub('aba', '*', 'ababababababa', count=4))\n",
    "print(sub('aba', '*', 'ababababababa', count=2))\n",
    "print(sub('aba', '*', 'ababababababa', count=1))\n",
    "print()\n",
    "\n",
    "# subn() returns the new string together with the number of replacements made\n",
    "print(subn('aba', '*', 'ababababababa'))\n",
    "print(subn('aba', '*', 'ababababababa', count=0))\n",
    "print(subn('aba', '*', 'ababababababa', count=4))\n",
    "print(subn('aba', '*', 'ababababababa', count=2))\n",
    "print(subn('aba', '*', 'ababababababa', count=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from re import sub, subn\n",
    "\n",
    "# Each tuple is (pattern, replacement, subject); all patterns and replacements\n",
    "# are raw strings so that their backslashes reach the re module unchanged.\n",
    "# Numbered groups: \\g<1>2 is group 1 followed by the digit 2 (\\12 would be group 12)\n",
    "numbered = (r'(?:\\w+) (\\w+) \\1 (\\w+) (?:\\w+) (\\w+) \\3 \\2', r'*\\2--\\g<1>2!\\3',\n",
    "            'abc def def ghi jkl mno mno ghi ABC DEF DEF GHI JKL MNO MNO GHI')\n",
    "# Named groups: \\g<name> in the replacement; a named group still has a number\n",
    "named = (r'\\w+ (?P<word_1>\\w+) (?P<word_2>\\w+ (?P<word_3>\\w+))', r'\\g<word_2>--\\g<word_1>2\\3',\n",
    "         'ab cd ef gh ij AB CD EF GH IJ')\n",
    "\n",
    "print(sub(*numbered))\n",
    "print(sub(*named))\n",
    "print()\n",
    "\n",
    "print(subn(*numbered))\n",
    "print(subn(*named))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

Resource created Wednesday 26 August 2015, 10:26:32 AM.

file: re_module.ipynb


Back to top

COMP9021 15s2 (Principles of Programming) is powered by WebCMS3
CRICOS Provider No. 00098G