<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;"># Extracts titles from a front page of the Sydney Morning Herald.
#
# Written by Eric Martin for COMP9021

# Note that some non-ASCII characters do not display properly
# when the program is run from IDLE, but do display properly
# when the program is run from the command line.


import re


def extract_text(line, pattern):
    """Print the text captured by the first group of pattern in line.

    The first occurrence of pattern in line is searched for. When there
    is none, nothing is printed. Otherwise the text captured by the
    first group of pattern is printed, with every occurrence of
    '&amp;nbsp;' removed from it.
    """
    match = pattern.search(line)
    if match is None:
        return
    print(match.group(1).replace('&amp;nbsp;', ''))

# A title normally sits on a single line, in text of the form
# title=....&gt;TITLE&lt;/a&gt;&lt;/h3&gt;
full_title_pattern = re.compile('title=[^&gt;]*&gt;([^&lt;]*)&lt;/a&gt;&lt;/h3&gt;')
# In some cases the line ends right after TITLE, and the closing
# &lt;/a&gt;&lt;/h3&gt; tags are at the beginning of the next line instead.
title_at_end_of_line_pattern = re.compile('[^&gt;]*&gt;([^&lt;]*)\n$')
end_tags_at_start_of_next_line_pattern = re.compile('^&lt;/a&gt;&lt;/h3&gt;')

with open('SMH.txt', 'r') as smh_page:
    # Go through the file with one line of lookahead: which pattern is
    # applied to a line depends on whether the line that follows it
    # starts with the closing tags.
    current_line = smh_page.readline()
    for following_line in smh_page:
        title_is_split = end_tags_at_start_of_next_line_pattern.search(
            following_line
        )
        extract_text(
            current_line,
            title_at_end_of_line_pattern if title_is_split
            else full_title_pattern,
        )
        current_line = following_line
    # No line follows the last one, so it can only hold a complete title.
    extract_text(current_line, full_title_pattern)
</pre></body></html>