unify.py

View in browser Download
# Written by Eric Martin for COMP9021

'''Valid terms are inductively defined as variables
or expressions of the form f(t_1, ... ,t_n)
where n >=0 and f is  an n-ary function symbol
(in case n = 0, f is a constant).

Variables and function symbols consist of
alphanumeric characters and underscores.
Variables start with an uppercase letter or an underscore.
Functions symbols start with a lowercase letter.

Whenever a term is output, it is "nicely formatted"
in that a single space is inserted after and only after a comma.
'''

def variables(term):
    '''
    >>> variables('x')
    set()
    >>> variables('X')
    {'X'}
    >>> variables('f(X, a, X)')
    {'X'}
    >>> variables('f(X_0, Y, X_0)') == {'Y', 'X_0'}
    True
    >>> (variables('f(c, f(X, f(a, Z, b), f(f(X, Z, U), a, T)), f(a, U, a))')
    ...                                                == {'U', 'T', 'X', 'Z'})
    True
    '''
    variables = set()
    in_word = False
    in_variable = False
    for c in term:
        if c in ',() ':
            in_word = False
            if in_variable:
                variables.add(variable)
                in_variable = False
        elif not in_word:
            in_word = True
            if c.isupper() or c == '_':
                in_variable = True
                variable = c
        elif in_variable:
            variable += c
    # Case where term is a variable
    if in_variable:
        variables.add(variable)
    return variables
            

def rename_variables(term_1, term_2):
    '''
    returns term_2 where all variables that occur
    in both term_1 and term_2 have been renamed
    to new variables.
    
    >>> rename_variables('a', 'a')
    'a'
    >>> rename_variables('X', 'Y')
    'Y'
    >>> rename_variables('X', 'X')
    'X_0'
    >>> rename_variables('f(a, X, Y)', 'g(b,   Y,Z, Y)')
    'g(b, Y_0, Z, Y_0)'
    >>> rename_variables('f(a, X,  g( Y, b), h  ( h(h(Z))))',
    ...                  'h(h(f(U,Y,  g (Z,  Z)) ))')
    'h(h(f(U, Y_0, g(Z_0, Z_0))))'
    '''
    variables_1 = variables(term_1)
    variables_2 = variables(term_2)
    substitutions = {}
    # Any variable Var that occurs in both term_1 and term_2
    # will be renamed to Var_i where i is the least natural number
    # that makes Var_i a new variable (that is, occurring neither
    # in term_1 nor in term_2 nor in the set of variables
    # that have been created so far, if any).
    for variable in variables_1 & variables_2:
        i = 0
        while variable + '_' + str(i) in variables_1 | variables_2:
            i += 1
        substitutions[variable] = variable + '_' + str(i)
    return nicely_formatted(substitute(term_2, substitutions))
            

def tokens(term):
    '''Returns a decomposition of term as a list of
    commas, opening parentheses, closing parentheses,
    function symbols, and variables.
    >>> tokens('h( f(a1,   U_0))')
    ['h', '(', 'f', '(', 'a1', ',', 'U_0', ')', ')']
    '''
    term = term.replace(' ', '')
    tokens = []
    word = ''
    for c in term:
        if c in ',()':
            if word:
                tokens.append(word)
                word = ''
            tokens.append(c)
        else:
            word += c
    # Case where term is a variable or a constant.
    if word:
        tokens.append(word)
    return tokens


def substitute(term, substitutions):
    '''
    >>> substitute('f(a, X , g(Y,  b), h(h(h(Z))))', {'X': 'X_0', 'Z': 'Z_0'})
    'f(a, X_0, g(Y, b), h(h(h(Z_0))))'
    >>> substitute('h(h(f(U,Y,   g(Z , Z))))', {'Z': 'ZZZ'})
    'h(h(f(U, Y, g(ZZZ, ZZZ))))'
    '''
    term_tokens = tokens(term)
    for i in range(len(term_tokens)):
        if term_tokens[i] in substitutions:
            term_tokens[i] = substitutions[term_tokens[i]]
    return nicely_formatted(''.join(term_tokens))


def unify(term_1, term_2):
    '''
    Returns a most general unifier of term_1 and term_2 if one exists,
    in the form of a dictionnary whose keys are variables and
    whose values are the terms that should be substituted
    for the associated variables; otherwise returns False.
    
    >>> unify('X', 'X')
    {}
    >>> unify('X', 'a')
    {'X': 'a'}
    >>> unify('X', 'Y') == {'X': 'Y'} or unify('X', 'Y') == {'Y': 'X'}
    True
    >>> (unify('f(X, Y)', 'f(Y, X)') == {'X': 'Y'} or
    ...               unify('f(X, Y)', 'f(Y, X)') == {'Y': 'X'})
    True
    >>> unify('f(X1, h(X1), X2)', 'f(g(X3), X4, X3)') == {'X4': 'h(X1)',
    ... 'X2': 'X3', 'X1': 'g(X3)'} or unify('f(X1, h(X1), X2)',
    ...  'f(g(X3), X4, X3)') == {'X4': 'h(g(X3))', 'X2': 'X3', 'X1': 'g(X3)'}
    True
    >>> unifier = unify('f(X1 ,g(X2, X3), X2, b)',
    ...                          'f(g(h(a, X5), X2), X1, h(a, X4), X4)')
    >>> for _ in range(len(unifier)):
    ...           unifier = {variable: substitute(unifier[variable], unifier)
    ...                  for variable in unifier}
    >>> unifier == {'X3': 'h(a, b)', 'X1': 'g(h(a, b), h(a, b))',
    ...                                'X4': 'b', 'X5': 'b', 'X2': 'h(a, b)'}
    True
    >>> unify('f(X, Y, U)', 'f(Y, U, g(X))')
    False
    '''
    return unify_identities([[term_1, term_2]])


def unify_identities(identities):
    assignments = {}
    while identities:
        term_1, term_2 = identities.pop()
        if term_1 == term_2:
            continue
        # Identify of the form: Variable = term
        if is_variable(term_1):
            # Occurrence check (omitted in most Prolog implementations)
            if term_1 in tokens(term_2):
                return False
            assignments[term_1] = term_2
            # Replace all occurrences of term_1 by term_2
            # in all terms in identities
            for identity in identities:
                for i in range(2):
                    identity[i] = substitute(identity[i], {term_1: term_2})
            continue
        # Identify of the form: term = Variable
        # Transform into: Variable = term
        if is_variable(term_2):
            identities.append([term_2, term_1])
            continue
        # Identify of the form: f(t_1, ..., t_n) = g(t'_1, ..., t'_m)
        # f and g should be identical, n and m should be equal.
        decomposition_1 = parse(term_1)
        decomposition_2 = parse(term_2)
        if decomposition_1[0] != decomposition_2[0] or\
                        len(decomposition_1) != len(decomposition_2):
            return False
        # Then add the identities t_1 = t'_1, ..., t_n = t'_n
        for i in range(1, len(decomposition_1)):
            identities.append([decomposition_1[i], decomposition_2[i]])
    return assignments


def check_validity(term):
    '''
    >>> check_validity('x_12')
    True
    >>> check_validity('X')
    True
    >>> check_validity('_alpha')
    True
    >>> check_validity('sum(Two, three)')
    True
    >>> check_validity('f(g_1(a, B, c), g_2(A_, _4, pib))')
    True
    >>> check_validity('g(_, fun(fun(fun(fun(g(X, b))))), X)')
    True
    >>> check_validity('f(c, f(X, f(a, Z, b), f(f(X, Z, U), a, T)), f(a, U, a))'
    ...               )
    True
    >>> check_validity('2')
    False
    >>> check_validity('()')
    False
    >>> check_validity('Sum(Two, Three)')
    False
    >>> check_validity('f(g(a, b, c)), d)')
    False
    >>> check_validity('g(a, b, )')
    False
    '''
    term = term.replace(' ', '')
    if not term:
        return False
    # A term is built from alphanumeric characters, underscores,
    # commas, and parentheses.
    if any(not c.isalnum() and c not in '_,()' for c in term):
        return False
    # When we find an opening parenthesis, we increment
    # non_closed_opening_parentheses_nb.
    # When we find a closing parenthesis, we decrement
    # non_closed_opening_parentheses_nb.
    # non_closed_opening_parentheses_nb should never become negative
    # and in case it becomes nonnull, then it should eventually
    # become null with no symbol following.
    non_closed_opening_parentheses_nb = 0
    has_parentheses = False
    start_of_token = term[0]
    # A term starts with a letter or an underscore.
    if start_of_token.isnumeric() or start_of_token in ',()':
        return False 
    for c in term[1: ]:
        # Do not want to read a character
        # after all opening parentheses have been closed.
        if has_parentheses and non_closed_opening_parentheses_nb == 0:
            return False
        # Reading a variable or function symbol beyond its first symbol.
        if c not in ',()' and start_of_token not in ',()':
            continue
        # A comma should follow a term or a closing parenthesis.
        if c == ',' and (start_of_token.isnumeric() or start_of_token in ',('):
            return False           
        elif c == '(':
            has_parentheses = True
            non_closed_opening_parentheses_nb += 1
            # An opening parenthesis should follow a function symbol.
            if not start_of_token.islower():
                return False
        elif c == ')':
            non_closed_opening_parentheses_nb -= 1
            # A closing parenthesis should close an opening parenthesis
            # and follow a term or a closing parenthesis.
            if non_closed_opening_parentheses_nb < 0 or\
                        start_of_token.isnumeric() or\
                        start_of_token in ',(':
                return False
        start_of_token = c
    if non_closed_opening_parentheses_nb:
        return False
    return True
            
    
def parse(term):
    '''With term of the form f(t_1, ..., t_n), returns
    [f, t_1, ..., t_n]

    >>> parse('x_12')
    ['x_12']
    >>> parse('X')
    ['X']
    >>> parse('_alpha')
    ['_alpha']
    >>> parse('sum(Two,  three)')
    ['sum', 'Two', 'three']
    >>> parse('f(g_1( a , B, c), g_2(A_,   _4, pib))')
    ['f', 'g_1(a,B,c)', 'g_2(A_, _4, pib)']
    >>> parse('g(_, fun(  fun  ( fun(fun(g(  X,b))))), X)')
    ['g', '_', 'fun(fun(fun(fun(g(X,b)))))', 'X']
    '''
    term = term.replace(' ', '')
    words = []
    word = ''
    non_closed_opening_parentheses_nb = 0
    for c in term:
        # word is t_1 or ... or t_{n-1} from f(t_1, ..., t_n)
        if c == ',' and non_closed_opening_parentheses_nb == 1:
            words.append(word)
            word = ''
        elif c == '(':
            # word is f from f(t_1, ..., t_n)
            if non_closed_opening_parentheses_nb == 0:
                words.append(nicely_formatted(word))
                word = ''
            else:                
                word += '('
            non_closed_opening_parentheses_nb += 1
        elif c == ')':
            non_closed_opening_parentheses_nb -= 1
            if non_closed_opening_parentheses_nb:
                    word += ')'
        else:
            word += c
    # word is t_n from f(t_1, ..., t_n)
    # or the whole term in case it is a constant or a variable.
    words.append(nicely_formatted(word))
    return words


def is_variable(term):
    return term[0].isupper() or term[0] == '_' 


def nicely_formatted(term):
    return term.replace(' ', '').replace(',', ', ')


if __name__ == '__main__':
    import doctest
    doctest.testmod()
Resource created Wednesday 19 August 2015, 10:07:42 AM.
file: unify.py
CRICOS Provider No. 00098G