Source code for pyformlang.indexed_grammar.indexed_grammar

"""
Representation of an indexed grammar
"""

from typing import Any, Iterable, AbstractSet

import pyformlang

from .duplication_rule import DuplicationRule
from .production_rule import ProductionRule
from .rules import Rules



[docs]
class IndexedGrammar:
    """ Describes an indexed grammar.

    Parameters
    ----------
    rules : :class:`~pyformlang.indexed_grammar.Rules`
        The rules of the grammar, in reduced form put into a Rule
    start_variable : Any, optional
        The start symbol of the indexed grammar
    """

    def __init__(self,
                 rules: Rules,
                 start_variable: Any = "S"):
        self.rules = rules
        self.start_variable = start_variable
        # Precompute all non-terminals
        self.non_terminals = rules.non_terminals
        self.non_terminals.append(self.start_variable)
        self.non_terminals = set(self.non_terminals)
        # We cache the marked items in case of future update of the query
        self.marked = {}
        # Initialize the marked symbols
        # Mark the identity
        for non_terminal_a in self.non_terminals:
            self.marked[non_terminal_a] = set()
            temp = frozenset({non_terminal_a})
            self.marked[non_terminal_a].add(temp)
        # Mark all end symbols
        for non_terminal_a in self.non_terminals:
            if exists(self.rules.rules,
                      lambda x: x.is_end_rule()
                      and x.left_term == non_terminal_a):
                self.marked[non_terminal_a].add(frozenset())

    @property
    def terminals(self) -> Iterable[Any]:
        """Get all the terminals in the grammar

        Returns
        ----------
        terminals : iterable of any
            The terminals used in the rules
        """
        return self.rules.terminals

    def _duplication_processing(self, rule: DuplicationRule):
        """Processes a duplication rule

        Parameters
        ----------
        rule : :class:`~pyformlang.indexed_grammar.DuplicationRule`
            The duplication rule to process
        """
        was_modified = False
        need_stop = False
        right_term_marked0 = []
        for marked_term0 in self.marked[rule.right_terms[0]]:
            right_term_marked1 = []
            for marked_term1 in self.marked[rule.right_terms[1]]:
                if marked_term0 <= marked_term1:
                    temp = marked_term1
                elif marked_term1 <= marked_term0:
                    temp = marked_term0
                else:
                    temp = marked_term0.union(marked_term1)
                # Check if it was marked before
                if temp not in self.marked[rule.left_term]:
                    was_modified = True
                    if rule.left_term == rule.right_terms[0]:
                        right_term_marked0.append(temp)
                    elif rule.left_term == rule.right_terms[1]:
                        right_term_marked1.append(temp)
                    else:
                        self.marked[rule.left_term].add(temp)
                    # Stop condition, no need to continue
                    if rule.left_term == self.start_variable and len(
                            temp) == 0:
                        need_stop = True
            for temp in right_term_marked1:
                self.marked[rule.right_terms[1]].add(temp)
        for temp in right_term_marked0:
            self.marked[rule.right_terms[0]].add(temp)

        return was_modified, need_stop

    def _production_process(self, rule: ProductionRule):
        """Processes a production rule

        Parameters
        ----------
        rule : :class:`~pyformlang.indexed_grammar.ProductionRule`
            The production rule to process
        """
        was_modified = False
        # f_rules contains the consumption rules associated with
        # the current production symbol
        f_rules = self.rules.consumption_rules.setdefault(
            rule.production, [])
        # l_rules contains the left symbol plus what is marked on
        # the right side
        l_temp = [(x.left_term,
                   self.marked[x.right]) for x in f_rules]
        marked_symbols = [x.left_term for x in f_rules]
        # Process all combinations of consumption rule
        was_modified |= addrec_bis(l_temp,
                                   self.marked[rule.left_term],
                                   self.marked[rule.right_term])
        # End condition
        if frozenset() in self.marked[self.start_variable]:
            return was_modified, True
        # Is it useful?
        if rule.right_term in marked_symbols:
            for term in [term for term in l_temp
                         if rule.right_term == term[0]]:
                for sub_term in [sub_term
                                 for sub_term in term[1]
                                 if sub_term not in
                                 self.marked[rule.left_term]]:
                    was_modified = True
                    self.marked[rule.left_term].add(sub_term)
                    if (rule.left_term == self.start_variable and
                            len(sub_term) == 0):
                        return was_modified, True
        # Edge case
        if frozenset() in self.marked[rule.right_term]:
            if frozenset() not in self.marked[rule.left_term]:
                was_modified = True
                self.marked[rule.left_term].add(frozenset())
        return was_modified, False


[docs]
    def is_empty(self) -> bool:
        """Checks whether the grammar generates a word or not

        Returns
        ----------
        is_empty : bool
            Whether the grammar is empty or not
        """
        # To know when no more modification are done
        was_modified = True
        while was_modified:
            was_modified = False
            for rule in self.rules.rules:
                # If we have a duplication rule, we mark all combinations of
                # the sets marked on the right side for the symbol on the left
                # side
                if rule.is_duplication():
                    dup_res = self._duplication_processing(rule)
                    was_modified |= dup_res[0]
                    if dup_res[1]:
                        return False
                elif rule.is_production():
                    prod_res = self._production_process(rule)
                    if prod_res[1]:
                        return False
                    was_modified |= prod_res[0]
        if frozenset() in self.marked[self.start_variable]:
            return False
        return True


    def __bool__(self):
        return not self.is_empty()


[docs]
    def get_reachable_non_terminals(self) -> AbstractSet[Any]:
        """ Get the reachable symbols

        Returns
        ----------
        reachables : set of any
            The reachable symbols from the start state
        """
        # Preprocess
        reachable_from = {}
        consumption_rules = self.rules.consumption_rules
        for rule in self.rules.rules:
            if rule.is_duplication():
                left = rule.left_term
                right0 = rule.right_terms[0]
                right1 = rule.right_terms[1]
                if left not in reachable_from:
                    reachable_from[left] = set()
                reachable_from[left].add(right0)
                reachable_from[left].add(right1)
            if rule.is_production():
                left = rule.left_term
                right = rule.right_term
                if left not in reachable_from:
                    reachable_from[left] = set()
                reachable_from[left].add(right)
        for key in consumption_rules:
            for rule in consumption_rules[key]:
                left = rule.left_term
                right = rule.right
                if left not in reachable_from:
                    reachable_from[left] = set()
                reachable_from[left].add(right)
        # Processing
        to_process = [self.start_variable]
        reachables = {self.start_variable}
        while to_process:
            current = to_process.pop()
            for symbol in reachable_from.get(current, []):
                if symbol not in reachables:
                    reachables.add(symbol)
                    to_process.append(symbol)
        return reachables



[docs]
    def get_generating_non_terminals(self) -> AbstractSet[Any]:
        """ Get the generating symbols

        Returns
        ----------
        generating : set of any
            The generating symbols from the start state
        """
        # Preprocess
        generating_from = {}
        duplication_pointer = {}
        generating = set()
        to_process = []
        self._preprocess_rules_generating(duplication_pointer, generating,
                                          generating_from, to_process)
        self._preprocess_consumption_rules_generating(generating_from)
        # Processing
        while to_process:
            current = to_process.pop()
            for symbol in generating_from.get(current, []):
                if symbol not in generating:
                    generating.add(symbol)
                    to_process.append(symbol)
            for duplication in duplication_pointer.get(current, []):
                duplication[1] -= 1
                if duplication[1] == 0:
                    if duplication[0] not in generating:
                        generating.add(duplication[0])
                        to_process.append(duplication[0])
        return generating


    def _preprocess_consumption_rules_generating(self, generating_from):
        for key in self.rules.consumption_rules:
            for rule in self.rules.consumption_rules[key]:
                left = rule.left_term
                right = rule.right
                if right in generating_from:
                    generating_from[right].add(left)
                else:
                    generating_from[right] = {left}

    def _preprocess_rules_generating(self, duplication_pointer, generating,
                                     generating_from, to_process):
        for rule in self.rules.rules:
            if rule.is_duplication():
                left = rule.left_term
                right0 = rule.right_terms[0]
                right1 = rule.right_terms[1]
                temp = [left, 2]
                if right0 in duplication_pointer:
                    duplication_pointer[right0].append(temp)
                else:
                    duplication_pointer[right0] = [temp]
                if right1 in duplication_pointer:
                    duplication_pointer[right1].append(temp)
                else:
                    duplication_pointer[right1] = [temp]
            if rule.is_production():
                left = rule.left_term
                right = rule.right_term
                if right in generating_from:
                    generating_from[right].add(left)
                else:
                    generating_from[right] = {left}
            if rule.is_end_rule():
                left = rule.left_term
                if left not in generating:
                    generating.add(left)
                    to_process.append(left)


[docs]
    def remove_useless_rules(self) -> "IndexedGrammar":
        """ Remove useless rules in the grammar

        More precisely, we remove rules which do not contain only generating \
        or  reachable non terminals.

        Returns
        ----------
        i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar`
            The indexed grammar which useless rules
        """
        l_rules = []
        generating = self.get_generating_non_terminals()
        reachables = self.get_reachable_non_terminals()
        consumption_rules = self.rules.consumption_rules
        for rule in self.rules.rules:
            if rule.is_duplication():
                left = rule.left_term
                right0 = rule.right_terms[0]
                right1 = rule.right_terms[1]
                if all(x in generating and x in reachables for x in
                       [left, right0, right1]):
                    l_rules.append(rule)
            if rule.is_production():
                left = rule.left_term
                right = rule.right_term
                if all(x in generating and x in reachables for x in
                       [left, right]):
                    l_rules.append(rule)
            if rule.is_end_rule():
                left = rule.left_term
                if left in generating and left in reachables:
                    l_rules.append(rule)
        for key in consumption_rules:
            for rule in consumption_rules[key]:
                left = rule.left_term
                right = rule.right
                if all(x in generating and x in reachables for x in
                       [left, right]):
                    l_rules.append(rule)
        rules = Rules(l_rules, self.rules.optim)
        return IndexedGrammar(rules)



[docs]
    def intersection(self, other: Any) -> "IndexedGrammar":
        """ Computes the intersection of the current indexed grammar with the \
        other object

        Equivalent to
        --------------
          >> indexed_grammar and regex

        Parameters
        ----------
        other : any
            The object to intersect with

        Returns
        ----------
        i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar`
            The indexed grammar which useless rules

        Raises
        ------
        NotImplementedError
            When trying to intersection with something else than a regular
            expression or a finite automaton
        """
        if isinstance(other, pyformlang.regular_expression.Regex):
            other = other.to_epsilon_nfa()
        if isinstance(other, pyformlang.finite_automaton.FiniteAutomaton):
            fst = other.to_fst()
            return fst.intersection(self)
        raise NotImplementedError


    def __and__(self, other):
        """ Computes the intersection of the current indexed grammar with the
        other object

        Parameters
        ----------
        other : any
            The object to intersect with

        Returns
        ----------
        i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar`
            The indexed grammar which useless rules
        """
        return self.intersection(other)



def exists(list_elements, check_function):
    """exists
    Check whether at least an element x of l is True for f(x)
    :param list_elements: A list of elements to test
    :param check_function: The checking function (takes one parameter and  \
    return a boolean)
    """
    for element in list_elements:
        if check_function(element):
            return True
    return False


def addrec_bis(l_sets, marked_left, marked_right):
    """addrec_bis
    Optimized version of addrec
    :param l_sets: a list containing tuples (C, M) where:
        * C is a non-terminal on the left of a consumption rule
        * M is the set of the marked set for the right non-terminal in the
        production rule
    :param marked_left: Sets which are marked for the non-terminal on the
    left of the production rule
    :param marked_right: Sets which are marked for the non-terminal on the
    right of the production rule
    """
    was_modified = False
    for marked in list(marked_right):
        l_temp = [x for x in l_sets if x[0] in marked]
        s_temp = [x[0] for x in l_temp]
        # At least one symbol to consider
        if frozenset(s_temp) == marked and len(marked) > 0:
            was_modified |= addrec_ter(l_temp, marked_left)
    return was_modified


def addrec_ter(l_sets, marked_left):
    """addrec
    Explores all possible combination of consumption rules to mark a
    production rule.
    :param l_sets: a list containing tuples (C, M) where:
        * C is a non-terminal on the left of a consumption rule
        * M is the set of the marked set for the right non-terminal in the
        production rule
    :param marked_left: Sets which are marked for the non-terminal on the
    left of the production rule
    :return Whether an element was actually marked
    """
    # End condition, nothing left to process
    temp_in = [x[0] for x in l_sets]
    exists_after = [
        exists(l_sets[index + 1:], lambda x: x[0] == l_sets[index][0])
        for index in range(len(l_sets))]
    exists_before = [l_sets[index][0] in temp_in[:index]
                     for index in range(len(l_sets))]
    marked_sets = [l_sets[index][1] for index in range(len(l_sets))]
    marked_sets = [sorted(x, key=lambda x: -len(x)) for x in marked_sets]
    # Try to optimize by having an order of the sets
    sorted_zip = sorted(zip(exists_after, exists_before, marked_sets),
                        key=lambda x: -len(x[2]))
    exists_after, exists_before, marked_sets = \
        zip(*sorted_zip)
    res = False
    # contains tuples of index, temp_set
    to_process = [(0, frozenset())]
    done = set()
    while to_process:
        index, new_temp = to_process.pop()
        if index >= len(l_sets):
            # Check if at least one non-terminal was considered, then if the
            # set of non-terminals considered is marked of the right
            # non-terminal in the production rule, then if a new set is
            # marked or not
            if new_temp not in marked_left:
                marked_left.add(new_temp)
                res = True
            continue
        if exists_before[index] or exists_after[index]:
            to_append = (index + 1, new_temp)
            to_process.append(to_append)
        if not exists_before[index]:
            # For all sets which were marked for the current consumption rule
            for marked_set in marked_sets[index]:
                if marked_set <= new_temp:
                    to_append = (index + 1, new_temp)
                elif new_temp <= marked_set:
                    to_append = (index + 1, marked_set)
                else:
                    to_append = (index + 1, new_temp.union(marked_set))
                if to_append not in done:
                    done.add(to_append)
                    to_process.append(to_append)
    return res