Source code for pyformlang.indexed_grammar.indexed_grammar

"""
Representation of an indexed grammar
"""

from typing import Any, Iterable, AbstractSet

import pyformlang

from .duplication_rule import DuplicationRule
from .production_rule import ProductionRule
from .rules import Rules


[docs]class IndexedGrammar: """ Describes an indexed grammar. Parameters ---------- rules : :class:`~pyformlang.indexed_grammar.Rules` The rules of the grammar, in reduced form put into a Rule start_variable : Any, optional The start symbol of the indexed grammar """ def __init__(self, rules: Rules, start_variable: Any = "S"): self.rules = rules self.start_variable = start_variable # Precompute all non-terminals self.non_terminals = rules.non_terminals self.non_terminals.append(self.start_variable) self.non_terminals = set(self.non_terminals) # We cache the marked items in case of future update of the query self.marked = {} # Initialize the marked symbols # Mark the identity for non_terminal_a in self.non_terminals: self.marked[non_terminal_a] = set() temp = frozenset({non_terminal_a}) self.marked[non_terminal_a].add(temp) # Mark all end symbols for non_terminal_a in self.non_terminals: if exists(self.rules.rules, lambda x: x.is_end_rule() and x.left_term == non_terminal_a): self.marked[non_terminal_a].add(frozenset()) @property def terminals(self) -> Iterable[Any]: """Get all the terminals in the grammar Returns ---------- terminals : iterable of any The terminals used in the rules """ return self.rules.terminals def _duplication_processing(self, rule: DuplicationRule): """Processes a duplication rule Parameters ---------- rule : :class:`~pyformlang.indexed_grammar.DuplicationRule` The duplication rule to process """ was_modified = False need_stop = False right_term_marked0 = [] for marked_term0 in self.marked[rule.right_terms[0]]: right_term_marked1 = [] for marked_term1 in self.marked[rule.right_terms[1]]: if marked_term0 <= marked_term1: temp = marked_term1 elif marked_term1 <= marked_term0: temp = marked_term0 else: temp = marked_term0.union(marked_term1) # Check if it was marked before if temp not in self.marked[rule.left_term]: was_modified = True if rule.left_term == rule.right_terms[0]: right_term_marked0.append(temp) elif rule.left_term == rule.right_terms[1]: right_term_marked1.append(temp) else: self.marked[rule.left_term].add(temp) # Stop condition, no need to continue if rule.left_term == self.start_variable and len( temp) == 0: need_stop = True for temp in right_term_marked1: self.marked[rule.right_terms[1]].add(temp) for temp in right_term_marked0: self.marked[rule.right_terms[0]].add(temp) return was_modified, need_stop def _production_process(self, rule: ProductionRule): """Processes a production rule Parameters ---------- rule : :class:`~pyformlang.indexed_grammar.ProductionRule` The production rule to process """ was_modified = False # f_rules contains the consumption rules associated with # the current production symbol f_rules = self.rules.consumption_rules.setdefault( rule.production, []) # l_rules contains the left symbol plus what is marked on # the right side l_temp = [(x.left_term, self.marked[x.right]) for x in f_rules] marked_symbols = [x.left_term for x in f_rules] # Process all combinations of consumption rule was_modified |= addrec_bis(l_temp, self.marked[rule.left_term], self.marked[rule.right_term]) # End condition if frozenset() in self.marked[self.start_variable]: return was_modified, True # Is it useful? if rule.right_term in marked_symbols: for term in [term for term in l_temp if rule.right_term == term[0]]: for sub_term in [sub_term for sub_term in term[1] if sub_term not in self.marked[rule.left_term]]: was_modified = True self.marked[rule.left_term].add(sub_term) if (rule.left_term == self.start_variable and len(sub_term) == 0): return was_modified, True # Edge case if frozenset() in self.marked[rule.right_term]: if frozenset() not in self.marked[rule.left_term]: was_modified = True self.marked[rule.left_term].add(frozenset()) return was_modified, False
[docs] def is_empty(self) -> bool: """Checks whether the grammar generates a word or not Returns ---------- is_empty : bool Whether the grammar is empty or not """ # To know when no more modification are done was_modified = True while was_modified: was_modified = False for rule in self.rules.rules: # If we have a duplication rule, we mark all combinations of # the sets marked on the right side for the symbol on the left # side if rule.is_duplication(): dup_res = self._duplication_processing(rule) was_modified |= dup_res[0] if dup_res[1]: return False elif rule.is_production(): prod_res = self._production_process(rule) if prod_res[1]: return False was_modified |= prod_res[0] if frozenset() in self.marked[self.start_variable]: return False return True
def __bool__(self): return not self.is_empty()
[docs] def get_reachable_non_terminals(self) -> AbstractSet[Any]: """ Get the reachable symbols Returns ---------- reachables : set of any The reachable symbols from the start state """ # Preprocess reachable_from = {} consumption_rules = self.rules.consumption_rules for rule in self.rules.rules: if rule.is_duplication(): left = rule.left_term right0 = rule.right_terms[0] right1 = rule.right_terms[1] if left not in reachable_from: reachable_from[left] = set() reachable_from[left].add(right0) reachable_from[left].add(right1) if rule.is_production(): left = rule.left_term right = rule.right_term if left not in reachable_from: reachable_from[left] = set() reachable_from[left].add(right) for key in consumption_rules: for rule in consumption_rules[key]: left = rule.left_term right = rule.right if left not in reachable_from: reachable_from[left] = set() reachable_from[left].add(right) # Processing to_process = [self.start_variable] reachables = {self.start_variable} while to_process: current = to_process.pop() for symbol in reachable_from.get(current, []): if symbol not in reachables: reachables.add(symbol) to_process.append(symbol) return reachables
[docs] def get_generating_non_terminals(self) -> AbstractSet[Any]: """ Get the generating symbols Returns ---------- generating : set of any The generating symbols from the start state """ # Preprocess generating_from = {} duplication_pointer = {} generating = set() to_process = [] self._preprocess_rules_generating(duplication_pointer, generating, generating_from, to_process) self._preprocess_consumption_rules_generating(generating_from) # Processing while to_process: current = to_process.pop() for symbol in generating_from.get(current, []): if symbol not in generating: generating.add(symbol) to_process.append(symbol) for duplication in duplication_pointer.get(current, []): duplication[1] -= 1 if duplication[1] == 0: if duplication[0] not in generating: generating.add(duplication[0]) to_process.append(duplication[0]) return generating
def _preprocess_consumption_rules_generating(self, generating_from): for key in self.rules.consumption_rules: for rule in self.rules.consumption_rules[key]: left = rule.left_term right = rule.right if right in generating_from: generating_from[right].add(left) else: generating_from[right] = {left} def _preprocess_rules_generating(self, duplication_pointer, generating, generating_from, to_process): for rule in self.rules.rules: if rule.is_duplication(): left = rule.left_term right0 = rule.right_terms[0] right1 = rule.right_terms[1] temp = [left, 2] if right0 in duplication_pointer: duplication_pointer[right0].append(temp) else: duplication_pointer[right0] = [temp] if right1 in duplication_pointer: duplication_pointer[right1].append(temp) else: duplication_pointer[right1] = [temp] if rule.is_production(): left = rule.left_term right = rule.right_term if right in generating_from: generating_from[right].add(left) else: generating_from[right] = {left} if rule.is_end_rule(): left = rule.left_term if left not in generating: generating.add(left) to_process.append(left)
[docs] def remove_useless_rules(self) -> "IndexedGrammar": """ Remove useless rules in the grammar More precisely, we remove rules which do not contain only generating \ or reachable non terminals. Returns ---------- i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar` The indexed grammar which useless rules """ l_rules = [] generating = self.get_generating_non_terminals() reachables = self.get_reachable_non_terminals() consumption_rules = self.rules.consumption_rules for rule in self.rules.rules: if rule.is_duplication(): left = rule.left_term right0 = rule.right_terms[0] right1 = rule.right_terms[1] if all(x in generating and x in reachables for x in [left, right0, right1]): l_rules.append(rule) if rule.is_production(): left = rule.left_term right = rule.right_term if all(x in generating and x in reachables for x in [left, right]): l_rules.append(rule) if rule.is_end_rule(): left = rule.left_term if left in generating and left in reachables: l_rules.append(rule) for key in consumption_rules: for rule in consumption_rules[key]: left = rule.left_term right = rule.right if all(x in generating and x in reachables for x in [left, right]): l_rules.append(rule) rules = Rules(l_rules, self.rules.optim) return IndexedGrammar(rules)
[docs] def intersection(self, other: Any) -> "IndexedGrammar": """ Computes the intersection of the current indexed grammar with the \ other object Equivalent to -------------- >> indexed_grammar and regex Parameters ---------- other : any The object to intersect with Returns ---------- i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar` The indexed grammar which useless rules Raises ------ NotImplementedError When trying to intersection with something else than a regular expression or a finite automaton """ if isinstance(other, pyformlang.regular_expression.Regex): other = other.to_epsilon_nfa() if isinstance(other, pyformlang.finite_automaton.FiniteAutomaton): fst = other.to_fst() return fst.intersection(self) raise NotImplementedError
def __and__(self, other): """ Computes the intersection of the current indexed grammar with the other object Parameters ---------- other : any The object to intersect with Returns ---------- i_grammar : :class:`~pyformlang.indexed_grammar.IndexedGrammar` The indexed grammar which useless rules """ return self.intersection(other)
def exists(list_elements, check_function): """exists Check whether at least an element x of l is True for f(x) :param list_elements: A list of elements to test :param check_function: The checking function (takes one parameter and \ return a boolean) """ for element in list_elements: if check_function(element): return True return False def addrec_bis(l_sets, marked_left, marked_right): """addrec_bis Optimized version of addrec :param l_sets: a list containing tuples (C, M) where: * C is a non-terminal on the left of a consumption rule * M is the set of the marked set for the right non-terminal in the production rule :param marked_left: Sets which are marked for the non-terminal on the left of the production rule :param marked_right: Sets which are marked for the non-terminal on the right of the production rule """ was_modified = False for marked in list(marked_right): l_temp = [x for x in l_sets if x[0] in marked] s_temp = [x[0] for x in l_temp] # At least one symbol to consider if frozenset(s_temp) == marked and len(marked) > 0: was_modified |= addrec_ter(l_temp, marked_left) return was_modified def addrec_ter(l_sets, marked_left): """addrec Explores all possible combination of consumption rules to mark a production rule. :param l_sets: a list containing tuples (C, M) where: * C is a non-terminal on the left of a consumption rule * M is the set of the marked set for the right non-terminal in the production rule :param marked_left: Sets which are marked for the non-terminal on the left of the production rule :return Whether an element was actually marked """ # End condition, nothing left to process temp_in = [x[0] for x in l_sets] exists_after = [ exists(l_sets[index + 1:], lambda x: x[0] == l_sets[index][0]) for index in range(len(l_sets))] exists_before = [l_sets[index][0] in temp_in[:index] for index in range(len(l_sets))] marked_sets = [l_sets[index][1] for index in range(len(l_sets))] marked_sets = [sorted(x, key=lambda x: -len(x)) for x in marked_sets] # Try to optimize by having an order of the sets sorted_zip = sorted(zip(exists_after, exists_before, marked_sets), key=lambda x: -len(x[2])) exists_after, exists_before, marked_sets = \ zip(*sorted_zip) res = False # contains tuples of index, temp_set to_process = [(0, frozenset())] done = set() while to_process: index, new_temp = to_process.pop() if index >= len(l_sets): # Check if at least one non-terminal was considered, then if the # set of non-terminals considered is marked of the right # non-terminal in the production rule, then if a new set is # marked or not if new_temp not in marked_left: marked_left.add(new_temp) res = True continue if exists_before[index] or exists_after[index]: to_append = (index + 1, new_temp) to_process.append(to_append) if not exists_before[index]: # For all sets which were marked for the current consumption rule for marked_set in marked_sets[index]: if marked_set <= new_temp: to_append = (index + 1, new_temp) elif new_temp <= marked_set: to_append = (index + 1, marked_set) else: to_append = (index + 1, new_temp.union(marked_set)) if to_append not in done: done.add(to_append) to_process.append(to_append) return res