Source code for pyformlang.regular_expression.regex

"""
Representation of a regular expression
"""
from typing import Iterable

from pyformlang import finite_automaton
# pylint: disable=cyclic-import
import pyformlang.regular_expression.regex_objects
from pyformlang import cfg
from pyformlang.finite_automaton import State
# pylint: disable=cyclic-import
from pyformlang.regular_expression.regex_reader import RegexReader
from pyformlang import regular_expression


[docs]class Regex(RegexReader): """ Represents a regular expression Pyformlang implements the operators of textbooks, which deviate slightly \ from the operators in Python. For a representation closer to Python one, \ please use :class:`~pyformlang.regular_expression.PythonRegex` * The concatenation can be represented either by a space or a dot (.) * The union is represented either by | or + * The Kleene star is represented by * * The epsilon symbol can either be "epsilon" or $ It is also possible to use parentheses. All symbols except the space, ., \ |, +, *, (, ), epsilon and $ can be part of the alphabet. All \ other common regex operators (such as []) are syntactic sugar that can be \ reduced to the previous operators. Another main difference is that the \ alphabet is not reduced to single characters as it is the case in Python. \ For example, "python" is a single symbol in Pyformlang, whereas it is the \ concatenation of six symbols in regular Python. All special characters except epsilon can be escaped with a backslash (\ double backslash \\ in strings). Parameters ---------- regex : str The regex represented as a string Raises ------ MisformedRegexError If the regular expression is misformed. Examples -------- >>> regex = Regex("abc|d") Check if the symbol "abc" is accepted >>> regex.accepts(["abc"]) True Check if the word composed of the symbols "a", "b" and "c" is accepted >>> regex.accepts(["a", "b", "c"]) False Check if the symbol "d" is accepted >>> regex.accepts(["d"]) # True >>> regex1 = Regex("a b") >>> regex_concat = regex.concatenate(regex1) >>> regex_concat.accepts(["d", "a", "b"]) True >>> print(regex_concat.get_tree_str()) Operator(Concatenation) Operator(Union) Symbol(abc) Symbol(d) Operator(Concatenation) Symbol(a) Symbol(b) Give the equivalent finite-state automaton >>> regex_concat.to_epsilon_nfa() """ def __init__(self, regex): self.head = None self.sons = None super().__init__(regex) self._counter = 0 self._initialize_enfa() self._enfa = None def _initialize_enfa(self): self._enfa = finite_automaton.EpsilonNFA()
[docs] def get_number_symbols(self) -> int: """ Gives the number of symbols in the regex Returns ---------- n_symbols : int The number of symbols in the regex Examples -------- >>> regex = Regex("a|b*") >>> regex.get_number_symbols() 2 The two symbols are "a" and "b". """ if self.sons: return sum(son.get_number_symbols() for son in self.sons) return 1
[docs] def get_number_operators(self) -> int: """ Gives the number of operators in the regex Returns ---------- n_operators : int The number of operators in the regex Examples -------- >>> regex = Regex("a|b*") >>> regex.get_number_operators() 2 The two operators are "|" and "*". """ if self.sons: return 1 + sum(son.get_number_operators() for son in self.sons) return 0
[docs] def to_epsilon_nfa(self): """ Transforms the regular expression into an epsilon NFA Returns ---------- enfa : :class:`~pyformlang.finite_automaton.EpsilonNFA` An epsilon NFA equivalent to the regex Examples -------- >>> regex = Regex("abc|d") >>> regex.to_epsilon_nfa() """ self._initialize_enfa() s_initial = self._set_and_get_initial_state_in_enfa() s_final = self._set_and_get_final_state_in_enfa() self._process_to_enfa(s_initial, s_final) return self._enfa
def _set_and_get_final_state_in_enfa(self): s_final = self._get_next_state_enfa() self._enfa.add_final_state(s_final) return s_final def _get_next_state_enfa(self): s_final = finite_automaton.State(self._counter) self._counter += 1 return s_final def _set_and_get_initial_state_in_enfa(self): s_initial = self._get_next_state_enfa() self._enfa.add_start_state(s_initial) return s_initial def _process_to_enfa(self, s_from: State, s_to: State): """ Internal function to add a regex to a given epsilon NFA Parameters ---------- s_from : :class:`~pyformlang.finite_automaton.State` The source state s_to : :class:`~pyformlang.finite_automaton.State` The destination state """ if self.sons: self._process_to_enfa_when_sons(s_from, s_to) else: self._process_to_enfa_when_no_son(s_from, s_to) def _process_to_enfa_when_no_son(self, s_from, s_to): if isinstance(self.head, pyformlang.regular_expression.regex_objects.Epsilon): self._add_epsilon_transition_in_enfa_between(s_from, s_to) elif not isinstance(self.head, pyformlang.regular_expression.regex_objects.Empty): symbol = finite_automaton.Symbol(self.head.value) self._enfa.add_transition(s_from, symbol, s_to) def _process_to_enfa_when_sons(self, s_from, s_to): if isinstance( self.head, pyformlang.regular_expression.regex_objects.Concatenation): self._process_to_enfa_concatenation(s_from, s_to) elif isinstance(self.head, pyformlang.regular_expression.regex_objects.Union): self._process_to_enfa_union(s_from, s_to) elif isinstance( self.head, pyformlang.regular_expression.regex_objects.KleeneStar): self._process_to_enfa_kleene_star(s_from, s_to) def _process_to_enfa_kleene_star(self, s_from, s_to): # pylint: disable=protected-access state_first = self._get_next_state_enfa() state_second = self._get_next_state_enfa() self._add_epsilon_transition_in_enfa_between(state_second, state_first) self._add_epsilon_transition_in_enfa_between(s_from, s_to) self._add_epsilon_transition_in_enfa_between(s_from, state_first) self._add_epsilon_transition_in_enfa_between(state_second, s_to) self._process_to_enfa_son(state_first, state_second, 0) def _process_to_enfa_union(self, s_from, s_to): son_number = 0 self._create_union_branch_in_enfa(s_from, s_to, son_number) son_number = 1 self._create_union_branch_in_enfa(s_from, s_to, son_number) def _create_union_branch_in_enfa(self, s_from, s_to, son_number): state0 = self._get_next_state_enfa() state2 = self._get_next_state_enfa() self._add_epsilon_transition_in_enfa_between(s_from, state0) self._add_epsilon_transition_in_enfa_between(state2, s_to) self._process_to_enfa_son(state0, state2, son_number) def _process_to_enfa_concatenation(self, s_from, s_to): state0 = self._get_next_state_enfa() state1 = self._get_next_state_enfa() self._add_epsilon_transition_in_enfa_between(state0, state1) self._process_to_enfa_son(s_from, state0, 0) self._process_to_enfa_son(state1, s_to, 1) def _add_epsilon_transition_in_enfa_between(self, state0, state1): self._enfa.add_transition(state0, finite_automaton.Epsilon(), state1) def _process_to_enfa_son(self, s_from, s_to, index_son): # pylint: disable=protected-access self.sons[index_son]._counter = self._counter self.sons[index_son]._enfa = self._enfa self.sons[index_son]._process_to_enfa(s_from, s_to) self._counter = self.sons[index_son]._counter
[docs] def get_tree_str(self, depth: int = 0) -> str: """ Get a string representation of the tree behind the regex Parameters ---------- depth: int The current depth, 0 by default Returns ------- representation: str The tree representation Examples -------- >>> regex = Regex("abc|d*") >>> print(regex.get_tree_str()) Operator(Union) Symbol(abc) Operator(Kleene Star) Symbol(d) """ temp = " " * depth + str(self.head) + "\n" for son in self.sons: temp += son.get_tree_str(depth + 1) return temp
[docs] def to_cfg(self, starting_symbol="S") -> "CFG": """ Turns the regex into a context-free grammar Parameters ---------- starting_symbol : :class:`~pyformlang.cfg.Variable`, optional The starting symbol Returns ------- cfg : :class:`~pyformlang.cfg.CFG` An equivalent context-free grammar Examples -------- >>> regex = Regex("(a|b)* c") >>> my_cfg = regex.to_cfg() >>> my_cfg.contains(["c"]) True """ productions, _ = self._get_production(starting_symbol) cfg_res = cfg.CFG(start_symbol=cfg.utils.to_variable(starting_symbol), productions=set(productions)) return cfg_res
def _get_production(self, current_symbol, count=0): next_symbols = [] next_productions = [] for son in self.sons: next_symbol = "A" + str(count) count += 1 # pylint: disable=protected-access new_prods, count = son._get_production(next_symbol, count) next_symbols.append(next_symbol) next_productions += new_prods new_prods = self.head.get_cfg_rules(current_symbol, next_symbols) next_productions += new_prods return next_productions, count def __repr__(self): return self.head.get_str_repr([str(son) for son in self.sons])
[docs] def union(self, other: "Regex") -> "Regex": """ Makes the union with another regex Equivalent to: >>> regex0 or regex1 Parameters ---------- other : :class:`~pyformlang.regular_expression.Regex` The other regex Returns ---------- regex : :class:`~pyformlang.regular_expression.Regex` The union of the two regex Examples -------- >>> regex0 = Regex("a b") >>> regex1 = Regex("c") >>> regex_union = regex0.union(regex1) >>> regex_union.accepts(["a", "b"]) >>> regex_union.accepts(["c"]) Or equivalently: >>> regex_union = regex0 or regex1 >>> regex_union.accepts(["a", "b"]) """ regex = Regex("") regex.head = pyformlang.regular_expression.regex_objects.Union() regex.sons = [self, other] return regex
def __or__(self, other): """ Makes the union with another regex Parameters ---------- other : :class:`~pyformlang.regular_expression.Regex` The other regex Returns ---------- regex : :class:`~pyformlang.regular_expression.Regex` The union of the two regex Examples -------- >>> regex0 = Regex("a b") >>> regex1 = Regex("c") >>> regex_union = regex0.union(regex1) >>> regex_union.accepts(["a", "b"]) True >>> regex_union.accepts(["c"]) True Or equivalently: >>> regex_union = regex0 or regex1 >>> regex_union.accepts(["a", "b"]) True """ return self.union(other)
[docs] def concatenate(self, other: "Regex") -> "Regex": """ Concatenates a regular expression with an other one Equivalent to: >>> regex0 + regex1 Parameters ---------- other : :class:`~pyformlang.regular_expression.Regex` The other regex Returns ---------- regex : :class:`~pyformlang.regular_expression.Regex` The concatenation of the two regex Examples -------- >>> regex0 = Regex("a b") >>> regex1 = Regex("c") >>> regex_union = regex0.concatenate(regex1) >>> regex_union.accepts(["a", "b"]) False >>> regex_union.accepts(["a", "b", "c"]) True Or equivalently: >>> regex_union = regex0 + regex1 >>> regex_union.accepts(["a", "b", "c"]) True """ regex = Regex("") regex.head = \ pyformlang.regular_expression.regex_objects.Concatenation() regex.sons = [self, other] return regex
def __add__(self, other): """ Concatenates a regular expression with an other one Parameters ---------- other : :class:`~pyformlang.regular_expression.Regex` The other regex Returns ---------- regex : :class:`~pyformlang.regular_expression.Regex` The concatenation of the two regex Examples -------- >>> regex0 = Regex("a b") >>> regex1 = Regex("c") >>> regex_union = regex0.concatenate(regex1) >>> regex_union.accepts(["a", "b"]) False >>> regex_union.accepts(["a", "b", "c"]) True Or equivalently: >>> regex_union = regex0 + regex1 >>> regex_union.accepts(["a", "b", "c"]) True """ return self.concatenate(other)
[docs] def kleene_star(self) -> "Regex": """ Makes the kleene star of the current regex Returns ---------- regex : :class:`~pyformlang.regular_expression.Regex` The kleene star of the current regex Examples -------- >>> regex = Regex("a") >>> regex_kleene = regex.kleene_star() >>> regex_kleene.accepts([]) True >>> regex_kleene.accepts(["a", "a", "a"]) True """ regex = Regex("") regex.head = pyformlang.regular_expression.regex_objects.KleeneStar() regex.sons = [self] return regex
[docs] def from_string(self, regex_str: str): """ Construct a regex from a string. For internal usage. Equivalent to the constructor of Regex Parameters ---------- regex_str : str The string representation of the regex Returns ------- regex : :class:`~pyformlang.regular_expression.Regex` The regex Examples -------- >>> regex.from_string("a b c") , which is equivalent to: >>> Regex("a b c") """ return Regex(regex_str)
[docs] def accepts(self, word: Iterable[str]) -> bool: """ Check if a word matches (completely) the regex Parameters ---------- word : iterable of str The word to check Returns ------- is_accepted : bool Whether the word is recognized or not Examples -------- >>> regex = Regex("abc|d") Check if the symbol "abc" is accepted >>> regex.accepts(["abc"]) True """ if self._enfa is None: self._enfa = self.to_epsilon_nfa() return self._enfa.accepts(word)
[docs] @classmethod def from_python_regex(cls, regex): """ Creates a regex from a string using the python way to write it. Careful: Not everything is implemented, check PythonRegex class \ documentation for more details. It is equivalent to calling PythonRegex constructor directly. Parameters ---------- regex : str The regex given as a string or compile regex Returns ------- python_regex : :class:`~pyformlang.regular_expression.PythonRegex` The regex Examples -------- >>> Regex.from_python_regex("a+[cd]") """ return regular_expression.PythonRegex(regex)