"""
Representation of a regular expression
"""
from typing import Iterable
from pyformlang import finite_automaton
# pylint: disable=cyclic-import
import pyformlang.regular_expression.regex_objects
from pyformlang import cfg
from pyformlang.finite_automaton import State
# pylint: disable=cyclic-import
from pyformlang.regular_expression.regex_reader import RegexReader
from pyformlang import regular_expression
[docs]class Regex(RegexReader):
""" Represents a regular expression
Pyformlang implements the operators of textbooks, which deviate slightly \
from the operators in Python. For a representation closer to Python one, \
please use :class:`~pyformlang.regular_expression.PythonRegex`
* The concatenation can be represented either by a space or a dot (.)
* The union is represented either by | or +
* The Kleene star is represented by *
* The epsilon symbol can either be "epsilon" or $
It is also possible to use parentheses. All symbols except the space, ., \
|, +, *, (, ), epsilon and $ can be part of the alphabet. All \
other common regex operators (such as []) are syntactic sugar that can be \
reduced to the previous operators. Another main difference is that the \
alphabet is not reduced to single characters as it is the case in Python. \
For example, "python" is a single symbol in Pyformlang, whereas it is the \
concatenation of six symbols in regular Python.
All special characters except epsilon can be escaped with a backslash (\
double backslash \\ in strings).
Parameters
----------
regex : str
The regex represented as a string
Raises
------
MisformedRegexError
If the regular expression is misformed.
Examples
--------
>>> regex = Regex("abc|d")
Check if the symbol "abc" is accepted
>>> regex.accepts(["abc"])
True
Check if the word composed of the symbols "a", "b" and "c" is accepted
>>> regex.accepts(["a", "b", "c"])
False
Check if the symbol "d" is accepted
>>> regex.accepts(["d"]) # True
>>> regex1 = Regex("a b")
>>> regex_concat = regex.concatenate(regex1)
>>> regex_concat.accepts(["d", "a", "b"])
True
>>> print(regex_concat.get_tree_str())
Operator(Concatenation)
Operator(Union)
Symbol(abc)
Symbol(d)
Operator(Concatenation)
Symbol(a)
Symbol(b)
Give the equivalent finite-state automaton
>>> regex_concat.to_epsilon_nfa()
"""
def __init__(self, regex):
self.head = None
self.sons = None
super().__init__(regex)
self._counter = 0
self._initialize_enfa()
self._enfa = None
def _initialize_enfa(self):
self._enfa = finite_automaton.EpsilonNFA()
[docs] def get_number_symbols(self) -> int:
""" Gives the number of symbols in the regex
Returns
----------
n_symbols : int
The number of symbols in the regex
Examples
--------
>>> regex = Regex("a|b*")
>>> regex.get_number_symbols()
2
The two symbols are "a" and "b".
"""
if self.sons:
return sum(son.get_number_symbols() for son in self.sons)
return 1
[docs] def get_number_operators(self) -> int:
""" Gives the number of operators in the regex
Returns
----------
n_operators : int
The number of operators in the regex
Examples
--------
>>> regex = Regex("a|b*")
>>> regex.get_number_operators()
2
The two operators are "|" and "*".
"""
if self.sons:
return 1 + sum(son.get_number_operators() for son in self.sons)
return 0
[docs] def to_epsilon_nfa(self):
""" Transforms the regular expression into an epsilon NFA
Returns
----------
enfa : :class:`~pyformlang.finite_automaton.EpsilonNFA`
An epsilon NFA equivalent to the regex
Examples
--------
>>> regex = Regex("abc|d")
>>> regex.to_epsilon_nfa()
"""
self._initialize_enfa()
s_initial = self._set_and_get_initial_state_in_enfa()
s_final = self._set_and_get_final_state_in_enfa()
self._process_to_enfa(s_initial, s_final)
return self._enfa
def _set_and_get_final_state_in_enfa(self):
s_final = self._get_next_state_enfa()
self._enfa.add_final_state(s_final)
return s_final
def _get_next_state_enfa(self):
s_final = finite_automaton.State(self._counter)
self._counter += 1
return s_final
def _set_and_get_initial_state_in_enfa(self):
s_initial = self._get_next_state_enfa()
self._enfa.add_start_state(s_initial)
return s_initial
def _process_to_enfa(self, s_from: State, s_to: State):
""" Internal function to add a regex to a given epsilon NFA
Parameters
----------
s_from : :class:`~pyformlang.finite_automaton.State`
The source state
s_to : :class:`~pyformlang.finite_automaton.State`
The destination state
"""
if self.sons:
self._process_to_enfa_when_sons(s_from, s_to)
else:
self._process_to_enfa_when_no_son(s_from, s_to)
def _process_to_enfa_when_no_son(self, s_from, s_to):
if isinstance(self.head,
pyformlang.regular_expression.regex_objects.Epsilon):
self._add_epsilon_transition_in_enfa_between(s_from, s_to)
elif not isinstance(self.head,
pyformlang.regular_expression.regex_objects.Empty):
symbol = finite_automaton.Symbol(self.head.value)
self._enfa.add_transition(s_from, symbol, s_to)
def _process_to_enfa_when_sons(self, s_from, s_to):
if isinstance(
self.head,
pyformlang.regular_expression.regex_objects.Concatenation):
self._process_to_enfa_concatenation(s_from, s_to)
elif isinstance(self.head,
pyformlang.regular_expression.regex_objects.Union):
self._process_to_enfa_union(s_from, s_to)
elif isinstance(
self.head,
pyformlang.regular_expression.regex_objects.KleeneStar):
self._process_to_enfa_kleene_star(s_from, s_to)
def _process_to_enfa_kleene_star(self, s_from, s_to):
# pylint: disable=protected-access
state_first = self._get_next_state_enfa()
state_second = self._get_next_state_enfa()
self._add_epsilon_transition_in_enfa_between(state_second, state_first)
self._add_epsilon_transition_in_enfa_between(s_from, s_to)
self._add_epsilon_transition_in_enfa_between(s_from, state_first)
self._add_epsilon_transition_in_enfa_between(state_second, s_to)
self._process_to_enfa_son(state_first, state_second, 0)
def _process_to_enfa_union(self, s_from, s_to):
son_number = 0
self._create_union_branch_in_enfa(s_from, s_to, son_number)
son_number = 1
self._create_union_branch_in_enfa(s_from, s_to, son_number)
def _create_union_branch_in_enfa(self, s_from, s_to, son_number):
state0 = self._get_next_state_enfa()
state2 = self._get_next_state_enfa()
self._add_epsilon_transition_in_enfa_between(s_from, state0)
self._add_epsilon_transition_in_enfa_between(state2, s_to)
self._process_to_enfa_son(state0, state2, son_number)
def _process_to_enfa_concatenation(self, s_from, s_to):
state0 = self._get_next_state_enfa()
state1 = self._get_next_state_enfa()
self._add_epsilon_transition_in_enfa_between(state0, state1)
self._process_to_enfa_son(s_from, state0, 0)
self._process_to_enfa_son(state1, s_to, 1)
def _add_epsilon_transition_in_enfa_between(self, state0, state1):
self._enfa.add_transition(state0, finite_automaton.Epsilon(), state1)
def _process_to_enfa_son(self, s_from, s_to, index_son):
# pylint: disable=protected-access
self.sons[index_son]._counter = self._counter
self.sons[index_son]._enfa = self._enfa
self.sons[index_son]._process_to_enfa(s_from, s_to)
self._counter = self.sons[index_son]._counter
[docs] def get_tree_str(self, depth: int = 0) -> str:
""" Get a string representation of the tree behind the regex
Parameters
----------
depth: int
The current depth, 0 by default
Returns
-------
representation: str
The tree representation
Examples
--------
>>> regex = Regex("abc|d*")
>>> print(regex.get_tree_str())
Operator(Union)
Symbol(abc)
Operator(Kleene Star)
Symbol(d)
"""
temp = " " * depth + str(self.head) + "\n"
for son in self.sons:
temp += son.get_tree_str(depth + 1)
return temp
[docs] def to_cfg(self, starting_symbol="S") -> "CFG":
"""
Turns the regex into a context-free grammar
Parameters
----------
starting_symbol : :class:`~pyformlang.cfg.Variable`, optional
The starting symbol
Returns
-------
cfg : :class:`~pyformlang.cfg.CFG`
An equivalent context-free grammar
Examples
--------
>>> regex = Regex("(a|b)* c")
>>> my_cfg = regex.to_cfg()
>>> my_cfg.contains(["c"])
True
"""
productions, _ = self._get_production(starting_symbol)
cfg_res = cfg.CFG(start_symbol=cfg.utils.to_variable(starting_symbol),
productions=set(productions))
return cfg_res
def _get_production(self, current_symbol, count=0):
next_symbols = []
next_productions = []
for son in self.sons:
next_symbol = "A" + str(count)
count += 1
# pylint: disable=protected-access
new_prods, count = son._get_production(next_symbol, count)
next_symbols.append(next_symbol)
next_productions += new_prods
new_prods = self.head.get_cfg_rules(current_symbol, next_symbols)
next_productions += new_prods
return next_productions, count
def __repr__(self):
return self.head.get_str_repr([str(son) for son in self.sons])
[docs] def union(self, other: "Regex") -> "Regex":
""" Makes the union with another regex
Equivalent to:
>>> regex0 or regex1
Parameters
----------
other : :class:`~pyformlang.regular_expression.Regex`
The other regex
Returns
----------
regex : :class:`~pyformlang.regular_expression.Regex`
The union of the two regex
Examples
--------
>>> regex0 = Regex("a b")
>>> regex1 = Regex("c")
>>> regex_union = regex0.union(regex1)
>>> regex_union.accepts(["a", "b"])
>>> regex_union.accepts(["c"])
Or equivalently:
>>> regex_union = regex0 or regex1
>>> regex_union.accepts(["a", "b"])
"""
regex = Regex("")
regex.head = pyformlang.regular_expression.regex_objects.Union()
regex.sons = [self, other]
return regex
def __or__(self, other):
""" Makes the union with another regex
Parameters
----------
other : :class:`~pyformlang.regular_expression.Regex`
The other regex
Returns
----------
regex : :class:`~pyformlang.regular_expression.Regex`
The union of the two regex
Examples
--------
>>> regex0 = Regex("a b")
>>> regex1 = Regex("c")
>>> regex_union = regex0.union(regex1)
>>> regex_union.accepts(["a", "b"])
True
>>> regex_union.accepts(["c"])
True
Or equivalently:
>>> regex_union = regex0 or regex1
>>> regex_union.accepts(["a", "b"])
True
"""
return self.union(other)
[docs] def concatenate(self, other: "Regex") -> "Regex":
""" Concatenates a regular expression with an other one
Equivalent to:
>>> regex0 + regex1
Parameters
----------
other : :class:`~pyformlang.regular_expression.Regex`
The other regex
Returns
----------
regex : :class:`~pyformlang.regular_expression.Regex`
The concatenation of the two regex
Examples
--------
>>> regex0 = Regex("a b")
>>> regex1 = Regex("c")
>>> regex_union = regex0.concatenate(regex1)
>>> regex_union.accepts(["a", "b"])
False
>>> regex_union.accepts(["a", "b", "c"])
True
Or equivalently:
>>> regex_union = regex0 + regex1
>>> regex_union.accepts(["a", "b", "c"])
True
"""
regex = Regex("")
regex.head = \
pyformlang.regular_expression.regex_objects.Concatenation()
regex.sons = [self, other]
return regex
def __add__(self, other):
""" Concatenates a regular expression with an other one
Parameters
----------
other : :class:`~pyformlang.regular_expression.Regex`
The other regex
Returns
----------
regex : :class:`~pyformlang.regular_expression.Regex`
The concatenation of the two regex
Examples
--------
>>> regex0 = Regex("a b")
>>> regex1 = Regex("c")
>>> regex_union = regex0.concatenate(regex1)
>>> regex_union.accepts(["a", "b"])
False
>>> regex_union.accepts(["a", "b", "c"])
True
Or equivalently:
>>> regex_union = regex0 + regex1
>>> regex_union.accepts(["a", "b", "c"])
True
"""
return self.concatenate(other)
[docs] def kleene_star(self) -> "Regex":
""" Makes the kleene star of the current regex
Returns
----------
regex : :class:`~pyformlang.regular_expression.Regex`
The kleene star of the current regex
Examples
--------
>>> regex = Regex("a")
>>> regex_kleene = regex.kleene_star()
>>> regex_kleene.accepts([])
True
>>> regex_kleene.accepts(["a", "a", "a"])
True
"""
regex = Regex("")
regex.head = pyformlang.regular_expression.regex_objects.KleeneStar()
regex.sons = [self]
return regex
[docs] def from_string(self, regex_str: str):
""" Construct a regex from a string. For internal usage.
Equivalent to the constructor of Regex
Parameters
----------
regex_str : str
The string representation of the regex
Returns
-------
regex : :class:`~pyformlang.regular_expression.Regex`
The regex
Examples
--------
>>> regex.from_string("a b c")
, which is equivalent to:
>>> Regex("a b c")
"""
return Regex(regex_str)
[docs] def accepts(self, word: Iterable[str]) -> bool:
"""
Check if a word matches (completely) the regex
Parameters
----------
word : iterable of str
The word to check
Returns
-------
is_accepted : bool
Whether the word is recognized or not
Examples
--------
>>> regex = Regex("abc|d")
Check if the symbol "abc" is accepted
>>> regex.accepts(["abc"])
True
"""
if self._enfa is None:
self._enfa = self.to_epsilon_nfa()
return self._enfa.accepts(word)
[docs] @classmethod
def from_python_regex(cls, regex):
"""
Creates a regex from a string using the python way to write it.
Careful:
Not everything is implemented, check PythonRegex class \
documentation for more details.
It is equivalent to calling PythonRegex constructor directly.
Parameters
----------
regex : str
The regex given as a string or compile regex
Returns
-------
python_regex : :class:`~pyformlang.regular_expression.PythonRegex`
The regex
Examples
--------
>>> Regex.from_python_regex("a+[cd]")
"""
return regular_expression.PythonRegex(regex)