Source code for pyformlang.regular_expression.python_regex

"""
A class to read regular expressions written in Python's regex syntax
"""

import re
import string
import unicodedata

# pylint: disable=cyclic-import
from pyformlang.regular_expression import regex, MisformedRegexError
from pyformlang.regular_expression.regex_reader import \
    WRONG_PARENTHESIS_MESSAGE

# Every printable character, one list entry per character.
PRINTABLES = [*string.printable]

# Characters that need a special spelling in the generated regex, mapped to
# that spelling. The newline maps to the empty string, which removes it from
# ESCAPED_PRINTABLES below.
TRANSFORMATIONS = {
    **{special: "\\" + special for special in "|()*+.$"},
    "\n": "",
    **{special: "\\" + special for special in " \\?"},
}

# Two-character escape sequences and the control character each one denotes.
RECOMBINE = dict(zip(("\\b", "\\n", "\\r", "\\t", "\\f"), "\b\n\r\t\f"))

# Printable characters in their escaped form, skipping those whose
# transformation is empty.
ESCAPED_PRINTABLES = [
    escaped
    for escaped in (TRANSFORMATIONS.get(char, char) for char in PRINTABLES)
    if escaped
]

# Union of all escaped printable characters; substituted for ``.``.
DOT_REPLACEMENT = "({})".format("|".join(ESCAPED_PRINTABLES))

# Characters that get a backslash prepended when found inside ``[...]``.
TO_ESCAPE_IN_BRACKETS = "(+*)?"

# Textual replacements applied to the raw pattern, in this order.
SHORTCUTS = {
    " ": "\\ ",  # We have to do this due to how Regex separate words
    "\\d": "[0-9]",
    "\\s": "[\\ \t\n\r\f\v]",
    "\\w": "[a-zA-Z0-9_]",
}

# Digit alphabets used when decoding numeric escapes.
HEXASTRING = string.digits + "ABCDEF"
OCTAL = string.octdigits
ESCAPED_OCTAL = ["\\" + digit for digit in OCTAL]


class PythonRegex(regex.Regex):
    """ Represents a regular expression as used in Python.

    It adds the following features to the basic regex:

    * Set of characters with []
    * Inverse set of character with [^...]
    * positive closure +
    * . for all printable characters
    * ? for optional character/group
    * Repetition of characters with {m} and {n,m}
    * Shortcuts: \\d, \\s, \\w

    Parameters
    ----------
    python_regex : Union[str, Pattern[str]]
        The regex represented as a string or a compiled regex (
        re.compile(...))

    Raises
    ------
    MisformedRegexError
        If the regular expression is misformed.

    Examples
    --------

    Python regular expressions wrapper

    >>> from pyformlang.regular_expression import PythonRegex
    >>> p_regex = PythonRegex("a+[cd]")
    >>> p_regex.accepts(["a", "a", "d"])
    True

    As the alphabet is composed of single characters, one could also write

    >>> p_regex.accepts("aad")
    True
    >>> p_regex.accepts(["d"])
    False
    """

    def __init__(self, python_regex):
        if not isinstance(python_regex, str):
            # Anything that is not a string is expected to be a compiled
            # pattern; its source text is used.
            python_regex = python_regex.pattern
        else:
            re.compile(python_regex)  # Check if it is valid

        self._python_regex = python_regex
        # Each pass rewrites self._python_regex in place; the order matters
        # because later passes rely on the escaping done by earlier ones.
        self._replace_shortcuts()
        self._escape_in_brackets()
        self._preprocess_brackets()
        self._preprocess_positive_closure()
        self._preprocess_optional()
        self._separate()
        # Drop leading backspace characters before handing over to the parent.
        self._python_regex = self._python_regex.lstrip('\b')
        super().__init__(self._python_regex)

    def _separate(self):
        """Split the regex into symbols and join them with spaces.

        A backslash is glued to the symbol that follows it, escape sequences
        are decoded by ``_recombine`` and every ``.`` symbol is replaced by
        ``DOT_REPLACEMENT``.
        """
        regex_temp = []
        for symbol in self._python_regex:
            if self._should_escape_next_symbol(regex_temp):
                regex_temp[-1] += symbol
            else:
                regex_temp.append(symbol)
        regex_temp = self._recombine(regex_temp)
        regex_temp_dot = []
        for symbol in regex_temp:
            if symbol == ".":
                regex_temp_dot.append(DOT_REPLACEMENT)
            else:
                regex_temp_dot.append(symbol)
        self._python_regex = " ".join(regex_temp_dot)

    def _preprocess_brackets(self):
        """Rewrite every ``[...]`` set as a parenthesised union.

        ``in_brackets_temp`` is a stack holding the content collected so far
        for each currently open bracket; a nested set is folded into its
        parent as a single ``(...)`` string.
        """
        regex_temp = []
        in_brackets = 0
        in_brackets_temp = []
        for symbol in self._python_regex:
            if symbol == "[" and \
                    not self._should_escape_next_symbol(regex_temp) and \
                    (in_brackets == 0 or
                     not self._should_escape_next_symbol(
                         in_brackets_temp[-1])):
                in_brackets += 1
                in_brackets_temp.append([])
            elif symbol == "]" and in_brackets >= 1 and \
                    not self._should_escape_next_symbol(in_brackets_temp[-1]):
                if len(in_brackets_temp) == 1:
                    # Outermost set: emit its symbols directly.
                    regex_temp.append("(")
                    regex_temp += self._preprocess_brackets_content(
                        in_brackets_temp[-1])
                    regex_temp.append(")")
                else:
                    # Nested set: becomes one element of the enclosing set.
                    in_brackets_temp[-2].append(
                        "(" +
                        "".join(
                            self._preprocess_brackets_content(
                                in_brackets_temp[-1])) +
                        ")")
                in_brackets -= 1
                in_brackets_temp.pop()
            elif in_brackets > 0:
                if self._should_escape_next_symbol(in_brackets_temp[-1]):
                    in_brackets_temp[-1][-1] += symbol
                elif symbol == "|":
                    # A pipe inside a set is a literal character.
                    in_brackets_temp[-1].append("\\|")
                else:
                    in_brackets_temp[-1].append(symbol)
            else:
                if self._should_escape_next_symbol(regex_temp):
                    regex_temp[-1] += symbol
                else:
                    regex_temp.append(symbol)
        self._python_regex = "".join(regex_temp)

    @staticmethod
    def _is_hexadecimal_digit(symbol):
        """Tell whether ``symbol`` is one hexadecimal digit of either case."""
        return len(symbol) == 1 and symbol in string.hexdigits

    @staticmethod
    def _recombine(regex_to_recombine):
        """Decode numeric, named and control escapes in a symbol sequence.

        Parameters
        ----------
        regex_to_recombine : Sequence[str]
            The symbols to process (a list of symbols or a plain string).

        Returns
        -------
        List[str]
            The symbols with hexadecimal, octal, named and unicode escapes
            replaced by the (escaped, if needed) character they denote, and
            with the sequences listed in ``RECOMBINE`` replaced by their
            control character.
        """
        is_hex = PythonRegex._is_hexadecimal_digit
        temp = []
        idx = 0
        while idx < len(regex_to_recombine):
            # Python accepts both cases in \xhh, so lowercase digits must be
            # recognised too.
            if regex_to_recombine[idx] == "\\x" \
                    and idx < len(regex_to_recombine) - 2 \
                    and is_hex(regex_to_recombine[idx + 1]) \
                    and is_hex(regex_to_recombine[idx + 2]):
                next_str = "".join(regex_to_recombine[idx + 1:idx + 3])
                s_trans = chr(int(next_str, 16))
                temp.append(TRANSFORMATIONS.get(s_trans, s_trans))
                idx += 3
            elif regex_to_recombine[idx] in ESCAPED_OCTAL \
                    and idx < len(regex_to_recombine) - 2 \
                    and regex_to_recombine[idx + 1] in OCTAL \
                    and regex_to_recombine[idx + 2] in OCTAL:
                # [1:] removes the backslash of the first symbol.
                next_str = "".join(regex_to_recombine[idx:idx + 3])[1:]
                s_trans = chr(int(next_str, 8))
                temp.append(TRANSFORMATIONS.get(s_trans, s_trans))
                idx += 3
            elif regex_to_recombine[idx] == "\\N":
                idx_end = idx
                while regex_to_recombine[idx_end] != "}":
                    idx_end += 1
                # idx + 1 is the opening brace. Spaces were turned into "\ "
                # by _replace_shortcuts and must be restored before the
                # lookup, otherwise multi-word names are never found.
                name = "".join(regex_to_recombine[idx + 2: idx_end])
                name = name.replace("\\ ", " ")
                character = unicodedata.lookup(name)
                temp.append(TRANSFORMATIONS.get(character, character))
                idx = idx_end + 1
            elif regex_to_recombine[idx] == "\\u":
                unicode_str = "".join(regex_to_recombine[idx + 1: idx + 5])
                decoded = chr(int(unicode_str, 16))
                temp.append(TRANSFORMATIONS.get(decoded, decoded))
                idx = idx + 5
            elif regex_to_recombine[idx] == "\\U":
                unicode_str = "".join(regex_to_recombine[idx + 1: idx + 9])
                decoded = chr(int(unicode_str, 16))
                temp.append(TRANSFORMATIONS.get(decoded, decoded))
                idx = idx + 9
            else:
                temp.append(regex_to_recombine[idx])
                idx += 1
        return [RECOMBINE.get(x, x) for x in temp]

    def _preprocess_brackets_content(self, bracket_content):
        """Expand the content of one ``[...]`` set into a union.

        Ranges are expanded, a leading ``^`` negates the set, and the
        resulting symbols are separated by ``|``.

        Parameters
        ----------
        bracket_content : List[str]
            The symbols found between the brackets. The element preceding a
            range dash is overwritten in place.

        Returns
        -------
        List[str]
            The symbols of the equivalent union, without the surrounding
            parentheses.
        """
        bracket_content_temp = []
        previous_is_valid_for_range = False
        for i, symbol in enumerate(bracket_content):
            # We have a range
            if symbol == "-" and \
                    not self._should_escape_next_symbol(bracket_content_temp):
                if not previous_is_valid_for_range or \
                        i == len(bracket_content) - 1:
                    # False alarm, no range
                    bracket_content_temp.append("-")
                    previous_is_valid_for_range = True
                else:
                    # We insert all the characters in the range
                    # (both endpoints are added by the regular branch below)
                    bracket_content[i - 1] = self._recombine(
                        bracket_content[i - 1])
                    for j in range(ord(bracket_content[i - 1][-1]) + 1,
                                   ord(bracket_content[i + 1][-1])):
                        next_char = chr(j)
                        if next_char in TRANSFORMATIONS:
                            bracket_content_temp.append(
                                TRANSFORMATIONS[next_char])
                        else:
                            bracket_content_temp.append(next_char)
                    previous_is_valid_for_range = False
            else:
                if self._should_escape_next_symbol(bracket_content_temp):
                    bracket_content_temp[-1] += symbol
                else:
                    bracket_content_temp.append(symbol)
                # The upper end of a range cannot start another range.
                if (i != 0 and bracket_content[i - 1] == "-"
                        and not previous_is_valid_for_range):
                    previous_is_valid_for_range = False
                else:
                    previous_is_valid_for_range = True
        bracket_content_temp = self._preprocess_negation(bracket_content_temp)
        bracket_content_temp = self._insert_or(bracket_content_temp)
        bracket_content_temp = self._recombine(bracket_content_temp)
        return bracket_content_temp

    @staticmethod
    def _preprocess_negation(bracket_content):
        """Return the complement of a set whose first symbol is ``^``.

        The complement is taken within ``ESCAPED_PRINTABLES``. Content that
        is empty or does not start with ``^`` is returned unchanged.
        """
        if not bracket_content or bracket_content[0] != "^":
            return bracket_content
        # We inverse everything
        return [x for x in ESCAPED_PRINTABLES if x not in bracket_content]

    @staticmethod
    def _insert_or(l_to_modify):
        """Return a new list with ``|`` inserted between the elements."""
        res = []
        for x in l_to_modify:
            res.append(x)
            res.append("|")
        if res:
            # Remove the trailing separator.
            return res[:-1]
        return res

    def _find_previous_opening_parenthesis(self, split_sequence):
        """Find the ``(`` matching the last ``)`` of ``split_sequence``.

        Returns
        -------
        int
            The index of the matching opening parenthesis.

        Raises
        ------
        MisformedRegexError
            If no matching opening parenthesis is found.
        """
        counter = 0
        for i in range(len(split_sequence) - 1, -1, -1):
            temp = split_sequence[i]
            if temp == ")":
                counter += 1
            elif temp == "(" and counter == 1:
                return i
            elif temp == "(":
                counter -= 1
        raise MisformedRegexError(WRONG_PARENTHESIS_MESSAGE,
                                  self._python_regex)

    def _preprocess_positive_closure(self):
        """Rewrite ``x+`` as ``xx*`` and then expand ``{...}`` repetitions."""
        regex_temp = []
        for symbol in self._python_regex:
            if symbol != "+" or (self._should_escape_next_symbol(regex_temp)):
                if self._should_escape_next_symbol(regex_temp):
                    regex_temp[-1] += symbol
                else:
                    regex_temp.append(symbol)
            elif regex_temp[-1] != ")":
                # Single symbol: duplicate it and star the copy.
                regex_temp.append(regex_temp[-1])
                regex_temp.append("*")
            else:
                # Group: duplicate everything from its opening parenthesis.
                pos_opening = \
                    self._find_previous_opening_parenthesis(regex_temp)
                for j in range(pos_opening, len(regex_temp)):
                    regex_temp.append(regex_temp[j])
                regex_temp.append("*")
        regex_temp = self._add_repetition(regex_temp)
        self._python_regex = "".join(regex_temp)

    @staticmethod
    def _is_repetition(regex_list, idx):
        """Parse a ``{m}`` or ``{n,m}`` quantifier starting at ``idx``.

        Returns
        -------
        Optional[tuple]
            ``(m, end)`` for ``{m}``, ``(n, m, end)`` for ``{n,m}`` where
            ``end`` is the index of the closing brace, or ``None`` when the
            symbols at ``idx`` do not form such a quantifier.
        """
        if regex_list[idx] == "{":
            end = idx
            for i in range(idx + 1, len(regex_list)):
                if regex_list[i] == "}":
                    end = i
                    break
            inner = "".join(regex_list[idx + 1:end])
            if "," in inner:
                split = inner.split(",")
                if len(split) != 2 or not split[0].isdigit() \
                        or not split[1].isdigit():
                    return None
                return int(split[0]), int(split[1]), end
            if inner.isdigit():
                return int(inner), end
        return None

    @staticmethod
    def _find_repeated_sequence(regex_list):
        """Return the last symbol or parenthesised group of ``regex_list``.

        Returns
        -------
        List[str]
            The symbols a following quantifier applies to, or an empty list
            when the closing parenthesis has no matching opening one.
        """
        if regex_list[-1] != ")":
            return [regex_list[-1]]
        res = [")"]
        counter = -1
        for i in range(len(regex_list) - 2, -1, -1):
            if regex_list[i] == "(":
                counter += 1
                res.append("(")
                if counter == 0:
                    # Collected backwards, hence the reversal.
                    return res[::-1]
            elif regex_list[i] == ")":
                counter -= 1
                res.append(")")
            else:
                res.append(regex_list[i])
        return []

    @staticmethod
    def _replace_with_empty_word(res, repeated):
        """Replace the trailing ``repeated`` symbols of ``res`` by ``$``.

        ``$`` is the symbol this class already uses for the empty word when
        it expands ``x?`` into ``(x|$)``.
        """
        # The explicit start index keeps an empty ``repeated`` from deleting
        # the whole list.
        del res[len(res) - len(repeated):]
        res.append("$")

    def _add_repetition(self, regex_list):
        """Expand ``{m}`` and ``{n,m}`` quantifiers into explicit copies.

        ``x{m}`` becomes ``m`` copies of ``x``; ``x{n,m}`` becomes ``n``
        copies followed by ``m - n`` optional ones. A count of zero is
        honoured: ``x{0}`` and ``x{0,0}`` become the empty word and
        ``x{0,m}`` consists of optional copies only.

        Parameters
        ----------
        regex_list : List[str]
            The symbols of the regex.

        Returns
        -------
        List[str]
            The symbols with every quantifier expanded.
        """
        res = []
        idx = 0
        while idx < len(regex_list):
            rep = self._is_repetition(regex_list, idx)
            if rep is None:
                res.append(regex_list[idx])
                idx += 1
            elif len(rep) == 2:
                n_rep, end = rep
                repeated = self._find_repeated_sequence(res)
                if n_rep == 0:
                    self._replace_with_empty_word(res, repeated)
                else:
                    # One copy is already in res.
                    for _ in range(n_rep - 1):
                        res.extend(repeated)
                idx = end + 1
            elif len(rep) == 3:
                min_rep, max_rep, end = rep
                repeated = self._find_repeated_sequence(res)
                if max_rep == 0:
                    self._replace_with_empty_word(res, repeated)
                else:
                    if min_rep == 0:
                        # The copy already in res is itself optional.
                        res.append("?")
                        n_optional = max_rep - 1
                    else:
                        for _ in range(min_rep - 1):
                            res.extend(repeated)
                        n_optional = max_rep - min_rep
                    for _ in range(n_optional):
                        res.extend(repeated)
                        res.append("?")
                idx = end + 1
        return res

    def _preprocess_optional(self):
        """Rewrite ``x?`` as ``(x|$)``; an escaped ``?`` stays a symbol."""
        regex_temp = []
        for symbol in self._python_regex:
            if symbol == "?":
                if regex_temp[-1] == ")":
                    # Add the empty alternative inside the existing group.
                    regex_temp[-1] = "|$)"
                elif regex_temp[-1] == "\\":
                    regex_temp[-1] = "?"
                else:
                    regex_temp[-1] = "(" + regex_temp[-1] + "|$)"
            else:
                if self._should_escape_next_symbol(regex_temp):
                    regex_temp[-1] += symbol
                else:
                    regex_temp.append(symbol)
        self._python_regex = "".join(regex_temp)

    @staticmethod
    def _should_escape_next_symbol(regex_temp):
        """Tell whether the last collected symbol is a lone backslash.

        Returns
        -------
        bool
            True when ``regex_temp`` is non-empty and ends with a single
            backslash, meaning the next character belongs to that escape.
        """
        return bool(regex_temp) and regex_temp[-1] == "\\"

    def _escape_in_brackets(self):
        """Escape the characters of ``TO_ESCAPE_IN_BRACKETS`` inside sets.

        Within ``[...]`` these characters are literal, so a backslash is
        prepended unless they are already escaped.
        """
        regex_temp = []
        in_brackets = False
        for symbol in self._python_regex:
            if (symbol == "[" and
                    not self._should_escape_next_symbol(regex_temp)):
                in_brackets = True
            elif (symbol == "]" and
                  not self._should_escape_next_symbol(regex_temp)):
                in_brackets = False
            if (in_brackets and
                    not self._should_escape_next_symbol(regex_temp)
                    and symbol in TO_ESCAPE_IN_BRACKETS):
                regex_temp.append("\\" + symbol)
            elif self._should_escape_next_symbol(regex_temp):
                regex_temp[-1] += symbol
            else:
                regex_temp.append(symbol)
        self._python_regex = "".join(regex_temp)

    def _replace_shortcuts(self):
        """Apply the textual replacements of ``SHORTCUTS``, in order."""
        for to_replace, replacement in SHORTCUTS.items():
            self._python_regex = self._python_regex.replace(to_replace,
                                                            replacement)