Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract better substrings #12

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 91 additions & 35 deletions src/esmre.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

# esmre.py - clue-indexed regular expressions module
# Copyright (C) 2007-2008 Tideway Systems Limited.
# Copyright (C) 2021-2024 BMC Software.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
Expand All @@ -22,16 +23,57 @@
import esm
import threading

class InBackslashState:
    """Parser state for the single byte that follows a backslash.

    The state consumes exactly one byte, tells ``parent_state`` how the
    escape affects the hint it is collecting, and returns the state that
    should handle the next byte.
    """

    def __init__(self, parent_state):
        # State to resume once the escape has been consumed.
        self.parent_state = parent_state

    def _notify_parent(self, method_name, *args):
        """Call ``parent_state.<method_name>(*args)`` when the parent has it.

        Not every parent state collects hints, so a missing method is
        skipped.  Errors raised *inside* an existing method are deliberately
        not swallowed.
        """
        method = getattr(self.parent_state, method_name, None)
        if method is not None:
            method(*args)

    def process_byte(self, ch):
        """Handle the escaped byte *ch* and return the next state.

        * A non-alphanumeric byte (e.g. an escaped ".") is literal text and
          is appended to the parent's current hint.
        * Any alphanumeric escape is not the literal byte itself, so the
          parent's current hint is banked first.  Digits continue in
          ``InBackslashNumberState`` and ``N`` continues in
          ``InNamedUnicodeState``; everything else resumes the parent.
        """
        if not ch.isalnum():
            self._notify_parent("append_to_current_hint", ch)
            return self.parent_state

        # Bank before *every* alphanumeric escape, including the digit and
        # "N" forms: otherwise literal text on both sides of the escape
        # would be fused into a single hint (e.g. "foo\1bar" -> "foo...ar"),
        # and that fused hint need not occur in matching text.
        self._notify_parent("bank_current_hint_with_last_byte")

        if ch.isdigit():
            return InBackslashNumberState(self.parent_state)

        if ch == "N":
            return InNamedUnicodeState(self.parent_state)

        return self.parent_state


class InBackslashNumberState:
    """Parser state for the digits of a numeric escape (backslash + digits).

    Digits are swallowed.  The first non-digit byte ends the escape and is
    handed to ``parent_state`` so that it is processed normally instead of
    being lost.
    """

    def __init__(self, parent_state):
        # State to resume once the run of digits is over.
        self.parent_state = parent_state

    @property
    def alternate_hints(self):
        """Expose the parent's ``alternate_hints``.

        The regex may end while the digits are still being read; the caller
        then reads the result from this state, so the lookup is forwarded.
        """
        return self.parent_state.alternate_hints

    def finish(self):
        """End of input while reading digits: finish the parent instead."""
        self.parent_state.finish()

    def process_byte(self, ch):
        """Swallow digits; pass the first non-digit byte on to the parent."""
        if ch.isdigit():
            return self

        # The escape stands for text that is not part of the literal run, so
        # make sure the parent's current hint is closed before the parent
        # sees the next byte.  Parents that do not collect hints have no
        # such method and are left alone.
        bank = getattr(
            self.parent_state, "bank_current_hint_with_last_byte", None
        )
        if bank is not None:
            bank()

        # Do not drop ``ch``: it may be ")", "|", "]" or literal text that
        # the parent has to see.
        return self.parent_state.process_byte(ch)


class InBackslashState(object):
class InNamedUnicodeState:
    """Parser state for the byte that follows a backslash-``N`` escape.

    A "{" opens the name of the character, which is skipped by
    ``InBracesState``.  Any other byte is handed back to ``parent_state``.
    """

    def __init__(self, parent_state):
        # State to resume once the escape has been consumed.
        self.parent_state = parent_state

    def process_byte(self, ch):
        """Handle the byte after the ``N`` and return the next state."""
        # The escape stands for a character that is not spelled out in the
        # regex, so literal text before and after it must not end up in the
        # same hint.  Parents that do not collect hints have no such method.
        # NOTE(review): assumes InBracesState does not bank the hint itself;
        # a second banking would only add an empty hint.
        bank = getattr(
            self.parent_state, "bank_current_hint_with_last_byte", None
        )
        if bank is not None:
            bank()

        if ch == "{":
            return InBracesState(self.parent_state)

        # Not the expected "{": let the parent process the byte rather than
        # silently dropping it.
        return self.parent_state.process_byte(ch)


class InClassState(object):
class InClassState:
def __init__(self, parent_state):
self.parent_state = parent_state

Expand All @@ -46,7 +88,7 @@ def process_byte(self, ch):
return self


class InBracesState(object):
class InBracesState:
def __init__(self, parent_state):
self.parent_state = parent_state

Expand All @@ -58,10 +100,13 @@ def process_byte(self, ch):
return self


class CollectingState(object):
class CollectingState:
def __init__(self):
self.hints = [""]

def finish(self):
pass

def process_byte(self, ch):
self.update_hints(ch)
return self.next_state(ch)
Expand All @@ -77,21 +122,18 @@ def bank_current_hint_and_forget_last_byte(self):

self.hints.append("")

def forget_all_hints(self):
self.hints = [""]

def append_to_current_hint(self, ch):
self.hints[-1] += ch

def update_hints(self, ch):
if ch in "?*{":
self.bank_current_hint_and_forget_last_byte()

elif ch in "+.^$([\\":
elif ch in "+.^$([":
self.bank_current_hint_with_last_byte()

elif ch == "|":
self.forget_all_hints()
elif ch in "|\\":
pass

else:
self.append_to_current_hint(ch)
Expand Down Expand Up @@ -120,11 +162,21 @@ def alternation_state(self):


class RootState(CollectingState):
    """Top-level collecting state that keeps one hint list per alternative.

    ``alternate_hints`` receives the ``hints`` list of every top-level
    alternative: one entry each time an alternation is reported to this
    state and one more when ``finish()`` is called.
    """

    def __init__(self):
        super().__init__()
        # One hint list per completed top-level alternative.
        self.alternate_hints = []

    def _bank_alternative(self):
        """Store the hints of the current alternative and start afresh."""
        self.alternate_hints.append(self.hints)
        self.hints = [""]

    def finish(self):
        """End of the regex: bank the hints of the last alternative."""
        self._bank_alternative()

    def alternation_state(self):
        """Bank the finished alternative and keep collecting in this state.

        No exception is raised here any more, so the alternatives that
        follow are still examined.
        """
        self._bank_alternative()
        return self


class StartOfGroupState(object):
class StartOfGroupState:
def __init__(self, parent_state):
self.parent_state = parent_state

Expand Down Expand Up @@ -162,18 +214,22 @@ def alternation_state(self):
return self


class StartOfExtensionGroupState(object):
class StartOfExtensionGroupState:
    """State that inspects the first byte of an extension group.

    NOTE(review): presumably entered right after "(?" -- confirm against
    the state that creates it.
    """

    def __init__(self, parent_state):
        # State the group hands control back to; passed on unchanged.
        self.parent_state = parent_state

    def process_byte(self, ch):
        """Pick the follow-up state from the byte *ch*.

        "P" may start a named group, ":" starts an ordinary group, and any
        other byte is forwarded to a fresh ``IgnoredGroupState``.
        """
        parent = self.parent_state

        if ch == "P":
            return MaybeStartOfNamedGroupState(parent)

        if ch == ":":
            return InGroupState(parent)

        # Unrecognised extension: the ignoring state must see this byte too.
        return IgnoredGroupState(parent).process_byte(ch)


class MaybeStartOfNamedGroupState(object):
class MaybeStartOfNamedGroupState:
def __init__(self, parent_state):
self.parent_state = parent_state

Expand All @@ -184,7 +240,7 @@ def process_byte(self, ch):
return IgnoredGroupState(self.parent_state)


class InNamedGroupNameState(object):
class InNamedGroupNameState:
def __init__(self, parent_state):
self.parent_state = parent_state

Expand All @@ -207,58 +263,59 @@ def hints(regex):
for ch in regex:
state = state.process_byte(ch)

state.finish()

except StopIteration:
pass

def flattened(hints):
for item in hints:
all_hints = state.alternate_hints

def flattened(l):
for item in l:
if isinstance(item, list):
for i in flattened(item):
yield i
else:
yield item

return [hint for hint in flattened(state.hints) if hint]


def shortlist(hints):
if not hints:
return []
def best(hints):
return max((hint for hint in flattened(hints)), key=len)

best = ""
result = {best(hints).lower() for hints in all_hints}

for hint in hints:
if len(hint) > len(best):
best = hint
if all(result):
return result

return [best]
return set()


class Index(object):
class Index:
def __init__(self):
    """Create an empty index."""
    # Guards enter() against concurrent use.
    self.lock = threading.Lock()
    # Once true, enter() refuses further additions.
    self.fixed = False
    # Objects whose regex produced no usable hint.
    self.hintless_objects = []
    # Keyword index the extracted hints are entered into.
    self.esm = esm.Index()


def enter(self, regex, obj):
    """Register *obj* under the hints extracted from *regex*.

    The hints are computed once.  When there are none, *obj* is added to
    ``hintless_objects``; otherwise it is entered into the keyword index
    once per hint.

    Raises:
        TypeError: if the index has already been fixed (the message says
            this happens once ``query()`` has been called).
    """
    # ``with`` releases the lock on every exit path, including the raise.
    with self.lock:
        if self.fixed:
            raise TypeError("enter() cannot be called after query()")

        # Used as returned: no further lower-casing or shortlisting is
        # applied here.
        keywords = hints(regex)

        if not keywords:
            self.hintless_objects.append(obj)

        for hint in keywords:
            self.esm.enter(hint, obj)


def query(self, string):
self.lock.acquire()
try:
Expand All @@ -270,6 +327,5 @@ def query(self, string):
finally:
self.lock.release()

return self.hintless_objects + [
obj for (_, obj) in self.esm.query(string.lower())
]
return self.hintless_objects + \
[obj for (_, obj) in self.esm.query(string.lower())]
Loading