67 lines
2.2 KiB
Python
67 lines
2.2 KiB
Python
"""
|
|
SentenceSlice: access words by index, sub-sentences by slices
|
|
"""
|
|
|
|
import re
|
|
import reprlib
|
|
|
|
|
|
# Token patterns.  Raw strings keep ``\w`` / ``\s`` as regex escapes rather
# than invalid string-literal escapes, which recent Python versions warn about.
RE_TOKEN = re.compile(r'\w+|\s+|[^\w\s]+')  # word run, whitespace run or punctuation run
RE_WORD = re.compile(r'\w+')  # a run of word characters
RE_PUNCTUATION = re.compile(r'[^\w\s]+')  # a run of non-word, non-space characters
|
|
|
|
|
|
class SentenceSlice:
    """Sentence giving access to words by index and sub-sentences by slice.

    The text is split into tokens with ``RE_TOKEN``: runs of word
    characters, runs of whitespace and runs of punctuation.  Integer
    indexing returns one word.  Slicing counts in words and returns a new
    ``SentenceSlice`` built from the matching tokens, widened to include
    punctuation tokens directly adjacent to the selected range.

    Attributes:
        text: the string the sentence was built from.
        tokens: every token of ``text``, in order.
        words: the word tokens only.
        word_index: position in ``tokens`` of each entry of ``words``.
    """

    def __init__(self, text):
        """Tokenize ``text`` and record which tokens are words."""
        self.text = text
        self.tokens = RE_TOKEN.findall(text)
        # Classify each token once, then derive the word list from the index.
        self.word_index = [i for i, token in enumerate(self.tokens)
                           if RE_WORD.match(token)]
        self.words = [self.tokens[i] for i in self.word_index]

    def __repr__(self):
        """Return a constructor-like representation with abbreviated text."""
        return 'SentenceSlice(%s)' % reprlib.repr(self.text)

    def __getitem__(self, position):
        """Return a word (integer index) or a sub-sentence (slice).

        Slice bounds count words, not characters or tokens.

        Raises:
            LookupError: if ``position`` is a slice with a step.
            IndexError: if ``position`` is an out-of-range integer.
        """
        if isinstance(position, slice):
            if position.step is not None:
                raise LookupError('slice step is not supported')
            start, stop = self._handle_defaults(position)
            start, stop = self._widen(start, stop)
            tokens = self.tokens[start:stop]
            return SentenceSlice(''.join(tokens))
        else:
            return self.words[position]

    def __len__(self):
        """Return the number of words in the sentence."""
        return len(self.words)

    # helper functions -- implementation detail
    def _handle_defaults(self, position):
        """Translate a slice over words into a half-open range over tokens.

        Missing, negative and out-of-range bounds are normalized the way
        ordinary sequence slicing does.  A missing start maps to the first
        token, so text before the first word is kept.  An empty selection
        (including any slice of a sentence without words) yields an empty
        range at the end of the token list, which ``_widen`` leaves alone.

        Returns:
            A ``(start, stop)`` tuple of token positions, ``stop`` exclusive.
        """
        n_tokens = len(self.tokens)
        # Any step has already been rejected by __getitem__; ignore it here.
        bounds = slice(position.start, position.stop)
        first, last, _ = bounds.indices(len(self.word_index))
        if first >= last:  # nothing selected
            return n_tokens, n_tokens
        if position.start is None:  # missing: begin at the very first token
            start = 0
        else:
            start = self.word_index[first]
        # stop just after the last word selected
        stop = self.word_index[last - 1] + 1
        return start, stop

    def _widen(self, start, stop):
        """Widen a token range to take in adjacent punctuation.

        Punctuation tokens directly to the left of ``start`` and directly
        to the right of ``stop`` are added to the range.  The left side is
        only widened when ``start`` is a valid token position.

        Returns:
            The widened ``(start, stop)`` tuple, ``stop`` exclusive.
        """
        n_tokens = len(self.tokens)
        if start < n_tokens:
            while (start > 0 and
                   RE_PUNCTUATION.match(self.tokens[start - 1])):
                start -= 1
        while (stop < n_tokens and
               RE_PUNCTUATION.match(self.tokens[stop])):
            stop += 1
        return start, stop
|