updated from Atlas
This commit is contained in:
66
attic/sequences/sentence_slice.py
Normal file
66
attic/sequences/sentence_slice.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
SentenceSlice: access words by index, sub-sentences by slices
|
||||
"""
|
||||
|
||||
import re
|
||||
import reprlib
|
||||
|
||||
|
||||
# Raw strings keep the regex escapes (\w, \s) away from Python's own
# string-escape processing.
RE_TOKEN = re.compile(r'\w+|\s+|[^\w\s]+')  # a word, a whitespace run or a punctuation run
RE_WORD = re.compile(r'\w+')
RE_PUNCTUATION = re.compile(r'[^\w\s]+')


class SentenceSlice:
    """Sentence whose words are reachable by index and sub-sentences by slice.

    The text is split into tokens (words, whitespace runs, punctuation
    runs).  Integer indexing returns a single word as ``str``; slicing
    returns a new ``SentenceSlice`` built from the tokens spanning the
    selected words, plus any punctuation directly attached to either end.

    Attributes:
        text: the original string.
        tokens: every token of ``text``, in order.
        words: the word tokens only.
        word_index: position in ``tokens`` of each entry of ``words``.
    """

    def __init__(self, text):
        self.text = text
        self.tokens = RE_TOKEN.findall(text)
        # A token is homogeneous (all word chars, all whitespace or all
        # punctuation), so matching at its start is enough to classify it.
        self.word_index = [i for i, token in enumerate(self.tokens)
                           if RE_WORD.match(token)]
        self.words = [self.tokens[i] for i in self.word_index]

    def __repr__(self):
        return 'SentenceSlice(%s)' % reprlib.repr(self.text)

    def __getitem__(self, position):
        """Return the word at an integer index or a sub-sentence for a slice.

        Raises:
            LookupError: if a slice carries any explicit step.
            IndexError: if an integer index is out of range.
        """
        if isinstance(position, slice):
            if position.step is not None:
                raise LookupError('slice step is not supported')
            start, stop = self._handle_defaults(position)
            start, stop = self._widen(start, stop)
            return SentenceSlice(''.join(self.tokens[start:stop]))
        return self.words[position]

    def __len__(self):
        """Return the number of words in the sentence."""
        return len(self.words)

    # helper functions -- implementation detail
    def _handle_defaults(self, position):
        """Translate a slice over words into a ``(start, stop)`` token range.

        Missing, negative and out-of-range bounds are normalised the same
        way list slicing does.  A selection that contains no word yields an
        empty range positioned at the end of the token list.
        """
        # Rebuild the slice without its step so only the bounds matter here.
        bounds = slice(position.start, position.stop)
        first, last, _ = bounds.indices(len(self.word_index))
        if first >= last:  # nothing selected (also covers "no words at all")
            end = len(self.tokens)
            return end, end
        # An omitted start begins at the very first token, so text that
        # precedes the first word is kept.
        start = 0 if position.start is None else self.word_index[first]
        return start, self.word_index[last - 1] + 1  # stop after last word

    def _widen(self, start, stop):
        """Widen a token range to take in punctuation to the left of
        ``start`` and to the right of ``stop``.

        An empty range is returned unchanged so that it cannot pick up
        punctuation that belongs to no selected word.
        """
        if start >= stop:
            return start, stop
        while start > 0 and RE_PUNCTUATION.match(self.tokens[start - 1]):
            start -= 1
        while (stop < len(self.tokens)
               and RE_PUNCTUATION.match(self.tokens[stop])):
            stop += 1
        return start, stop
|
||||
Reference in New Issue
Block a user