# example-code-2e/sequences/sentence_slice.py

"""
SentenceSlice: access words by index, sub-sentences by slices
"""

import re
import reprlib


# Raw strings keep the backslashes for the regex engine; in a plain string
# literal '\w' and '\s' are invalid escape sequences that Python warns about.

# one token: a run of word characters, of whitespace, or of anything else
RE_TOKEN = re.compile(r'\w+|\s+|[^\w\s]+')
# a token that is a word
RE_WORD = re.compile(r'\w+')
# a token that is neither a word nor whitespace, i.e. punctuation
RE_PUNCTUATION = re.compile(r'[^\w\s]+')


class SentenceSlice:
    """A sentence whose words are read by index and whose
    sub-sentences are read by slice.

    ``sentence[i]`` returns the i-th word as a ``str``.
    ``sentence[i:j]`` returns a new ``SentenceSlice`` holding words
    i..j-1 together with the punctuation directly attached to either
    end of that range.
    """

    def __init__(self, text):
        """Split ``text`` into tokens and record which ones are words."""
        self.text = text
        # every run of word characters, whitespace or punctuation, in order
        self.tokens = RE_TOKEN.findall(text)
        # offsets into self.tokens of the tokens that are words
        self.word_index = [offset for offset, token in enumerate(self.tokens)
                           if RE_WORD.match(token)]
        # the words themselves, in sentence order
        self.words = [self.tokens[offset] for offset in self.word_index]

    def __repr__(self):
        # reprlib abbreviates long texts so the repr stays short
        return 'SentenceSlice(%s)' % reprlib.repr(self.text)

    def __getitem__(self, position):
        """Return one word (int position) or a sub-sentence (slice).

        Slice bounds count words, not tokens. Negative and out-of-range
        bounds behave as they do for a list; a selection with no words
        yields ``SentenceSlice('')``.

        Raises LookupError if the slice carries any explicit step, and
        IndexError if an integer position is out of range.
        """
        if not isinstance(position, slice):
            return self.words[position]
        if position.step is not None:
            raise LookupError('slice step is not supported')
        start, stop = self._handle_defaults(position)
        if start >= stop:
            # No word selected: return before widening, which could
            # otherwise pull in punctuation next to the empty range.
            return SentenceSlice('')
        start, stop = self._widen(start, stop)
        return SentenceSlice(''.join(self.tokens[start:stop]))

    def __len__(self):
        """Return the number of words in the sentence."""
        return len(self.words)

    # helper functions -- implementation detail
    def _handle_defaults(self, position):
        """Translate a word slice into a half-open range of token offsets.

        Missing, negative, overflowing and underflowing bounds are
        normalized with ``slice.indices``. Returns ``(start, stop)``;
        when the slice selects no word, ``start == stop``.
        """
        first, last, _ = position.indices(len(self.word_index))
        if first >= last:  # nothing selected (also covers a wordless text)
            end = len(self.tokens)
            return end, end
        if position.start is None:
            # a missing start begins at the very first token, so any
            # tokens preceding the first word are included
            start = 0
        else:
            start = self.word_index[first]
        # stop just after the last word selected
        return start, self.word_index[last - 1] + 1

    def _widen(self, start, stop):
        """Widen a token range to take in the punctuation immediately
        to the left of start and to the right of stop.

        Returns the widened ``(start, stop)`` pair.
        """
        token_count = len(self.tokens)
        if start < token_count:
            while (start > 0 and
                    RE_PUNCTUATION.match(self.tokens[start - 1])):
                start -= 1
        while (stop < token_count and
                RE_PUNCTUATION.match(self.tokens[stop])):
            stop += 1
        return start, stop