# pytudes/ipynb/portman.py

# Generate a portmantout word
# Peter Norvig
# See https://github.com/norvig/pytudes/blob/master/ipynb/Portmantout.ipynb

from collections import defaultdict, Counter
from typing import List, Tuple, Set, Dict, Any

Word = str


class Wordset(set):
    """A set of words."""


# A Step is an (overlap, word) pair; OVERLAP and WORD index its two parts.
Step = Tuple[int, str]
OVERLAP = 0
WORD = 1

# A Path is a list of steps.
Path = List[Step]

# A bridge is an excess letter count followed by one or more steps,
# e.g. (1, (2, 'arrow')). `Bridge` is a plain tuple used as an informal
# description of that layout; EXCESS and STEPS index its two parts.
Bridge = (int, Step, ...)
EXCESS = 0
STEPS = slice(1, None)

# Default word set: every whitespace-separated token in 'wordlist.asc'.
# The file is read inside a `with` block so the handle is closed promptly
# instead of being left for the garbage collector.
with open('wordlist.asc') as _wordfile:
    W = Wordset(_wordfile.read().split())

def portman(P: Path) -> Word:
    """Compute the portmantout string S from the path P.

    Each (overlap, word) step contributes only the letters of `word` that
    come after its first `overlap` characters; the pieces are concatenated
    in path order."""
    pieces = []
    for overlap, word in P:
        pieces.append(word[overlap:])
    return ''.join(pieces)

def natalie(W: Wordset, start=None) -> Path:
    """Return a portmantout path containing all words in W.

    The path begins with `start` (or, if `start` is falsy, with an arbitrary
    member of `W.unused`) at overlap 0. It is then extended greedily until
    `W.unused` is empty: a direct overlap step onto an unused word is taken
    when one exists, otherwise the steps of a bridge are taken.

    Returns an empty path when there is no starting word at all (no `start`
    was given and `W.unused` is empty, e.g. W itself is empty)."""
    precompute(W)
    word = start or first(W.unused)
    if word is None:
        # Nothing to start from; without this guard `used(W, None)` would
        # fail its assertion on an empty word set.
        return []
    used(W, word)
    P = [(0, word)]
    while W.unused:
        steps = unused_step(W, word) or bridging_steps(W, word)
        # `word` is deliberately rebound by the loop: after it finishes,
        # `word` is the last word on the path, from which the next step starts.
        for (overlap, word) in steps:
            P.append((overlap, word))
            used(W, word)
    return P

def unused_step(W: Wordset, prev_word: Word) -> List[Step]:
    """Return [(overlap, unused_word)] or [].

    The suffixes of prev_word are tried in the order `suffixes` yields them
    (longest first); the first one that starts some unused word wins, and an
    arbitrary such word is chosen."""
    for suf in suffixes(prev_word):
        candidates = W.startswith.get(suf)
        if candidates:
            return [(len(suf), next(iter(candidates)))]
    return []

def bridging_steps(W: Wordset, prev_word: Word) -> Tuple[Step, ...]:
    """The steps from the shortest bridge that bridges
    from a suffix of prev_word to a prefix of an unused word.

    Candidate bridges are compared as tuples, so the one with the least
    excess wins and ties are broken by comparing the steps themselves.
    Returns the tuple of steps of the chosen bridge (its excess count is
    dropped).

    Raises ValueError if no suffix of prev_word has a bridge to a prefix of
    any unused word."""
    bridge = min((W.bridges[suf][pre]
                  for suf in suffixes(prev_word) if suf in W.bridges
                  # .get() rather than [pre]: W.startswith is a defaultdict,
                  # and indexing it here would insert an empty set for every
                  # prefix that merely gets looked up.
                  for pre in W.bridges[suf] if W.startswith.get(pre)),
                 default=None)
    if bridge is None:
        raise ValueError(f'no bridge from "{prev_word}" to any unused word')
    return bridge[STEPS]

def precompute(W):
    """Attach the data structures that `natalie` needs as attributes of W.

    .subwords and .bridges are static: they are built only if W does not
    already carry both of them. .unused and .startswith are dynamic and are
    rebuilt from scratch on every call."""
    have_static = hasattr(W, 'subwords') and hasattr(W, 'bridges')
    if not have_static:
        W.subwords = subwords(W)
        W.bridges = build_bridges(W)
    # Words that are subparts of other words are left out of the unused set.
    remaining = W - W.subwords
    W.unused = remaining
    W.startswith = compute_startswith(remaining)

def used(W, word):
    """Mark `word` as used: drop it from `W.unused` and from the
    `W.startswith` entry of each of its prefixes, deleting any entry that
    becomes empty. A word that is not currently unused is left alone."""
    assert word in W, f'used "{word}", which is not in the word set'
    if word not in W.unused:
        return
    W.unused.remove(word)
    index = W.startswith
    for pre in prefixes(word):
        bucket = index[pre]
        bucket.remove(word)
        if not bucket:
            # Drop empty entries so a prefix is a key only while some unused
            # word still starts with it.
            del index[pre]

def first(iterable, default=None):
    """Return the first item of `iterable`, or `default` if it has none."""
    for item in iterable:
        return item
    return default

def multimap(pairs) -> Dict[Any, set]:
    """Group (key, val) pairs into a defaultdict of {key: {val,...}}."""
    grouped = defaultdict(set)
    for pair in pairs:
        key, val = pair
        grouped[key].add(val)
    return grouped

def compute_startswith(words) -> Dict[str, Set[Word]]:
    """Index words by prefix: a dict mapping each prefix (as produced by
    `prefixes`) to the set of all the words it starts, e.g.
    {'somet': {'something', 'sometimes'},...}."""
    def prefix_pairs():
        for w in words:
            for pre in prefixes(w):
                yield pre, w
    return multimap(prefix_pairs())

def subwords(W: Wordset) -> Set[str]:
    """The words of W that occur as a subpart of some other word of W."""
    found = set()
    for w in W:
        # Keep only those subparts of w that are themselves words.
        found |= subparts(w) & W
    return found

def suffixes(word) -> List[str]:
    """Every suffix of word that is neither empty nor the whole word,
    ordered from longest to shortest."""
    return [word[-size:] for size in range(len(word) - 1, 0, -1)]

def prefixes(word) -> List[str]:
    """Every prefix of word that is neither empty nor the whole word,
    ordered from shortest to longest."""
    heads = []
    for end in range(1, len(word)):
        heads.append(word[:end])
    return heads

def subparts(word) -> Set[str]:
    """The set of substrings of word, excluding the empty string and
    word itself."""
    size = len(word)
    pieces = set()
    for start in range(size):
        # A slice starting at 0 must stop short of the end, otherwise it
        # would be the whole word; later starts may run to the end.
        stop_limit = size + 1 if start > 0 else size
        for stop in range(start + 1, stop_limit):
            pieces.add(word[start:stop])
    return pieces

def splits(word) -> List[Tuple[int, str, str]]:
    """A list of (excess, pre, suf) tuples, where pre is a non-empty prefix
    of word, suf is a non-empty suffix, and excess is the number of letters
    of word that lie between them. Tuples are listed by increasing excess,
    then by increasing length of pre."""
    size = len(word)
    result = []
    for excess in range(size - 1):
        for cut in range(1, size - excess):
            result.append((excess, word[:cut], word[cut + excess:]))
    return result

def try_bridge(bridges, pre, suf, excess, word, step2=None):
    """Record a bridge from pre to suf in bridges[pre][suf], unless a bridge
    with the same or smaller excess is already stored there.

    The stored bridge is (excess, (len(pre), word)), with step2 appended as
    a further element when step2 is given."""
    known = bridges[pre]
    if suf in known and known[suf][EXCESS] <= excess:
        return  # The existing bridge is at least as good; keep it.
    first_step = (len(pre), word)
    known[suf] = (excess, first_step, step2) if step2 else (excess, first_step)

def build_bridges(W: Wordset, maxlen=5, end='qujvz'):
    """A table of bridges[pre][suf] == (excess, (overlap, word)), e.g.
    bridges['ar']['c'] == (0, (2, 'arc')).

    Only short words are used to build bridges: words of at most `maxlen`
    letters, or one letter more when the word ends in a letter from `end`.
    One-word bridges are recorded for every split of a short word. Two-word
    bridges join a short word to a short word that starts with one of its
    suffixes; they are recorded only from the first letter of the first word
    to the last letter of the second, and carry a second step."""
    def is_short(w):
        return len(w) <= maxlen + (w[-1] in end)

    bridges = defaultdict(dict)
    shortwords = [w for w in W if is_short(w)]
    shortstartswith = compute_startswith(shortwords)

    # One-word bridges
    for word in shortwords:
        for (excess, pre, suf) in splits(word):
            try_bridge(bridges, pre, suf, excess, word)

    # Two-word bridges
    for word1 in shortwords:
        A = word1[0]
        for suf in suffixes(word1):
            overlap = len(suf)
            for word2 in shortstartswith[suf]:
                B = word2[-1]
                if A == B:
                    continue
                # Length of the two overlapped words, less the letters A and B.
                excess = len(word1) + len(word2) - overlap - 2
                try_bridge(bridges, A, B, excess, word1, (overlap, word2))
    return bridges

if __name__ == "__main__":
    # Read the word list inside a `with` block so the file handle is closed
    # before the (long-running) path computation starts.
    with open('wordlist.asc') as wordfile:
        W = Wordset(wordfile.read().split())
    print(portman(natalie(W)))