# pytudes/ipynb/portman.py
# 2020-06-22 16:52:41 -07:00
#
# 139 lines
# 5.4 KiB
# Python

# Generate a portmantout word
# Peter Norvig
# See https://github.com/norvig/pytudes/blob/master/ipynb/Portmantout.ipynb
from collections import defaultdict, Counter
from typing import List, Tuple, Set, Dict, Any
Word = str
class Wordset(set): """A set of words."""
Step = Tuple[int, str] # An (overlap, word) pair.
OVERLAP, WORD = 0, 1 # Indexes of the two parts of a Step.
Path = List[Step] # A list of steps.
Bridge = (int, Step,...) # An excess letter count and step(s), e.g. (1, (2, 'arrow')).
EXCESS, STEPS = 0, slice(1, None) # Indexes of the two parts of a bridge.
W = Wordset(open('wordlist.asc').read().split())
def portman(P: Path) -> Word:
"""Compute the portmantout string S from the path P."""
return ''.join(word[overlap:] for (overlap, word) in P)
def natalie(W: Wordset, start=None) -> Path:
"""Return a portmantout path containing all words in W."""
precompute(W)
word = start or first(W.unused)
used(W, word)
P = [(0, word)]
while W.unused:
steps = unused_step(W, word) or bridging_steps(W, word)
for (overlap, word) in steps:
P.append((overlap, word))
used(W, word)
return P
def unused_step(W: Wordset, prev_word: Word) -> List[Step]:
"""Return [(overlap, unused_word)] or []."""
for suf in suffixes(prev_word):
for unused_word in W.startswith.get(suf, ()):
overlap = len(suf)
return [(overlap, unused_word)]
return []
def bridging_steps(W: Wordset, prev_word: Word) -> List[Step]:
"""The steps from the shortest bridge that bridges
from a suffix of prev_word to a prefix of an unused word."""
bridge = min(W.bridges[suf][pre]
for suf in suffixes(prev_word) if suf in W.bridges
for pre in W.bridges[suf] if W.startswith[pre])
return bridge[STEPS]
def precompute(W):
"""Precompute and cache data structures for W. The .subwords and .bridges
data structures are static and only need to be computed once; .unused and
.startswith are dynamic and must be recomputed on each call to `natalie`."""
if not hasattr(W, 'subwords') or not hasattr(W, 'bridges'):
W.subwords = subwords(W)
W.bridges = build_bridges(W)
W.unused = W - W.subwords
W.startswith = compute_startswith(W.unused)
def used(W, word):
"""Remove word from `W.unused` and, for each prefix, from `W.startswith[pre]`."""
assert word in W, f'used "{word}", which is not in the word set'
if word in W.unused:
W.unused.remove(word)
for pre in prefixes(word):
W.startswith[pre].remove(word)
if not W.startswith[pre]:
del W.startswith[pre]
def first(iterable, default=None): return next(iter(iterable), default)
def multimap(pairs) -> Dict[Any, set]:
"""Given (key, val) pairs, make a dict of {key: {val,...}}."""
result = defaultdict(set)
for key, val in pairs:
result[key].add(val)
return result
def compute_startswith(words) -> Dict[str, Set[Word]]:
"""A dict mapping a prefix to all the words it starts:
{'somet': {'something', 'sometimes'},...}."""
return multimap((pre, w) for w in words for pre in prefixes(w))
def subwords(W: Wordset) -> Set[str]:
"""All the words in W that are subparts of some other word."""
return {subword for w in W for subword in subparts(w) & W}
def suffixes(word) -> List[str]:
"""All non-empty proper suffixes of word, longest first."""
return [word[i:] for i in range(1, len(word))]
def prefixes(word) -> List[str]:
"""All non-empty proper prefixes of word."""
return [word[:i] for i in range(1, len(word))]
def subparts(word) -> Set[str]:
"""All non-empty proper substrings of word"""
return {word[i:j]
for i in range(len(word))
for j in range(i + 1, len(word) + (i > 0))}
def splits(word) -> List[Tuple[int, str, str]]:
"""A sequence of (excess, pre, suf) tuples."""
return [(excess, word[:i], word[i+excess:])
for excess in range(len(word) - 1)
for i in range(1, len(word) - excess)]
def try_bridge(bridges, pre, suf, excess, word, step2=None):
"""Store a new bridge if it has less excess than the previous bridges[pre][suf]."""
if suf not in bridges[pre] or excess < bridges[pre][suf][EXCESS]:
bridge = (excess, (len(pre), word))
if step2: bridge += (step2,)
bridges[pre][suf] = bridge
def build_bridges(W: Wordset, maxlen=5, end='qujvz'):
"""A table of bridges[pre][suf] == (excess, (overlap, word)), e.g.
bridges['ar']['c'] == (0, (2, 'arc'))."""
bridges = defaultdict(dict)
shortwords = [w for w in W if len(w) <= maxlen + (w[-1] in end)]
shortstartswith = compute_startswith(shortwords)
# One-word bridges
for word in shortwords:
for excess, pre, suf, in splits(word):
try_bridge(bridges, pre, suf, excess, word)
# Two-word bridges
for word1 in shortwords:
for suf in suffixes(word1):
for word2 in shortstartswith[suf]:
excess = len(word1) + len(word2) - len(suf) - 2
A, B = word1[0], word2[-1]
if A != B:
step2 = (len(suf), word2)
try_bridge(bridges, A, B, excess, word1, step2)
return bridges
if __name__ == "__main__":
W = Wordset(open('wordlist.asc').read().split())
print(portman(natalie(W)))