pytudes/py/pal2.py
2018-10-17 01:48:46 +02:00

269 lines
10 KiB
Python

from __future__ import print_function
import random, re, bisect, time
try:
xrange # Python 2
except NameError:
xrange = range # Python 3
"""Produce Panama-ish Palindromes. Copyright (C) 2002-2008, Peter Norvig."""
################ Checking for Palindromes
def is_panama(s):
"Test if string s is a Panama-ish palindrome."
return is_palindrome(s) and is_unique(phrases(s))
def is_palindrome(s):
"Test if a string is a palindrome."
s1 = canonical(s)
return s1 == reversestr(s1)
def phrases(s):
"Break a string s into comma-separated phrases."
return [phrase.strip() for phrase in s.split(',')]
def canonical(word, sub=re.compile('''[-* \t\n\r.,;!?:()`"']''').sub):
"The canonical form for comparing: lowercase, no blanks or punctuation."
return sub('', word).lower()
################ Utilities
def reversestr(x):
"Reverse a string."
return x[::-1]
def is_unique(seq):
"Return true if seq has no duplicate elements."
return len(seq) == len(set(seq))
def update(obj, **entries):
"Change attributes of obj, according to the keyword args."
obj.__dict__.update(entries)
return obj
################ Reading in a dictionary
class PalDict:
"""A dictionary from which you can find canonical words that start or end
with a given canonical substring, and find the true name of a
canonical word with d.truename[canonicalword]."""
def __init__(self, k=1000, filename='npdict.txt'):
words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
for tword in open(filename).read().splitlines():
word = canonical(tword)
words.append(word)
rwords.append(reversestr(word))
truename[word] = tword
words.sort()
rwords.sort()
update(self, k=k, words=words, rwords=rwords, truename=truename,
reversibles={}, rangek=range(k), tryharder=False)
def startswith(self, prefix):
"""Return up to k canonical words that start with prefix.
If there are more than k, choose from them at random."""
return self._k_startingwith(self.words, prefix)
def endswith(self, rsuffix):
"""Return up to k canonical words that end with the reversed suffix.
If you want words ending in 'ing', ask for d.endswith('gni').
If there are more than k, choose from them at random."""
return map(reversestr, self._k_startingwith(self.rwords, rsuffix))
def __contains__(self, word):
return word in self.truename
def reversible_words(self):
"Find words that have a reverse in the dict, like {'Camus': 'Sumac'}"
if not self.reversibles:
reversibles = self.reversibles
for rw in self.rwords:
if rw in self:
w = reversestr(rw)
if w != rw and w not in reversibles:
reversibles[w] = rw
self.reversibles = reversibles
return self.reversibles
def _k_startingwith(self, words, prefix):
start = bisect.bisect_left(words, prefix)
end = bisect.bisect(words, prefix + 'zzzz')
n = end - start
if self.k >= n: # get all the words that start with prefix
results = words[start:end]
else: # sample from words starting with prefix
indexes = random.sample(xrange(start, end), self.k)
results = [words[i] for i in indexes]
random.shuffle(results)
## Consider words that are prefixes of the prefix.
## This is very slow, so don't use it until late in the game.
if self.tryharder:
for i in range(3, len(prefix)):
w = prefix[0:i]
if ((words == self.words and w in self.truename) or
(words == self.rwords and reversestr(w) in self.truename)):
results.append(w)
return results
paldict = PalDict()
def anpdictshort():
"Find the words that are valid when every phrase must start with 'a'"
def segment(word): return [s for s in word.split('a') if s]
def valid(word): return all(reversestr(s) in segments for s in segment(word))
words = map(canonical, open('anpdict.txt'))
segments = set(s for w in words for s in segment(canonical(w)))
valid_words = [paldict.truename[w] for w in words if valid(w)]
open('anpdict-short.txt', 'w').write('\n'.join(valid_words))
################ Search for a palindrome
class Panama:
def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
## .left and .right hold lists of canonical words
## .diff holds the number of characters that are not matched,
## positive for words on left, negative for right.
## .stack holds (action, side, arg) tuples
update(self, left=[], right=[], best=0, seen={}, diff=0, stack=[],
used_reversibles=False, starttime=time.clock(), dict=dict)
for word in L.split(','):
self.add('left', canonical(word))
for rword in reversestr(R).split(','):
self.add('right', canonical(reversestr(rword)))
self.consider_candidates()
def search(self, steps=50000000):
"Search for palindromes."
for _ in xrange(steps):
if not self.stack:
return 'done'
action, dir, substr, arg = self.stack[-1]
if action == 'added': # undo the last word added
self.remove(dir, arg)
elif action == 'trying' and arg: # try the next word if there is one
self.add(dir, arg.pop()) and self.consider_candidates()
elif action == 'trying' and not arg: # otherwise backtrack
self.stack.pop()
else:
raise ValueError(action)
def add(self, dir, word):
"add a word"
if word in self.seen:
return False
else:
getattr(self, dir).append(word)
self.diff += factor[dir] * len(word)
self.seen[word] = True
self.stack.append(('added', dir, '?', word))
return True
def remove(self, dir, word):
"remove a word"
oldword = getattr(self, dir).pop()
assert word == oldword
self.diff -= factor[dir] * len(word)
del self.seen[word]
self.stack.pop()
def consider_candidates(self):
"""Push a new state with a set of candidate words onto stack."""
if self.diff > 0: # Left is longer, consider adding on right
dir = 'right'
substr = self.left[-1][-self.diff:]
candidates = self.dict.endswith(substr)
elif self.diff < 0: # Right is longer, consider adding on left
dir = 'left'
substr = reversestr(self.right[-1][0:-self.diff])
candidates = self.dict.startswith(substr)
else: # Both sides are same size
dir = 'left'
if not self.used_reversibles:
self.report()
self.add_reversibles()
substr = ''
candidates = self.dict.startswith('')
if substr == reversestr(substr):
self.report()
self.stack.append(('trying', dir, substr, candidates))
def add_reversibles(self):
"Add in reversible words."
print('using reversibles ...')
for (word, rword) in self.dict.reversible_words().items():
if word not in self.seen and rword not in self.seen:
self.add('left', word)
self.add('right', rword)
self.used_reversibles = True
self.stack = []
print('...done')
def report(self):
"Report a new palindrome to log file (if it is sufficiently big)."
N = len(self)
if N > 13333:
self.dict.tryharder = True
if N > self.best and (N > 12500 or N > self.best+500):
self.best = len(self)
self.bestphrase = str(self)
print('%5d phrases (%5d words) in %3d seconds' % (
self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime))
assert is_panama(self.bestphrase)
f = open('pallog%d.txt' % (id(self) % 10000), 'w')
f.write(self.bestphrase + '\n')
f.close()
def __len__(self):
return len(self.left) + len(self.right)
def __str__(self):
truename = self.dict.truename
lefts = [truename[w] for w in self.left]
rights =[truename[w] for w in self.right]
return ', '.join(lefts + rights[::-1])
factor = {'left': +1, 'right': -1}
# Note that we only allow one truename per canonical name. Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.
################ Unit Tests
def tests(p=Panama()):
assert is_panama('A man, a plan, a canal, Panama.')
assert is_panama('''A (man), a plan,,;, a ```canal?'' -- Panama!''')
assert not is_panama('A man, a plan, a radar, a canal, Panama.')
assert is_palindrome('A man, a plan, a canal, Panama.')
assert is_palindrome('radar, radar? radar!')
assert not is_palindrome('radars')
assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
assert reversestr('foo') == 'oof'
assert is_unique([1, 2, 3])
assert not is_unique([1, 2, 2])
d = p.dict
def sameset(a, b): return set(a) == set(b)
assert 'panama' in d
assert d.words[0] in d
assert d.words[-1] in d
assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
'awordlessness', 'aword', 'awordsmith'])
assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
'asubglobosity', 'anonverbosity', 'agibbosity'])
d.tryharder = True
assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
d.tryharder = False
assert d.startswith('oklahoma') == ['oklahoma']
assert d.startswith('fsfdsfdsfds') == []
print('all tests pass')
if __name__ == '__main__':
p = Panama();
tests(p)
p.search()