Added files from norvig.com

Peter Norvig authored 2017-02-28 21:52:46 -08:00; committed by GitHub
parent 3b06853ded
commit 740e597a0d
14 changed files with 2479 additions and 0 deletions

SET.py (Normal file, 134 lines)
@@ -0,0 +1,134 @@
import random
import collections
import itertools
"""
Game of Set (Peter Norvig 2010-2015)
How often do sets appear when we deal an array of cards?
How often in the course of playing out the game?
Here are the data types we will use:
card: A string, such as '3R=0', meaning "three red striped ovals".
deck: A list of cards, initially of length 81.
layout: A list of cards, initially of length 12.
set: A tuple of 3 cards.
Tallies: A dict: {12: {True: 33, False: 1}} means a layout of size 12
tallied 33 sets and 1 non-set.
"""
#### Cards, dealing cards, and defining the notion of sets.
CARDS = [number + color + shade + symbol
for number in '123'
for color in 'RGP'
for shade in '@O='
for symbol in '0SD']
def deal(n, deck):
"Deal n cards from the deck."
return [deck.pop() for _ in range(n)]
def is_set(cards):
"Are these 3 cards a set? No if any feature has 2 values."
for f in range(4):
values = {card[f] for card in cards}
if len(values) == 2:
return False
return True
def find_set(layout):
"Return a set found from this layout, if there is one."
for cards in itertools.combinations(layout, 3):
if is_set(cards):
return cards
return ()
#### Tallying set:no-set ratio
def Tallies():
"A data structure to keep track, for each size, the number of sets and no-sets."
return collections.defaultdict(lambda: {True: 0, False: 0})
def tally(tallies, layout):
"Record that a set was found or not found in a layout of given size; return the set."
s = find_set(layout)
tallies[len(layout)][bool(s)] += 1
return s
#### Three experiments
def tally_initial_layout(N, sizes=(12, 15)):
"Record tallies for N initial deals."
tallies = Tallies()
deck = list(CARDS)
for deal in range(N):
random.shuffle(deck)
for size in sizes:
tally(tallies, deck[:size])
return tallies
def tally_initial_layout_no_prior_sets(N, sizes=(12, 15)):
"""Simulate N initial deals for each size, keeping tallies for Sets and NoSets,
but only when there was no set with 3 fewer cards."""
tallies = Tallies()
deck = list(CARDS)
for deal in range(N):
random.shuffle(deck)
for size in sizes:
if not find_set(deck[:size-3]):
tally(tallies, deck[:size])
return tallies
def tally_game_play(N):
"Record tallies for the play of N complete games."
tallies = Tallies()
for game in range(N):
deck = list(CARDS)
random.shuffle(deck)
layout = deal(12, deck)
while deck:
s = tally(tallies, layout)
# Pick up the cards in the set, if any
for card in s: layout.remove(card)
# Deal new cards
if len(layout) < 12 or not s:
layout += deal(3, deck)
return tallies
def experiments(N):
show({12: [1, 33], 15: [1, 2500]},
'the instruction booklet')
show(tally_initial_layout(N),
'initial layout')
show(tally_game_play(N // 25),
'game play')
show(tally_initial_layout_no_prior_sets(N),
'initial layout, but no sets before dealing last 3 cards')
def show(tallies, label):
"Print out the counts."
print()
print('Size | Sets | NoSets | Set:NoSet ratio for', label)
print('-----+--------+--------+----------------')
for size in sorted(tallies):
y, n = tallies[size][True], tallies[size][False]
ratio = ('inf' if n==0 else int(round(float(y)/n)))
print('{:4d} |{:7,d} |{:7,d} | {:4}:1'
.format(size, y, n, ratio))
def test():
assert len(CARDS) == 81 == len(set(CARDS))
assert is_set(('3R=0', '2R=S', '1R=D'))
assert not is_set(('3R=0', '2R=S', '1R@D'))
assert find_set(['1PO0', '2G=D', '3R=0', '2R=S', '1R=D']) == ('3R=0', '2R=S', '1R=D')
assert not find_set(['1PO0', '2G=D', '3R=0', '2R=S', '1R@D'])
photo = '2P=0 3P=D 2R=0 3GO0 2POD 3R@D 2RO0 2ROS 1P@S 2P@0 3ROS 2GOD 2P@D 1GOD 3GOS'.split()
assert not find_set(photo)
assert set(itertools.combinations([1, 2, 3, 4], 3)) == {(1, 2, 3), (1, 2, 4), (1, 3, 4), (2, 3, 4)}
print('All tests pass.')
test()
experiments(100000)

ibol.py (Normal file, 193 lines)
@@ -0,0 +1,193 @@
from collections import defaultdict
def get_genomes(fname="byronbayseqs.fas.txt"):
"Return a list of genomes, and a list of their corresponding names."
import re
names, species, genomes = [], [], []
for name, g in re.findall('>(.*?)\r([^\r]*)\r*', file(fname).read()):
names.append(name)
species.append(name.split('|')[-1])
genomes.append(g)
return names, species, genomes
def get_neighbors(fname="editdistances.txt"):
"Return dict: neighbors[i][j] = neighbors[j][i] = d means i,j are d apart."
## Read the data pre-computed from the Java program
neighbors = dict((i, {}) for i in range(n))
for line in file(fname):
i,j,d = map(int, line.split())
neighbors[i][j] = neighbors[j][i] = d
return neighbors
def cluster(neighbors, d, dc):
"""Return a list of clusters, each cluster element is within d of another
and within dc of every other cluster element."""
unclustered = set(neighbors) ## set of g's not yet clustered
return [closure(g, set(), unclustered, d, dc)
for g in neighbors if g in unclustered]
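# Illustration (a hypothetical 3-genome neighbor table, not taken from the data
# files; note that closure() and dist() below read the module-level `neighbors`,
# so this assumes that global holds the same table):
#   neighbors = {0: {1: 5, 2: 20}, 1: {0: 5, 2: 22}, 2: {0: 20, 1: 22}}
#   cluster(neighbors, d=8, dc=25)  ==>  [{0, 1}, {2}]
#   0 and 1 join because dist(0, 1) = 5 <= d; 2 stays alone because its nearest
#   neighbor is at distance 20 > d.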
def closure(g, s, unclustered, d, dc):
"Accumulate in set s the transitive closure of 'near', starting at g"
if g not in s and g in unclustered and near(g, s, d, dc):
s.add(g); unclustered.remove(g)
for g2 in neighbors[g]:
closure(g2, s, unclustered, d, dc)
return s
def dist(i, j):
"Distance between two genomes."
if i == j: return 0
return neighbors[min(i, j)].get(max(i, j), max_distance)
def near(g, cluster, d, dc):
"Is g within d of some member of c, and within dc of every member of c?"
distances = [dist(g, g2) for g2 in cluster] or [0]
return min(distances) <= d and max(distances) <= dc
def diameter(cluster):
"The largest distance between two elements of the cluster"
return max([dist(i, j) for i in cluster for j in cluster] or [0])
def margin(cluster):
"The distance from a cluster to the nearest g2 outside this cluster."
return min([d for g in cluster for g2,d in neighbors[g].items()
if g2 not in cluster] or [max_distance])
################################################################ Analysis
def pct(num, den):
"Return a string representing the percentage. "
if '__len__' in dir(den): den = len(den)
if num==den: return ' 100%'
return '%.1f%%' % (num*100.0/den)
def histo(items):
"Make a histogram from a sequence of items or (item, count) tuples."
D = defaultdict(int)
for item in items:
if isinstance(item, tuple): D[item[0]] += item[1]
else: D[item] += 1
return D
def showh(d):
"Show a histogram"
if not isinstance(d, dict): d = histo(d)
return ' '.join('%s:%s' % i for i in sorted(d.items()))
def greport(genomes):
print "Number of genomes: %d (%d distinct)" % (len(genomes), len(set(genomes)))
G = dict((g, set()) for g in genomes)
for i in range(n):
G[genomes[i]].add(species[i])
print "Multi-named genomes:", (
len([s for s in G.values() if len(s) > 1]))
lens = map(len, genomes)
print "Genome lengths: min=%d, max=%d" % (min(lens), max(lens))
print "Character counts: ", showh(c for g in genomes for c in g)
def nreport(neighbors):
NN, NumN = defaultdict(int), defaultdict(int) ## Nearest, Number of neighbors
for n in neighbors:
nn = min(neighbors[n].values() or ['>25'])
NN[nn] += 1
for d2 in neighbors[n].values():
NumN[d2] += 1
print
print "Nearest neighbor counts:", showh(NN)
print "Number of neighbors at each distance:", showh(NumN)
def nspecies(c): return len(set(species[g] for g in c))
def showc(c):
return "N=%d, D=%d, M=%d: %s %s" % (
len(c), diameter(c), margin(c), list(c), showh(species[g] for g in c))
def creport(drange, dcrange):
def table(what, fn):
print "\n" + what
print ' '*8, ' '.join([' '+pct(dc, glen) for dc in dcrange])
for d in drange:
print '%s (%2d)' % (pct(d, glen), d),
for dc in dcrange:
print '%5s' % fn(cluster(neighbors, d, dc)),
print
print '\nNearest neighbor must be closer than this percentage (places). '
print 'Each column: all genomes in cluster within this percentage of each other.'
table("Number of clusters", len)
cluster1 = cluster(neighbors, 8, 15) ## splits Cleora
print '\nNumber of clusters of different sizes:', showh(len(c) for c in cluster1)
M, T = defaultdict(int), defaultdict(int)
for c in cluster1:
M[margin(c)] += 1; T[margin(c)] += len(c)
for x in M: print '%d\t%d\t%d'% (x,M[x],T[x])
print '\nMargins', showh(M)
for c in cluster1:
if margin(c) <= 16:
print showc(c)
print '\nScatter plot of cluster diameter vs. margin.'
for c in cluster1:
if diameter(c) > 0:
pass
#print '%d\t%d' % (diameter(c), margin(c))
print '\nDifference from cluster(neighbors, 11, 14):'
#table(lambda cl: pct(len(cluster1)-compare(cluster1, cl),max(len(cluster1),len(cl))))
print '\nNumber of clusters with more than one species name:'
#table(lambda cl: sum(nspecies(c) > 1 for c in cl))
def pct_near_another(clusters, P=1.25):
total = 0
for c in clusters:
d = diameter(c)
for g in c:
for g2 in neighbors[g]:
if g2 not in c and dist(g, g2) < P*d:
total += 1
return pct(total, n)
def f(P):
print '\nPercent of individuals within %.2f*diameter of another cluster.'%P
table(lambda cl: pct_near_another(cl, P))
#map(f, [1.2, 1.33, 1.5])
def sreport(species):
SS = defaultdict(int)
print
for s in set(species):
c = [g for g in range(n) if species[g] == s]
d = diameter(c)
if d > 14:
if d==glen: d = '>25'
print 'diameter %s for %s (%d elements)' % (d, s, len(c))
SS[d] += 1
print 'Diameters of %d labelled clusters: %s' % (len(set(species)), showh(SS))
def compare(cl1, cl2):
"Compare two lists of clusters"
return sum(c1==c2 or 0.5*(abs(len(c1)-len(c2))==1 and
(c1.issubset(c2) or c2.issubset(c1)))
for c1 in cl1 for c2 in cl2)
def unit_tests():
assert set(len(g) for g in genomes) == set([glen])
clusters = cluster(neighbors, 11, 11)
assert sum(len(c) for c in clusters) == len(genomes)
assert len(set(g for c in clusters for g in c)) == len(genomes)
assert dist(17, 42) == dist(42, 17)
assert diameter(set()) == 0
assert diameter([17, 42]) == dist(17, 42)
assert pct(1, 2) == '50.0%'
print '\nAll tests pass.\n'
################################################################ Main body
max_distance = 26
names, species, genomes = get_genomes() ## genomes = ['ACT...', ...]
n = len(genomes)
glen = len(genomes[0])
neighbors = get_neighbors() ## neighbors[g] = {g2:d2, g3:d3, ...}
greport(genomes)
nreport(neighbors)
creport(range(6, 15), [glen,16,15,14,13, 12, 11])
#sreport(species)
unit_tests()

lettercount.py (Normal file, 440 lines)
@@ -0,0 +1,440 @@
"""
Read files in the Google Books ngram format, and convert them to a simpler format.
The original format looks like this:
word \t year \t word_count \t book_count
word_POS \t year \t word_count \t book_count
for example,
accreted_VERB 1846 7 4
accreted_VERB 1847 1 1
accreted_VERB 1848 1 1
The function 'read_year_file' will convert a file of this form into a dict of
{WORD: count} pairs, where the WORD is uppercased, and the count is the total
over all years (you have the option to specify a starting year) and all
capitalizations. Then 'read_dict' and 'write_dict' convert between a dict and
an external file format that looks like this:
ACCRETED 9
"""
from __future__ import division
from collections import Counter, defaultdict
#### Read files in Books-Ngram format; convert to a dict
def read_year_file(filename, dic=None):
"""Read a file of 'word year word_count book_count' lines and convert to a dict
{WORD: totalcount}. Uppercase all words, and only include all-alphabetic words."""
if dic is None: dic = {}
for line in file(filename):
word, year, c1, c2 = line.split('\t')
if '_' in word:
word = word[:word.index('_')]
if word.isalpha():
word = word.upper()
dic[word] = dic.get(word, 0) + int(c1)
return dic
#### Read and write files of the form 'WORD \t count \n'
def write_dict(dic, filename):
"Write a {word:count} dict as 'word \t count' lines in filename."
out = file(filename, 'w')
for key in sorted(dic):
out.write('%s\t%s\n' % (key, dic[key]))
return out.close()
def read_dict(filename, sep='\t'):
"Read 'word \t count' lines from file and make them into a dict of {word:count}."
pairs = (line.split(sep) for line in file(filename))
return {word: int(count) for (word, count) in pairs}
#### Convert a bunch of year files into dict file format.
def convert_files(filenames, mincount=1e5):
def report(filename, D, adj):
import time
N = len(D)
W = sum(v for v in D.itervalues())
print '%s: %s %s tokens (%s distinct words) at %s' % (
filename, adj, format(W, ',d'), format(N, ',d'),
time.strftime("%H:%M:%S", time.gmtime()))
for f in filenames:
report(f, {}, 'starting')
D = read_year_file(f)
report(f, D, 'total')
for key in list(D):
if D[key] < mincount:
del D[key]
write_dict(D, 'WORD-' + f[-1].upper())
report(f, D, 'popular')
def load():
global D, W, M
D = read_dict('top-words.txt')
W = len(D)
M = sum(D.values())
#### Compute letter counts and save as HTML files.
def histogram(items):
C = Counter()
for (key, val) in items:
C[key] += val
return C
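# For example: histogram([('A', 2), ('B', 1), ('A', 3)]) ==> Counter({'A': 5, 'B': 1})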
def end(name): return '/' + name
def tag(name, **kwds): return '<' + name + ''.join(' %s="%s"' % kv for kv in kwds.items()) + '>'
def row(cells, **kwds):
return '<tr>' + ''.join(cells)
def ngram_tables(dic, N, pos=[0, 1, 2, 3, 4, -5, -4, -3, -2, -1]):
"""Return three dicts of letter N-grams of length N: counts, counts1, counts2.
counts is a dict of {'AB': 123} that counts how often 'AB' occurs.
counts1[i] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i.
counts2[i][j] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i in words of length j."""
L = len(max(D, key=len))
counts = Counter()
counts1 = [Counter() for _ in range(L)]
counts2 = [[Counter() for i in range(L)]]
def counter(pairs):
"Make a Counter from an iterable of (value, count) pairs."
c = Counter()
for (value, count) in pairs:
c[value] += count
return c
def ngrams(word, N):
return [word[i:i+N] for i in range(len(word)+1-N)]
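# For example: ngrams('WORD', 2) ==> ['WO', 'OR', 'RD'] and ngrams('WORD', 4) ==> ['WORD']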
import glob
#convert_files(glob.glob('book?'))
#DB = [[letter_counts() for length in range(length)] for length in range(maxlen)]
## Unused ???
def letter_counts(wc):
"""From word_counts dictionary wc, Create a dictionary of {(s, i, L): count}
where s is a letter n-gram, i is the starting position, and L is the length
of the word in which it appears."""
result = defaultdict(int)
for (word, count) in wc.iteritems():
for p in pieces(word):
result[p] += count
return result
def pieces(word):
"Yield the 1- and 2-letter grams in (s, i, L) format."
L = len(word)
for i in range(L):
yield (word[i], i, L)
if i+1 < L:
yield (word[i:i+2], i, L)
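# For example: list(pieces('TO')) ==> [('T', 0, 2), ('TO', 0, 2), ('O', 1, 2)]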
def getcount(counts, s, pos, length):
"""The count for letter sequence s (one or two letters) starting at
position pos of words of length length. If any argument is all, sum over it."""
if length == all:
return sum(getcount(counts, s, pos, L) for L in all_lengths)
elif pos == all:
return sum(getcount(counts, s, i, length) for i in range(length))
else:
return counts[s, pos, length]
print 'start'
#wc = word_counts('count_100K.txt')
#counts = letter_counts(wc)
print 'end'
def test():
D = {'the': 100, 'of': 70, 'and': 60, 'to': 50, 'a': 40}
def num(ch):
"Translate 'a' or 'A' to 0, ... 'z' or 'Z' to 25."
return 'abcdefghijklmnopqrstuvwxyz'.index(ch.lower())
def stats(D, NS = (1, 2, 3, 4, 5, 6)):
counts = {n: Counter() for n in NS}
print 'words ' + ' '.join(' %d-grams ' % n for n in NS)
for (i, word) in enumerate(sortedby(D), 1):
for n in NS:
for ng in ngrams(word, n):
counts[n][ng] += 1
if i % 5000 == 0 or i == len(D):
print "%4dK" % (i/1000),
for n in NS:
c = len(counts[n])
field = "%5d (%d%%)" % (c, int(round(c*100/(26**n))))
print '%12s' % field,
print
letters = 'ETAOINSRHLDCUMFPGWYBVKXJQZ'
alphabet = ''.join(sorted(letters))
from itertools import cycle, izip
colors = 'ygobp'
def bar(text, color, count, N, pixels, height=16):
width = int(round(pixels * count / N))
if width < 2: width = 3
title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
return '<span title="%s"><img src="%s.jpg" height=%d width=%d><span style="position:relative; left:%d; bottom:4">%s</span></span>' % (
title, color, height, width, -width+2, text) # -int(width/2+5)
def letter_bar(LC, N=None, factor='', pixels=700):
if N is None: N = sum(LC.values())
#divisor = {'':1., 'K':1e3, 'M':1e6, 'B':1e9}[factor]
return ''.join(
bar(L.lower(), color, LC[L], N, pixels)
for (L, color) in izip(letters, cycle(colors)))
def singleton(x): return [x]
positions = [0, 1, 2, 3, 4, 5, 6, -7, -6, -5, -4, -3, -2, -1]
def substr(word, pos, length):
"""Return the substr of word of given length starting/ending at pos; or None."""
W = len(word)
if pos >= 0 and pos+length <= W:
return word[pos:pos+length]
elif pos < 0 and abs(pos)+length-1 <= W:
return word[W+pos+1-length:W+pos+1]
else:
return None
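# For example: substr('WORD', 0, 2) ==> 'WO'; substr('WORD', -1, 2) ==> 'RD'
# (a negative pos counts from the end of the word); substr('WORD', 3, 3) ==> None
# because the substring would run past the end.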
def lettercount(D, pos):
LC = histogram((substr(w, pos, 1), D[w]) for w in D)
del LC[None]
print LC
pos_name = (str(pos)+'+' if isinstance(pos, tuple) else
pos if pos < 0 else
pos+1)
return '\n<br>\n%-3s %s' % (pos_name, letter_bar(LC))
def ngramcount(D, n=2):
return histogram((ng, D[w]) for w in D for ng in ngrams(w, n))
def twograms(D2):
N = sum(D2.values())
header = '<table cellpadding=1 cellborder=1>'
rows = [tr([cell(A+B, D2, N) for A in alphabet]) for B in alphabet]
return '\n'.join([header] + rows + ['</table>'])
def cell(text, D2, N, height=16, maxwidth=25, scale=27):
count = D2.get(text, 0)
width = int(round(maxwidth * count * scale * 1. / N))
if width < 1: width = 1
title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
return '<td title="%s"><img src="o.jpg" height=%d width=%d><span style="position:relative; left:%d; bottom:4">%s</span></span>' % (
title, height, width, -width+2, text)
def cell(text, D2, N, height=16, maxwidth=25, scale=27):
count = D2.get(text, 0)
width = int(round(maxwidth * count * scale * 1. / N))
if width < 1: width = 1
title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
return '<td title="%s" background="o.jpg" height=%d width=%d>%s' % (
title, height, width, text)
def tr(cells):
return '<tr>' + ''.join(cells)
def comma(n): return '{:,}'.format(n)
def ngram_stats(D, n, k=5):
DN = ngramcount(D, n)
topk = ', '.join(sortedby(DN)[:k])
return '<tr><td>%d-grams<td align=right>%s<td align=right>%s<td><a href="counts-%d.csv">counts-%d.csv</a><td><a href="counts-%d.html">counts-%d.html</a><td>%s' % (
n, comma(len(DN)), comma(sum(DN.values())), n, n, n, n, topk)
#### Tables
def sortedby(D):
return sorted(D, key=lambda x: -D[x])
ANY = '*'
wordlengths = range(1, 10)
def col(*args): return args
def columns(n, wordlengths=wordlengths):
lengths = [k for k in wordlengths if k >= n]
return ([col(ANY, ANY)]
+ [col(k, ANY) for k in lengths]
+ [col(k, start, start+n-1) for k in lengths for start in range(1, 2+k-n)]
+ [col(ANY, start, start+n-1) for start in wordlengths]
+ [col(ANY, -k, -k+n-1) for k in reversed(lengths) if -k+n-1 < 0])
def colname(col):
fmt = '%s/%s' if (len(col) == 2) else '%s/%d:%d'
return fmt % col
def csvline(first, rest):
return '\t'.join([first] + map(str, rest))
def makecsv(n, D=D):
out = file('ngrams%d.csv' % n, 'w')
cols = columns(n)
Dng = defaultdict(lambda: defaultdict(int))
for w in D:
for (start, ng) in enumerate(ngrams(w, n), 1):
entry = Dng[ng]
N = D[w]
wlen = len(w)
entry[ANY, ANY] += N
entry[wlen, ANY] += N
if start <= 9:
entry[wlen, start, start+n-1] += N
entry[ANY, start, start+n-1] += N
from_end = wlen-start+1
if from_end <= 9:
entry[ANY, -from_end, -from_end+n-1] += N
# enumerate ngrams from word and increment counts for each one
print >> out, csvline('%d-gram' % n, map(colname, cols))
for ng in sorted(Dng, key=lambda ng: -Dng[ng][(ANY, ANY)]):
print >> out, csvline(ng, [Dng[ng].get(col, 0) for col in cols])
out.close()
return Dng
### Tests
"""
>>> for w in words:
print '%-6s %6.2f B (%4.2f%%) <img src="s.jpg" height=12 width=%d>' % (w.lower(), D[w]/1e9, D[w]*100./N, int(round(D[w]*4000./N)))
...
the 53.10 B (7.14%) <img src="s.jpg" height=12 width=286>
of 30.97 B (4.16%) <img src="s.jpg" height=12 width=167>
and 22.63 B (3.04%) <img src="s.jpg" height=12 width=122>
to 19.35 B (2.60%) <img src="s.jpg" height=12 width=104>
in 16.89 B (2.27%) <img src="s.jpg" height=12 width=91>
a 15.31 B (2.06%) <img src="s.jpg" height=12 width=82>
is 8.38 B (1.13%) <img src="s.jpg" height=12 width=45>
that 8.00 B (1.08%) <img src="s.jpg" height=12 width=43>
for 6.55 B (0.88%) <img src="s.jpg" height=12 width=35>
it 5.74 B (0.77%) <img src="s.jpg" height=12 width=31>
as 5.70 B (0.77%) <img src="s.jpg" height=12 width=31>
was 5.50 B (0.74%) <img src="s.jpg" height=12 width=30>
with 5.18 B (0.70%) <img src="s.jpg" height=12 width=28>
be 4.82 B (0.65%) <img src="s.jpg" height=12 width=26>
by 4.70 B (0.63%) <img src="s.jpg" height=12 width=25>
on 4.59 B (0.62%) <img src="s.jpg" height=12 width=25>
not 4.52 B (0.61%) <img src="s.jpg" height=12 width=24>
he 4.11 B (0.55%) <img src="s.jpg" height=12 width=22>
i 3.88 B (0.52%) <img src="s.jpg" height=12 width=21>
this 3.83 B (0.51%) <img src="s.jpg" height=12 width=21>
are 3.70 B (0.50%) <img src="s.jpg" height=12 width=20>
or 3.67 B (0.49%) <img src="s.jpg" height=12 width=20>
his 3.61 B (0.49%) <img src="s.jpg" height=12 width=19>
from 3.47 B (0.47%) <img src="s.jpg" height=12 width=19>
at 3.41 B (0.46%) <img src="s.jpg" height=12 width=18>
which 3.14 B (0.42%) <img src="s.jpg" height=12 width=17>
but 2.79 B (0.38%) <img src="s.jpg" height=12 width=15>
have 2.78 B (0.37%) <img src="s.jpg" height=12 width=15>
an 2.73 B (0.37%) <img src="s.jpg" height=12 width=15>
had 2.62 B (0.35%) <img src="s.jpg" height=12 width=14>
they 2.46 B (0.33%) <img src="s.jpg" height=12 width=13>
you 2.34 B (0.31%) <img src="s.jpg" height=12 width=13>
were 2.27 B (0.31%) <img src="s.jpg" height=12 width=12>
their 2.15 B (0.29%) <img src="s.jpg" height=12 width=12>
one 2.15 B (0.29%) <img src="s.jpg" height=12 width=12>
all 2.06 B (0.28%) <img src="s.jpg" height=12 width=11>
we 2.06 B (0.28%) <img src="s.jpg" height=12 width=11>
can 1.67 B (0.22%) <img src="s.jpg" height=12 width=9>
her 1.63 B (0.22%) <img src="s.jpg" height=12 width=9>
has 1.63 B (0.22%) <img src="s.jpg" height=12 width=9>
there 1.62 B (0.22%) <img src="s.jpg" height=12 width=9>
been 1.62 B (0.22%) <img src="s.jpg" height=12 width=9>
if 1.56 B (0.21%) <img src="s.jpg" height=12 width=8>
more 1.55 B (0.21%) <img src="s.jpg" height=12 width=8>
when 1.52 B (0.20%) <img src="s.jpg" height=12 width=8>
will 1.49 B (0.20%) <img src="s.jpg" height=12 width=8>
would 1.47 B (0.20%) <img src="s.jpg" height=12 width=8>
who 1.46 B (0.20%) <img src="s.jpg" height=12 width=8>
so 1.45 B (0.19%) <img src="s.jpg" height=12 width=8>
no 1.40 B (0.19%) <img src="s.jpg" height=12 width=8>
>>> for n in sorted(H):
print '%2d %9.2f M (%6.3f%%) <img src="s.jpg" height=12 width=%d> %d' % (n, H[n]/1e6, H[n]*100./NN, H[n]*3000./NN, n)
...
1 22301.22 M ( 2.998%) <img src="s.jpg" height=12 width=89> 1
2 131293.85 M (17.651%) <img src="s.jpg" height=12 width=529> 2
3 152568.38 M (20.511%) <img src="s.jpg" height=12 width=615> 3
4 109988.33 M (14.787%) <img src="s.jpg" height=12 width=443> 4
5 79589.32 M (10.700%) <img src="s.jpg" height=12 width=320> 5
6 62391.21 M ( 8.388%) <img src="s.jpg" height=12 width=251> 6
7 59052.66 M ( 7.939%) <img src="s.jpg" height=12 width=238> 7
8 44207.29 M ( 5.943%) <img src="s.jpg" height=12 width=178> 8
9 33006.93 M ( 4.437%) <img src="s.jpg" height=12 width=133> 9
10 22883.84 M ( 3.076%) <img src="s.jpg" height=12 width=92> 10
11 13098.06 M ( 1.761%) <img src="s.jpg" height=12 width=52> 11
12 7124.15 M ( 0.958%) <img src="s.jpg" height=12 width=28> 12
13 3850.58 M ( 0.518%) <img src="s.jpg" height=12 width=15> 13
14 1653.08 M ( 0.222%) <img src="s.jpg" height=12 width=6> 14
15 565.24 M ( 0.076%) <img src="s.jpg" height=12 width=2> 15
16 151.22 M ( 0.020%) <img src="s.jpg" height=12 width=0> 16
17 72.81 M ( 0.010%) <img src="s.jpg" height=12 width=0> 17
18 28.62 M ( 0.004%) <img src="s.jpg" height=12 width=0> 18
19 8.51 M ( 0.001%) <img src="s.jpg" height=12 width=0> 19
20 6.35 M ( 0.001%) <img src="s.jpg" height=12 width=0> 20
21 0.13 M ( 0.000%) <img src="s.jpg" height=12 width=0> 21
22 0.81 M ( 0.000%) <img src="s.jpg" height=12 width=0> 22
23 0.32 M ( 0.000%) <img src="s.jpg" height=12 width=0> 23
>>> NL = sum(LC.values())
>>> for L in sorted(LC, key=lambda L: -LC[L]):
print '%s %8.1f B (%5.2f%%) <img src="s.jpg" height=12 width=%d>' % (L, LC[L]/1e9, LC[L]*100./NL, LC[L]*3000./NL)
...
E 445.2 B (12.49%) <img src="s.jpg" height=12 width=374>
T 330.5 B ( 9.28%) <img src="s.jpg" height=12 width=278>
A 286.5 B ( 8.04%) <img src="s.jpg" height=12 width=241>
O 272.3 B ( 7.64%) <img src="s.jpg" height=12 width=229>
I 269.7 B ( 7.57%) <img src="s.jpg" height=12 width=227>
N 257.8 B ( 7.23%) <img src="s.jpg" height=12 width=217>
S 232.1 B ( 6.51%) <img src="s.jpg" height=12 width=195>
R 223.8 B ( 6.28%) <img src="s.jpg" height=12 width=188>
H 180.1 B ( 5.05%) <img src="s.jpg" height=12 width=151>
L 145.0 B ( 4.07%) <img src="s.jpg" height=12 width=122>
D 136.0 B ( 3.82%) <img src="s.jpg" height=12 width=114>
C 119.2 B ( 3.34%) <img src="s.jpg" height=12 width=100>
U 97.3 B ( 2.73%) <img src="s.jpg" height=12 width=81>
M 89.5 B ( 2.51%) <img src="s.jpg" height=12 width=75>
F 85.6 B ( 2.40%) <img src="s.jpg" height=12 width=72>
P 76.1 B ( 2.14%) <img src="s.jpg" height=12 width=64>
G 66.6 B ( 1.87%) <img src="s.jpg" height=12 width=56>
W 59.7 B ( 1.68%) <img src="s.jpg" height=12 width=50>
Y 59.3 B ( 1.66%) <img src="s.jpg" height=12 width=49>
B 52.9 B ( 1.48%) <img src="s.jpg" height=12 width=44>
V 37.5 B ( 1.05%) <img src="s.jpg" height=12 width=31>
K 19.3 B ( 0.54%) <img src="s.jpg" height=12 width=16>
X 8.4 B ( 0.23%) <img src="s.jpg" height=12 width=7>
J 5.7 B ( 0.16%) <img src="s.jpg" height=12 width=4>
Q 4.3 B ( 0.12%) <img src="s.jpg" height=12 width=3>
Z 3.2 B ( 0.09%) <img src="s.jpg" height=12 width=2>
>>> D2 = ngramcount(D, 2)
>>> for ng in sorted(D2, key=lambda L: -D2[L])[:50]: print '%s %8.1f B (%5.2f%%) <img src="o.jpg" height=12 width=%d>' % (ng, D2[ng]/1e9, D2[ng]*100./N2, D2[ng]*15000./N2)
def doit(k=25):
counts = [sortedby(ngramcount(D, n))[:k] for n in range(2, 10)]
for i in range(k):
print (' '.join(count[i] for count in counts)).lower()
"""

lis.py (Normal file, 145 lines)
@@ -0,0 +1,145 @@
################ Lispy: Scheme Interpreter in Python
## (c) Peter Norvig, 2010-16; See http://norvig.com/lispy.html
from __future__ import division
import math
import operator as op
################ Types
Symbol = str # A Lisp Symbol is implemented as a Python str
List = list # A Lisp List is implemented as a Python list
Number = (int, float) # A Lisp Number is implemented as a Python int or float
################ Parsing: parse, tokenize, and read_from_tokens
def parse(program):
"Read a Scheme expression from a string."
return read_from_tokens(tokenize(program))
def tokenize(s):
"Convert a string into a list of tokens."
return s.replace('(',' ( ').replace(')',' ) ').split()
def read_from_tokens(tokens):
"Read an expression from a sequence of tokens."
if len(tokens) == 0:
raise SyntaxError('unexpected EOF while reading')
token = tokens.pop(0)
if '(' == token:
L = []
while tokens[0] != ')':
L.append(read_from_tokens(tokens))
tokens.pop(0) # pop off ')'
return L
elif ')' == token:
raise SyntaxError('unexpected )')
else:
return atom(token)
def atom(token):
"Numbers become numbers; every other token is a symbol."
try: return int(token)
except ValueError:
try: return float(token)
except ValueError:
return Symbol(token)
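# For example: atom('2') ==> 2, atom('2.5') ==> 2.5, atom('x') ==> 'x', and
# parse('(+ 1 (* 2 3))') ==> ['+', 1, ['*', 2, 3]]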
################ Environments
def standard_env():
"An environment with some Scheme standard procedures."
env = Env()
env.update(vars(math)) # sin, cos, sqrt, pi, ...
env.update({
'+':op.add, '-':op.sub, '*':op.mul, '/':op.truediv,
'>':op.gt, '<':op.lt, '>=':op.ge, '<=':op.le, '=':op.eq,
'abs': abs,
'append': op.add,
'apply': apply,
'begin': lambda *x: x[-1],
'car': lambda x: x[0],
'cdr': lambda x: x[1:],
'cons': lambda x,y: [x] + y,
'eq?': op.is_,
'equal?': op.eq,
'length': len,
'list': lambda *x: list(x),
'list?': lambda x: isinstance(x,list),
'map': map,
'max': max,
'min': min,
'not': op.not_,
'null?': lambda x: x == [],
'number?': lambda x: isinstance(x, Number),
'procedure?': callable,
'round': round,
'symbol?': lambda x: isinstance(x, Symbol),
})
return env
class Env(dict):
"An environment: a dict of {'var':val} pairs, with an outer Env."
def __init__(self, parms=(), args=(), outer=None):
self.update(zip(parms, args))
self.outer = outer
def find(self, var):
"Find the innermost Env where var appears."
return self if (var in self) else self.outer.find(var)
global_env = standard_env()
################ Interaction: A REPL
def repl(prompt='lis.py> '):
"A prompt-read-eval-print loop."
while True:
val = eval(parse(raw_input(prompt)))
if val is not None:
print(lispstr(val))
def lispstr(exp):
"Convert a Python object back into a Lisp-readable string."
if isinstance(exp, List):
return '(' + ' '.join(map(lispstr, exp)) + ')'
else:
return str(exp)
################ Procedures
class Procedure(object):
"A user-defined Scheme procedure."
def __init__(self, parms, body, env):
self.parms, self.body, self.env = parms, body, env
def __call__(self, *args):
return eval(self.body, Env(self.parms, args, self.env))
################ eval
def eval(x, env=global_env):
"Evaluate an expression in an environment."
if isinstance(x, Symbol): # variable reference
return env.find(x)[x]
elif not isinstance(x, List): # constant literal
return x
elif x[0] == 'quote': # (quote exp)
(_, exp) = x
return exp
elif x[0] == 'if': # (if test conseq alt)
(_, test, conseq, alt) = x
exp = (conseq if eval(test, env) else alt)
return eval(exp, env)
elif x[0] == 'define': # (define var exp)
(_, var, exp) = x
env[var] = eval(exp, env)
elif x[0] == 'set!': # (set! var exp)
(_, var, exp) = x
env.find(var)[var] = eval(exp, env)
elif x[0] == 'lambda': # (lambda (var...) body)
(_, parms, body) = x
return Procedure(parms, body, env)
else: # (proc arg...)
proc = eval(x[0], env)
args = [eval(exp, env) for exp in x[1:]]
return proc(*args)
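# A small usage example (evaluated against the default global_env):
#   eval(parse('(define r 10)'))
#   eval(parse('(* pi (* r r))'))  ==>  314.1592653589793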

lispy.py (Normal file, 318 lines)
@@ -0,0 +1,318 @@
################ Scheme Interpreter in Python
## (c) Peter Norvig, 2010; See http://norvig.com/lispy2.html
################ Symbol, Procedure, classes
from __future__ import division
import re, sys, StringIO
class Symbol(str): pass
def Sym(s, symbol_table={}):
"Find or create unique Symbol entry for str s in symbol table."
if s not in symbol_table: symbol_table[s] = Symbol(s)
return symbol_table[s]
_quote, _if, _set, _define, _lambda, _begin, _definemacro, = map(Sym,
"quote if set! define lambda begin define-macro".split())
_quasiquote, _unquote, _unquotesplicing = map(Sym,
"quasiquote unquote unquote-splicing".split())
class Procedure(object):
"A user-defined Scheme procedure."
def __init__(self, parms, exp, env):
self.parms, self.exp, self.env = parms, exp, env
def __call__(self, *args):
return eval(self.exp, Env(self.parms, args, self.env))
################ parse, read, and user interaction
def parse(inport):
"Parse a program: read and expand/error-check it."
# Backwards compatibility: given a str, convert it to an InPort
if isinstance(inport, str): inport = InPort(StringIO.StringIO(inport))
return expand(read(inport), toplevel=True)
eof_object = Symbol('#<eof-object>') # Note: uninterned; can't be read
class InPort(object):
"An input port. Retains a line of chars."
tokenizer = r"""\s*(,@|[('`,)]|"(?:[\\].|[^\\"])*"|;.*|[^\s('"`,;)]*)(.*)"""
def __init__(self, file):
self.file = file; self.line = ''
def next_token(self):
"Return the next token, reading new text into line buffer if needed."
while True:
if self.line == '': self.line = self.file.readline()
if self.line == '': return eof_object
token, self.line = re.match(InPort.tokenizer, self.line).groups()
if token != '' and not token.startswith(';'):
return token
def readchar(inport):
"Read the next character from an input port."
if inport.line != '':
ch, inport.line = inport.line[0], inport.line[1:]
return ch
else:
return inport.file.read(1) or eof_object
def read(inport):
"Read a Scheme expression from an input port."
def read_ahead(token):
if '(' == token:
L = []
while True:
token = inport.next_token()
if token == ')': return L
else: L.append(read_ahead(token))
elif ')' == token: raise SyntaxError('unexpected )')
elif token in quotes: return [quotes[token], read(inport)]
elif token is eof_object: raise SyntaxError('unexpected EOF in list')
else: return atom(token)
# body of read:
token1 = inport.next_token()
return eof_object if token1 is eof_object else read_ahead(token1)
quotes = {"'":_quote, "`":_quasiquote, ",":_unquote, ",@":_unquotesplicing}
def atom(token):
'Numbers become numbers; #t and #f are booleans; "..." string; otherwise Symbol.'
if token == '#t': return True
elif token == '#f': return False
elif token[0] == '"': return token[1:-1].decode('string_escape')
try: return int(token)
except ValueError:
try: return float(token)
except ValueError:
try: return complex(token.replace('i', 'j', 1))
except ValueError:
return Sym(token)
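# For example: atom('#t') ==> True, atom('"hi"') ==> 'hi', atom('3') ==> 3,
# atom('3.5') ==> 3.5, atom('3i') ==> 3j, and atom('foo') ==> Sym('foo')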
def to_string(x):
"Convert a Python object back into a Lisp-readable string."
if x is True: return "#t"
elif x is False: return "#f"
elif isa(x, Symbol): return x
elif isa(x, str): return '"%s"' % x.encode('string_escape').replace('"',r'\"')
elif isa(x, list): return '('+' '.join(map(to_string, x))+')'
elif isa(x, complex): return str(x).replace('j', 'i')
else: return str(x)
def load(filename):
"Eval every expression from a file."
repl(None, InPort(open(filename)), None)
def repl(prompt='lispy> ', inport=InPort(sys.stdin), out=sys.stdout):
"A prompt-read-eval-print loop."
sys.stderr.write("Lispy version 2.0\n")
while True:
try:
if prompt: sys.stderr.write(prompt)
x = parse(inport)
if x is eof_object: return
val = eval(x)
if val is not None and out: print >> out, to_string(val)
except Exception as e:
print '%s: %s' % (type(e).__name__, e)
################ Environment class
class Env(dict):
"An environment: a dict of {'var':val} pairs, with an outer Env."
def __init__(self, parms=(), args=(), outer=None):
# Bind parm list to corresponding args, or single parm to list of args
self.outer = outer
if isa(parms, Symbol):
self.update({parms:list(args)})
else:
if len(args) != len(parms):
raise TypeError('expected %s, given %s, '
% (to_string(parms), to_string(args)))
self.update(zip(parms,args))
def find(self, var):
"Find the innermost Env where var appears."
if var in self: return self
elif self.outer is None: raise LookupError(var)
else: return self.outer.find(var)
def is_pair(x): return x != [] and isa(x, list)
def cons(x, y): return [x]+y
def callcc(proc):
"Call proc with current continuation; escape only"
ball = RuntimeWarning("Sorry, can't continue this continuation any longer.")
def throw(retval): ball.retval = retval; raise ball
try:
return proc(throw)
except RuntimeWarning as w:
if w is ball: return ball.retval
else: raise w
def add_globals(self):
"Add some Scheme standard procedures."
import math, cmath, operator as op
self.update(vars(math))
self.update(vars(cmath))
self.update({
'+':op.add, '-':op.sub, '*':op.mul, '/':op.div, 'not':op.not_,
'>':op.gt, '<':op.lt, '>=':op.ge, '<=':op.le, '=':op.eq,
'equal?':op.eq, 'eq?':op.is_, 'length':len, 'cons':cons,
'car':lambda x:x[0], 'cdr':lambda x:x[1:], 'append':op.add,
'list':lambda *x:list(x), 'list?': lambda x:isa(x,list),
'null?':lambda x:x==[], 'symbol?':lambda x: isa(x, Symbol),
'boolean?':lambda x: isa(x, bool), 'pair?':is_pair,
'port?': lambda x:isa(x,file), 'apply':lambda proc,l: proc(*l),
'eval':lambda x: eval(expand(x)), 'load':lambda fn: load(fn), 'call/cc':callcc,
'open-input-file':open,'close-input-port':lambda p: p.file.close(),
'open-output-file':lambda f:open(f,'w'), 'close-output-port':lambda p: p.close(),
'eof-object?':lambda x:x is eof_object, 'read-char':readchar,
'read':read, 'write':lambda x,port=sys.stdout:port.write(to_string(x)),
'display':lambda x,port=sys.stdout:port.write(x if isa(x,str) else to_string(x))})
return self
isa = isinstance
global_env = add_globals(Env())
################ eval (tail recursive)
def eval(x, env=global_env):
"Evaluate an expression in an environment."
while True:
if isa(x, Symbol): # variable reference
return env.find(x)[x]
elif not isa(x, list): # constant literal
return x
elif x[0] is _quote: # (quote exp)
(_, exp) = x
return exp
elif x[0] is _if: # (if test conseq alt)
(_, test, conseq, alt) = x
x = (conseq if eval(test, env) else alt)
elif x[0] is _set: # (set! var exp)
(_, var, exp) = x
env.find(var)[var] = eval(exp, env)
return None
elif x[0] is _define: # (define var exp)
(_, var, exp) = x
env[var] = eval(exp, env)
return None
elif x[0] is _lambda: # (lambda (var*) exp)
(_, vars, exp) = x
return Procedure(vars, exp, env)
elif x[0] is _begin: # (begin exp+)
for exp in x[1:-1]:
eval(exp, env)
x = x[-1]
else: # (proc exp*)
exps = [eval(exp, env) for exp in x]
proc = exps.pop(0)
if isa(proc, Procedure):
x = proc.exp
env = Env(proc.parms, exps, proc.env)
else:
return proc(*exps)
################ expand
def expand(x, toplevel=False):
"Walk tree of x, making optimizations/fixes, and signaling SyntaxError."
require(x, x!=[]) # () => Error
if not isa(x, list): # constant => unchanged
return x
elif x[0] is _quote: # (quote exp)
require(x, len(x)==2)
return x
elif x[0] is _if:
if len(x)==3: x = x + [None] # (if t c) => (if t c None)
require(x, len(x)==4)
return map(expand, x)
elif x[0] is _set:
require(x, len(x)==3);
var = x[1] # (set! non-var exp) => Error
require(x, isa(var, Symbol), "can set! only a symbol")
return [_set, var, expand(x[2])]
elif x[0] is _define or x[0] is _definemacro:
require(x, len(x)>=3)
_def, v, body = x[0], x[1], x[2:]
if isa(v, list) and v: # (define (f args) body)
f, args = v[0], v[1:] # => (define f (lambda (args) body))
return expand([_def, f, [_lambda, args]+body])
else:
require(x, len(x)==3) # (define non-var/list exp) => Error
require(x, isa(v, Symbol), "can define only a symbol")
exp = expand(x[2])
if _def is _definemacro:
require(x, toplevel, "define-macro only allowed at top level")
proc = eval(exp)
require(x, callable(proc), "macro must be a procedure")
macro_table[v] = proc # (define-macro v proc)
return None # => None; add v:proc to macro_table
return [_define, v, exp]
elif x[0] is _begin:
if len(x)==1: return None # (begin) => None
else: return [expand(xi, toplevel) for xi in x]
elif x[0] is _lambda: # (lambda (x) e1 e2)
require(x, len(x)>=3) # => (lambda (x) (begin e1 e2))
vars, body = x[1], x[2:]
require(x, (isa(vars, list) and all(isa(v, Symbol) for v in vars))
or isa(vars, Symbol), "illegal lambda argument list")
exp = body[0] if len(body) == 1 else [_begin] + body
return [_lambda, vars, expand(exp)]
elif x[0] is _quasiquote: # `x => expand_quasiquote(x)
require(x, len(x)==2)
return expand_quasiquote(x[1])
elif isa(x[0], Symbol) and x[0] in macro_table:
return expand(macro_table[x[0]](*x[1:]), toplevel) # (m arg...)
else: # => macroexpand if m isa macro
return map(expand, x) # (f arg...) => expand each
def require(x, predicate, msg="wrong length"):
"Signal a syntax error if predicate is false."
if not predicate: raise SyntaxError(to_string(x)+': '+msg)
_append, _cons, _let = map(Sym, "append cons let".split())
def expand_quasiquote(x):
"""Expand `x => 'x; `,x => x; `(,@x y) => (append x y) """
if not is_pair(x):
return [_quote, x]
require(x, x[0] is not _unquotesplicing, "can't splice here")
if x[0] is _unquote:
require(x, len(x)==2)
return x[1]
elif is_pair(x[0]) and x[0][0] is _unquotesplicing:
require(x[0], len(x[0])==2)
return [_append, x[0][1], expand_quasiquote(x[1:])]
else:
return [_cons, expand_quasiquote(x[0]), expand_quasiquote(x[1:])]
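# For example, `(1 ,x) reads as [_quasiquote, [1, [_unquote, 'x']]], and
# expand_quasiquote([1, [_unquote, 'x']]) produces, in Scheme notation,
# (cons (quote 1) (cons x (quote ()))).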
def let(*args):
args = list(args)
x = cons(_let, args)
require(x, len(args)>1)
bindings, body = args[0], args[1:]
require(x, all(isa(b, list) and len(b)==2 and isa(b[0], Symbol)
for b in bindings), "illegal binding list")
vars, vals = zip(*bindings)
return [[_lambda, list(vars)]+map(expand, body)] + map(expand, vals)
macro_table = {_let:let} ## More macros can go here
eval(parse("""(begin
(define-macro and (lambda args
(if (null? args) #t
(if (= (length args) 1) (car args)
`(if ,(car args) (and ,@(cdr args)) #f)))))
;; More macros can also go here
)"""))
if __name__ == '__main__':
repl()

lispytest.py (Normal file, 121 lines)
@@ -0,0 +1,121 @@
################ Tests for lis.py and lispy.py
lis_tests = [
("(quote (testing 1 (2.0) -3.14e159))", ['testing', 1, [2.0], -3.14e159]),
("(+ 2 2)", 4),
("(+ (* 2 100) (* 1 10))", 210),
("(if (> 6 5) (+ 1 1) (+ 2 2))", 2),
("(if (< 6 5) (+ 1 1) (+ 2 2))", 4),
("(define x 3)", None), ("x", 3), ("(+ x x)", 6),
("(begin (define x 1) (set! x (+ x 1)) (+ x 1))", 3),
("((lambda (x) (+ x x)) 5)", 10),
("(define twice (lambda (x) (* 2 x)))", None), ("(twice 5)", 10),
("(define compose (lambda (f g) (lambda (x) (f (g x)))))", None),
("((compose list twice) 5)", [10]),
("(define repeat (lambda (f) (compose f f)))", None),
("((repeat twice) 5)", 20), ("((repeat (repeat twice)) 5)", 80),
("(define fact (lambda (n) (if (<= n 1) 1 (* n (fact (- n 1))))))", None),
("(fact 3)", 6),
("(fact 50)", 30414093201713378043612608166064768844377641568960512000000000000),
("(define abs (lambda (n) ((if (> n 0) + -) 0 n)))", None),
("(list (abs -3) (abs 0) (abs 3))", [3, 0, 3]),
("""(define combine (lambda (f)
(lambda (x y)
(if (null? x) (quote ())
(f (list (car x) (car y))
((combine f) (cdr x) (cdr y)))))))""", None),
("(define zip (combine cons))", None),
("(zip (list 1 2 3 4) (list 5 6 7 8))", [[1, 5], [2, 6], [3, 7], [4, 8]]),
("""(define riff-shuffle (lambda (deck) (begin
(define take (lambda (n seq) (if (<= n 0) (quote ()) (cons (car seq) (take (- n 1) (cdr seq))))))
(define drop (lambda (n seq) (if (<= n 0) seq (drop (- n 1) (cdr seq)))))
(define mid (lambda (seq) (/ (length seq) 2)))
((combine append) (take (mid deck) deck) (drop (mid deck) deck)))))""", None),
("(riff-shuffle (list 1 2 3 4 5 6 7 8))", [1, 5, 2, 6, 3, 7, 4, 8]),
("((repeat riff-shuffle) (list 1 2 3 4 5 6 7 8))", [1, 3, 5, 7, 2, 4, 6, 8]),
("(riff-shuffle (riff-shuffle (riff-shuffle (list 1 2 3 4 5 6 7 8))))", [1,2,3,4,5,6,7,8]),
]
lispy_tests = [
("()", SyntaxError), ("(set! x)", SyntaxError),
("(define 3 4)", SyntaxError),
("(quote 1 2)", SyntaxError), ("(if 1 2 3 4)", SyntaxError),
("(lambda 3 3)", SyntaxError), ("(lambda (x))", SyntaxError),
("""(if (= 1 2) (define-macro a 'a)
(define-macro a 'b))""", SyntaxError),
("(define (twice x) (* 2 x))", None), ("(twice 2)", 4),
("(twice 2 2)", TypeError),
("(define lyst (lambda items items))", None),
("(lyst 1 2 3 (+ 2 2))", [1,2,3,4]),
("(if 1 2)", 2),
("(if (= 3 4) 2)", None),
("(define ((account bal) amt) (set! bal (+ bal amt)) bal)", None),
("(define a1 (account 100))", None),
("(a1 0)", 100), ("(a1 10)", 110), ("(a1 10)", 120),
("""(define (newton guess function derivative epsilon)
(define guess2 (- guess (/ (function guess) (derivative guess))))
(if (< (abs (- guess guess2)) epsilon) guess2
(newton guess2 function derivative epsilon)))""", None),
("""(define (square-root a)
(newton 1 (lambda (x) (- (* x x) a)) (lambda (x) (* 2 x)) 1e-8))""", None),
("(> (square-root 200.) 14.14213)", True),
("(< (square-root 200.) 14.14215)", True),
("(= (square-root 200.) (sqrt 200.))", True),
("""(define (sum-squares-range start end)
(define (sumsq-acc start end acc)
(if (> start end) acc (sumsq-acc (+ start 1) end (+ (* start start) acc))))
(sumsq-acc start end 0))""", None),
("(sum-squares-range 1 3000)", 9004500500), ## Tests tail recursion
("(call/cc (lambda (throw) (+ 5 (* 10 (throw 1))))) ;; throw", 1),
("(call/cc (lambda (throw) (+ 5 (* 10 1)))) ;; do not throw", 15),
("""(call/cc (lambda (throw)
(+ 5 (* 10 (call/cc (lambda (escape) (* 100 (escape 3)))))))) ; 1 level""", 35),
("""(call/cc (lambda (throw)
(+ 5 (* 10 (call/cc (lambda (escape) (* 100 (throw 3)))))))) ; 2 levels""", 3),
("""(call/cc (lambda (throw)
(+ 5 (* 10 (call/cc (lambda (escape) (* 100 1))))))) ; 0 levels""", 1005),
("(* 1i 1i)", -1), ("(sqrt -1)", 1j),
("(let ((a 1) (b 2)) (+ a b))", 3),
("(let ((a 1) (b 2 3)) (+ a b))", SyntaxError),
("(and 1 2 3)", 3), ("(and (> 2 1) 2 3)", 3), ("(and)", True),
("(and (> 2 1) (> 2 3))", False),
("(define-macro unless (lambda args `(if (not ,(car args)) (begin ,@(cdr args))))) ; test `", None),
("(unless (= 2 (+ 1 1)) (display 2) 3 4)", None),
(r'(unless (= 4 (+ 1 1)) (display 2) (display "\n") 3 4)', 4),
("(quote x)", 'x'),
("(quote (1 2 three))", [1, 2, 'three']),
("'x", 'x'),
("'(one 2 3)", ['one', 2, 3]),
("(define L (list 1 2 3))", None),
("`(testing ,@L testing)", ['testing',1,2,3,'testing']),
("`(testing ,L testing)", ['testing',[1,2,3],'testing']),
("`,@L", SyntaxError),
("""'(1 ;test comments '
;skip this line
2 ; more ; comments ; ) )
3) ; final comment""", [1,2,3]),
]
def test(tests, name=''):
"For each (exp, expected) test case, see if eval(parse(exp)) == expected."
fails = 0
for (x, expected) in tests:
try:
result = eval(parse(x))
print x, '=>', to_string(result)
ok = (result == expected)
except Exception as e:
print x, '=raises=>', type(e).__name__, e
ok = issubclass(expected, Exception) and isinstance(e, expected)
if not ok:
fails += 1
print 'FAIL!!! Expected', expected
print '%s %s: %d out of %d tests fail.' % ('*'*45, name, fails, len(tests))
if __name__ == '__main__':
from lis import *
test(lis_tests, 'lis.py')
from lispy import *
test(lis_tests+lispy_tests, 'lispy.py')

pal.py (Normal file, 154 lines)
@@ -0,0 +1,154 @@
import string, random, os, re, bisect
"""Produce Panama-ish Palindromes. Copyright (C) 2002, Peter Norvig.
See http://www.norvig.com/license.html and http://www.norvig.com/pal-alg.html"""
def is_panama(p):
"Test if p is a Panama-ish palindrome."
def is_unique(seq): return len(seq) == len(dict(zip(seq, seq)))
return (p.endswith('Panama') and is_palindrome(p)
and is_unique([s.strip() for s in p.split(',')]))
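# For example: is_panama('A man, a plan, a canal, Panama') ==> True, while
# is_panama('A man, a canal, Panama') ==> False (dropping 'a plan' breaks the palindrome).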
def is_palindrome(phrase):
"Test if a phrase is a palindrome."
cphrase = canonical(phrase)
return cphrase == reverse(cphrase)
def canonical(word, sub=re.compile('[^A-Za-z0-9]').sub):
"The canonical form for comparing: lowercase alphanumerics."
return sub('', word).lower()
def read_dict(filename='npdict.txt'):
"Read the file into global variables _fw and _bw and _truename."
global _fw, _bw, _truename
_fw, _bw, _truename = [], [], {'': ''}
for word in open(filename).read().splitlines():
w = canonical(word)
_fw.append(w)
_bw.append(reverse(w))
_truename[w] = word
_fw.sort(); _bw.sort()
return len(_fw), len(_bw), len(_truename)
def update(obj, **entries): obj.__dict__.update(entries); return obj
class PalDict:
"""A dictionary from which you can find canonical words that start or end
with a given canonical substring, and find the true name of a
canonical word."""
def __init__(self, fw=None, bw=None, truename=None):
update(self, fw=fw or _fw, bw=bw or _bw, truename=truename or _truename)
def startswith(self, prefix, k=100):
"""Return up to k canonical words that start with prefix.
If there are more than k, choose from them at random."""
return k_startingwith(k, self.fw, prefix)
def endswith(self, suffix, k=100):
"""Return up to k canonical words that end with suffix.
If there are more than k, choose from them at random.
Both the suffix and the word returned are reversed."""
return k_startingwith(k, self.bw, suffix)
def k_startingwith(k, words, prefix):
"""Choose up to k words that match the prefix (choose randomly if > k)."""
start = bisect.bisect(words, prefix)
end = bisect.bisect(words, prefix + 'zzzz')
n = end - start
if k >= n:
results = words[start:end]
random.shuffle(results)
else: # Should really try to avoid duplicates
results = [words[random.randrange(start, end)] for i in range(k)]
return results
class Panama:
def __init__(self, L='A man, a plan', R='a canal, Panama', dict=None):
left = [canonical(w) for w in L.split(', ')]
right = [canonical(reverse(w)) for w in reverse(R.split(', '))]
update(self, left=left, right=right, dict=dict or PalDict(), best=0,
seen={}, diff=len(''.join(left)) - len(''.join(right)))
for word in left + map(reverse, right):
self.seen[word] = 1
def missing(self, k=20):
"""Return the substring that is missing, and candidate words."""
if self.diff >= 0: # Left is longer, missing on right
substr = self.left[-1][-self.diff:]
return substr, self.dict.endswith(substr, k)
else: # Right is longer, missing on left
substr = self.right[-1][self.diff:]
return substr, self.dict.startswith(substr, k)
def search(self, k=200):
"Search for palindromes; consider at most k words at each level."
self.stack = [self.missing(k)]
while self.stack:
substr, words = self.stack[-1]
if is_palindrome(substr):
self.report()
if words:
self.extend(words.pop(), k)
elif not self.backtrack():
return
def extend(self, word, k):
"Add a new word (unless we've already seen it)."
if self.diff >= 0: # Left is longer, add to right
fword = reverse(word)
if fword in self.seen: return
self.diff -= len(fword)
self.seen[fword] = 1
self.right.append(word)
self.stack.append(self.missing(k))
else: # Right is longer, add to left
if word in self.seen: return
self.diff += len(word)
self.seen[word] = 1
self.left.append(word)
self.stack.append(self.missing(k))
def backtrack(self):
"Remove the last word added; return 0 if can't backtrack"
if self.diff >= 0: # Left is longer, pop from left
if not self.left: return 0
word = self.left.pop()
self.diff -= len(word)
del self.seen[word]
else: # Right is longer, pop from right
if not self.right: return 0
word = self.right.pop()
self.diff += len(word)
del self.seen[reverse(word)]
self.stack.pop()
return 1
def report(self):
"Write current state to log file."
if len(self) > self.best + 200:
self.best = len(self)
print self.best
self.bestphrase = str(self)
assert is_panama(self.bestphrase)
f = open('pallog%d.txt' % os.getpid(), 'w')
f.write(self.bestphrase + '\n')
f.close()
def __len__(self):
return len(self.left) + len(self.right)
def __str__(self):
truename = self.dict.truename
lefts = [truename[w] for w in self.left]
rights = [truename[reverse(w)] for w in reverse(self.right[:])]
return ', '.join(lefts + ['*****'] + rights)
def reverse(x):
"Reverse a list or string."
if type(x) == type(''):
return ''.join(reverse(list(x)))
else:
x.reverse()
return x
if __name__ == '__main__': read_dict(); p = Panama(); p.search()

pal2.py (Normal file, 262 lines)
@@ -0,0 +1,262 @@
import random, re, bisect, time
"""Produce Panama-ish Palindromes. Copyright (C) 2002-2008, Peter Norvig."""
################ Checking for Palindromes
def is_panama(s):
"Test if string s is a Panama-ish palindrome."
return is_palindrome(s) and is_unique(phrases(s))
def is_palindrome(s):
"Test if a string is a palindrome."
s1 = canonical(s)
return s1 == reversestr(s1)
def phrases(s):
"Break a string s into comma-separated phrases."
return [phrase.strip() for phrase in s.split(',')]
def canonical(word, sub=re.compile('''[-* \t\n\r.,;!?:()`"']''').sub):
"The canonical form for comparing: lowercase, no blanks or punctuation."
return sub('', word).lower()
################ Utilities
def reversestr(x):
"Reverse a string."
return x[::-1]
def is_unique(seq):
"Return true if seq has no duplicate elements."
return len(seq) == len(set(seq))
def update(obj, **entries):
"Change attributes of obj, according to the keyword args."
obj.__dict__.update(entries)
return obj
################ Reading in a dictionary
class PalDict:
"""A dictionary from which you can find canonical words that start or end
with a given canonical substring, and find the true name of a
canonical word with d.truename[canonicalword]."""
def __init__(self, k=1000, filename='npdict.txt'):
words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}
for tword in open(filename).read().splitlines():
word = canonical(tword)
words.append(word)
rwords.append(reversestr(word))
truename[word] = tword
words.sort()
rwords.sort()
update(self, k=k, words=words, rwords=rwords, truename=truename,
reversibles={}, rangek=range(k), tryharder=False)
def startswith(self, prefix):
"""Return up to k canonical words that start with prefix.
If there are more than k, choose from them at random."""
return self._k_startingwith(self.words, prefix)
def endswith(self, rsuffix):
"""Return up to k canonical words that end with the reversed suffix.
If you want words ending in 'ing', ask for d.endswith('gni').
If there are more than k, choose from them at random."""
return map(reversestr, self._k_startingwith(self.rwords, rsuffix))
def __contains__(self, word):
return word in self.truename
def reversible_words(self):
"Find words that have a reverse in the dict, like {'Camus': 'Sumac'}"
if not self.reversibles:
reversibles = self.reversibles
for rw in self.rwords:
if rw in self:
w = reversestr(rw)
if w != rw and w not in reversibles:
reversibles[w] = rw
self.reversibles = reversibles
return self.reversibles
def _k_startingwith(self, words, prefix):
start = bisect.bisect_left(words, prefix)
end = bisect.bisect(words, prefix + 'zzzz')
n = end - start
if self.k >= n: # get all the words that start with prefix
results = words[start:end]
else: # sample from words starting with prefix
indexes = random.sample(xrange(start, end), self.k)
results = [words[i] for i in indexes]
random.shuffle(results)
## Consider words that are prefixes of the prefix.
## This is very slow, so don't use it until late in the game.
if self.tryharder:
for i in range(3, len(prefix)):
w = prefix[0:i]
if ((words == self.words and w in self.truename) or
(words == self.rwords and reversestr(w) in self.truename)):
results.append(w)
return results
paldict = PalDict()
def anpdictshort():
"Find the words that are valid when every phrase must start with 'a'"
def segment(word): return [s for s in word.split('a') if s]
def valid(word): return all(reversestr(s) in segments for s in segment(word))
words = map(canonical, file('anpdict.txt'))
segments = set(s for w in words for s in segment(canonical(w)))
valid_words = [paldict.truename[w] for w in words if valid(w)]
file('anpdict-short.txt', 'w').write('\n'.join(valid_words))
################ Search for a palindrome
class Panama:
def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):
## .left and .right hold lists of canonical words
## .diff holds the number of characters that are not matched,
## positive for words on left, negative for right.
## .stack holds (action, side, substr, arg) tuples
update(self, left=[], right=[], best=0, seen={}, diff=0, stack=[],
used_reversibles=False, starttime=time.clock(), dict=dict)
for word in L.split(','):
self.add('left', canonical(word))
for rword in reversestr(R).split(','):
self.add('right', canonical(reversestr(rword)))
self.consider_candidates()
def search(self, steps=50000000):
"Search for palindromes."
for _ in xrange(steps):
if not self.stack:
return 'done'
action, dir, substr, arg = self.stack[-1]
if action == 'added': # undo the last word added
self.remove(dir, arg)
elif action == 'trying' and arg: # try the next word if there is one
self.add(dir, arg.pop()) and self.consider_candidates()
elif action == 'trying' and not arg: # otherwise backtrack
self.stack.pop()
else:
raise ValueError(action)
def add(self, dir, word):
"add a word"
if word in self.seen:
return False
else:
getattr(self, dir).append(word)
self.diff += factor[dir] * len(word)
self.seen[word] = True
self.stack.append(('added', dir, '?', word))
return True
def remove(self, dir, word):
"remove a word"
oldword = getattr(self, dir).pop()
assert word == oldword
self.diff -= factor[dir] * len(word)
del self.seen[word]
self.stack.pop()
def consider_candidates(self):
"""Push a new state with a set of candidate words onto stack."""
if self.diff > 0: # Left is longer, consider adding on right
dir = 'right'
substr = self.left[-1][-self.diff:]
candidates = self.dict.endswith(substr)
elif self.diff < 0: # Right is longer, consider adding on left
dir = 'left'
substr = reversestr(self.right[-1][0:-self.diff])
candidates = self.dict.startswith(substr)
else: # Both sides are same size
dir = 'left'
if not self.used_reversibles:
self.report()
self.add_reversibles()
substr = ''
candidates = self.dict.startswith('')
if substr == reversestr(substr):
self.report()
self.stack.append(('trying', dir, substr, candidates))
def add_reversibles(self):
"Add in reversible words."
print 'using reversibles ...'
for (word, rword) in self.dict.reversible_words().items():
if word not in self.seen and rword not in self.seen:
self.add('left', word)
self.add('right', rword)
self.used_reversibles = True
self.stack = []
print '...done'
def report(self):
"Report a new palindrome to log file (if it is sufficiently big)."
N = len(self)
if N > 13333:
self.dict.tryharder = True
if N > self.best and (N > 12500 or N > self.best+500):
self.best = len(self)
self.bestphrase = str(self)
print '%5d phrases (%5d words) in %3d seconds' % (
self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime)
assert is_panama(self.bestphrase)
f = open('pallog%d.txt' % (id(self) % 10000), 'w')
f.write(self.bestphrase + '\n')
f.close()
def __len__(self):
return len(self.left) + len(self.right)
def __str__(self):
truename = self.dict.truename
lefts = [truename[w] for w in self.left]
rights =[truename[w] for w in self.right]
return ', '.join(lefts + rights[::-1])
factor = {'left': +1, 'right': -1}
# Note that we only allow one truename per canonical name. Occasionally
# this means we miss a good word (as in "a node" vs. "an ode"), but there
# are only 665 of these truename collisions, and most of them are of the
# form "a mark-up" vs. "a markup" so it seemed better to disallow them.
################ Unit Tests
def tests(p=Panama()):
assert is_panama('A man, a plan, a canal, Panama.')
assert is_panama('''A (man), a plan,,;, a ```canal?'' -- Panama!''')
assert not is_panama('A man, a plan, a radar, a canal, Panama.')
assert is_palindrome('A man, a plan, a canal, Panama.')
assert is_palindrome('radar, radar? radar!')
assert not is_palindrome('radars')
assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']
assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'
assert reversestr('foo') == 'oof'
assert is_unique([1, 2, 3])
assert not is_unique([1, 2, 2])
d = p.dict
def sameset(a, b): return set(a) == set(b)
assert 'panama' in d
assert d.words[0] in d
assert d.words[-1] in d
assert sameset(d.startswith('aword'), ['awording', 'awordbreak',
'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',
'awordlessness', 'aword', 'awordsmith'])
assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',
'asubglobosity', 'anonverbosity', 'agibbosity'])
d.tryharder = True
assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])
d.tryharder = False
assert d.startswith('oklahoma') == ['oklahoma']
assert d.startswith('fsfdsfdsfds') == []
print 'all tests pass'
if __name__ == '__main__':
    p = Panama()
tests(p)
p.search()

170
pal3.py Normal file
View File

@ -0,0 +1,170 @@
from collections import Counter, deque
import re
class PhraseDict(dict):
"""A dictionary of {letters: phrase}, such as {'donaldeknuth': 'Donald E. Knuth'}, with:
.prefixes: Counter of {'pre': n} where n is the number of keys that start with 'pre'
.suffixes: Counter of {'xes': n} where n is the number of keys that end with 'xes'"""
def __init__(self, phrases):
for phrase in phrases:
phrase = phrase.strip()
self[letters(phrase)] = phrase
self.prefixes = Counter(x for p in self for x in prefixes(p))
self.suffixes = Counter(x for p in self for x in suffixes(p))
def prefixes(phrase): return [phrase[:i] for i in range(1, len(phrase) + 1)]
def suffixes(phrase): return [phrase[-i:] for i in range(1, len(phrase) + 1)]
def letters(phrase, sub=re.compile(r'[\W]+').sub):
"Remove all the non-letters from phrase; return lowercase version."
return sub('', phrase).lower()
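# A small illustrative check of the three helpers above (added for exposition;
# the original file keeps its tests in test1() below):
assert prefixes('plan') == ['p', 'pl', 'pla', 'plan']
assert suffixes('plan') == ['n', 'an', 'lan', 'plan']
assert letters('A plan!') == 'aplan'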
DICT = PhraseDict(open('npdict.txt'))
class Panama:
"""Panama represents a palindrome, or a state in searching for one.
It has .left and .right to hold the phrases that are chosen,
and .L and .R to hold the current partial phrases in the middle (still working on these).
Also, a .set of all complete phrases, and the .dict of allowable phrases to choose from."""
def __init__(self, left=['aman', 'aplan'], L='aca', R='', right=['acanal', 'panama'], dict=DICT):
assert cat(left + [L]) == cat([R] + right)[::-1]
self.left = list(left) # list of complete phrases on left
self.L = L # an incomplete phrase on left
self.R = R # an incomplete phrase on right
self.right = deque(right) # deque of complete phrases on right
self.dict = dict # a {letters: actual_phrase} mapping
self.set = set(left + right) # a set of all complete phrases in palindrome
self.best = [] # list of phrases in longest palindrome found
self.Nshown = 0 # the number of phrases shown in the previous printout
self.i = 0 # the number of steps taken in the search
self.check()
def __str__(self): return self.original_phrases(self.best)
def original_phrases(self, phrases): return ', '.join(self.dict[phrase] for phrase in phrases)
def search(self, steps=10**5):
"""Depth-first search for palindromes. From the current state, find all applicable actions.
Do the first one, and put on the stack reminders to undo it and try the others,
but first search deeper from the result of the first action."""
stack = [self.applicable_actions()]
for self.i in range(steps):
if not stack:
return
command = stack.pop()
if isinstance(command, UndoCommand):
self.undo(command)
elif command:
act = command.pop()
self.do(act)
self.check()
stack.extend([command, UndoCommand(act), self.applicable_actions()])
def do(self, act):
"Modify the current state by adding a letter, or finishing a phrase."
if act == ',': # finish phrase on left
self.set.add(self.L)
self.left.append(self.L)
self.L = ''
elif act == ';': # finish phrase on right
self.set.add(self.R)
self.right.appendleft(self.R)
self.R = ''
else: # add a letter
self.L = self.L + act
self.R = act + self.R
def undo(self, act):
"Modify the current state by undoing an action that was previously done."
if act == ',': # unfinish phrase on left
assert self.L == ''
self.L = self.left.pop()
self.set.remove(self.L)
elif act == ';': # unfinish phrase on right
assert self.R == ''
self.R = self.right.popleft()
self.set.remove(self.R)
else: # remove a letter
self.L = self.L[:-1]
self.R = self.R[1:]
def check(self):
"Check to see if current state is a palindrome, and if so, record it and maybe print."
if not self.is_palindrome(): return
N = len(self.left) + len(self.right)
if N > len(self.best):
self.best = self.left + list(self.right)
if N - self.Nshown > 1000 or (N > 14000 and N - self.Nshown > 100) or N > 14500:
self.Nshown = N
print(self.report())
def report(self):
N = len(self.best)
nwords = N + sum(self.dict[p].count(' ') for p in self.best)
nletters = sum(len(p) for p in self.best)
return ('Pal: {:6,d} phrases, {:6,d} words, {:6,d} letters (at step {:,d})'
.format(N, nwords, nletters, self.i+1))
def applicable_actions(self):
L, R, D = self.L, self.R, self.dict
actions = []
def score(A): return D.prefixes[L+A] * D.suffixes[A+R]
if self.is_allowed(L):
actions.append(',')
if self.is_allowed(R):
actions.append(';')
for A in sorted(alphabet, key=score):
if score(A) > 0:
actions.append(A)
return actions
def is_allowed(self, phrase): return phrase in self.dict and phrase not in self.set
def is_palindrome(self):
"Is this a palindrome? (Does any extra .L or .R match the other side?)"
return ((self.L == '' and self.left[-1].endswith(self.R)) or
(self.R == '' and self.right[0].startswith(self.L)))
alphabet = 'abcdefghijklmnopqrstuvwxyz'
cat = ''.join
UndoCommand = str
DoCommand = list
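# Illustrative round trip (added for exposition; assumes the default DICT has loaded):
#     p = Panama()              # left = ['aman', 'aplan'], L = 'aca', R = ''
#     p.do('n'); p.do('a')      # now L == 'acana' and R == 'an'
#     p.undo('a'); p.undo('n')  # back to L == 'aca', R == ''
# An UndoCommand on the search stack is just the action string to reverse;
# a DoCommand is the (mutable) list of remaining actions still to try.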
################ Unit Tests
def test1():
assert prefixes('hello') == ['h', 'he', 'hel', 'hell', 'hello']
assert suffixes('hello') == ['o', 'lo', 'llo', 'ello', 'hello']
assert letters('a man') == 'aman'
assert letters('an elk') == 'anelk'
assert letters('Mr. T') == 'mrt'
assert letters('Donald E. Knuth') == 'donaldeknuth'
assert len(DICT) == 125512
assert 'panama' in DICT
assert 'aman' in DICT
assert 'threemen' not in DICT
assert DICT['acanal'] == 'a canal'
return 'ok'
def test2():
p1 = Panama()
assert p1.is_palindrome()
assert str(p1) == 'a man, a plan, a canal, Panama'
p2 = Panama(['aman','aplan'], 'acadd','dd', ['acanal', 'panama'])
assert not p2.is_palindrome()
p3 = Panama(['maya'], '', '', ['ayam'])
assert p3.is_palindrome()
assert str(p3) == 'Maya, a yam'
return 'ok'
if __name__ == '__main__':
    p = Panama()
test1()
test2()
p.search(10**6)
print(p.report())
print(str(p))

52
parse.py Normal file
View File

@ -0,0 +1,52 @@
grammar = {
'Noun': ['stench', 'wumpus'],
'Verb': ['is', 'smell'],
'Adjective': ['dead', 'smelly'],
'Adverb': ['left', 'back'],
'Pronoun': ['me', 'you'],
'Name': ['John', 'Mary'],
'Article': ['the', 'a'],
'Preposition': ['to', 'in'],
'Conjunction': ['and', 'or'],
'Digit': ['0', '1'],
    'S': [['NP', 'VP'], ['S', 'Conjunction', 'S']],
'NP': ['Pronoun', 'Noun', ['Article', 'Noun'], ['Digit', 'Digit'],
['NP', 'PP'], ['NP', 'RelClause']],
'VP': ['Verb', ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'],
['VP', 'Adverb']],
'PP': [['Preposition', 'NP']],
'RelClause': [['that', 'VP']]
}
def parse(forest, grammar):
if len(forest) == 1 and category(forest[0]) == 'S':
return forest[0]
for i in range(len(forest)):
for lhs in grammar.keys():
for rhs in grammar[lhs]:
rhs = mklist(rhs)
n = len(rhs)
subsequence = forest[i:i+n]
if match(subsequence, rhs):
print subsequence, lhs, '=>', rhs
forest2 = forest[:]
forest2[i:i+n] = [(lhs, subsequence)]
result = parse(forest2, grammar)
if result != None:
return result
return None
def mklist(x):
if type(x) == type([]): return x
else: return [x]
def match(forest, rhs):
for i in range(len(rhs)):
if category(forest[i]) != rhs[i] and forest[i] != rhs[i]: return 0
return 1
def category(forest):
if type(forest) == type(()): return forest[0]
else: return 'word'
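# Minimal demo (added for illustration; not part of the original file). parse() does a
# naive bottom-up search and prints every reduction it tries before returning the first
# complete parse it finds, a nested tuple rooted at 'S'.
if __name__ == '__main__':
    print parse(['the', 'wumpus', 'is', 'dead'], grammar)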

110
py2html.py Normal file
View File

@ -0,0 +1,110 @@
"""Pretty-print Python code to colorized, hyperlinked html.
In Python, do:
py2html.convert_files(['file1.py', 'file2.py', ...])
From the shell, do:
python py2html.py *.py"""
import re, string, time, os
id = r'[a-zA-Z_][a-zA-Z_0-9]*' ## RE for a Python identifier
g1, g2, g3, g4 = r'\1 \2 \3 \4'.split() ## groups for re.matches
def b(text): return '<b>%s</b>' % text
def i(text): return '<i>%s</i>' % text
def color(rgb, text): return '<font color="%s">%s</font>' % (rgb, text)
def link(url, anchor): return '<a href="%s">%s</a>' % (url, anchor)
def hilite(text, bg="ffff00"):
return '<b style="background-color:%s"><a name="%s">%s</b>' % (
bg, text, text)
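## Illustrative examples of the helpers above (added; not in the original file):
##   b('def')                    => '<b>def</b>'
##   color('green', 'comment')   => '<font color="green">comment</font>'
##   link('spell.html', 'spell') => '<a href="spell.html">spell</a>'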
def modulelink(module, baseurl=''):
"""Hyperlink to a module, either locally or on python.org"""
if module+'.py' not in local_files:
baseurl = 'http://www.python.org/doc/current/lib/module-'
return link(baseurl+module+'.html', module)
def importer(m):
"Turn text such as 'utils, math, re' into a string of HTML links."
modules = [modulelink(mod.strip()) for mod in m.group(2).split(',')]
return (m.group(1) + ', '.join(modules) + m.group(3))
def find1(regex, str):
return (re.findall(regex, str) or ['&nbsp;'])[0]
def convert_files(filenames, local_filenames=None, tblfile='readme.htm'):
"Convert files of python code to colorized HTML."
global local_files
local_files = local_filenames or filenames
summary_table = {}
for f in filenames:
fulltext = '\n'.join(map(string.rstrip, open(f).readlines()))
text = fulltext
for (pattern, repl) in replacements:
text = re.sub(pattern, repl, text)
text = '<<header("AIMA Python file: %s")>><pre>%s</pre><<footer>>' % (
f, text)
open(f[:-3]+'.htm', 'w').write(text)
if tblfile:
ch = find1(r'Chapters?\s+([^ \)"]*)', fulltext)
module = f.replace('.py','')
lines = fulltext.count('\n')
desc = find1(r'"""(.*)\n', fulltext).replace('"""', '')
summary_table.setdefault(ch,[]).append((module, lines, desc))
if tblfile:
totallines = 0
tbl = ["<tr><th>Chapter<th>Module<th>Files<th>Lines<th>Description"]
fmt = "<tr><td align=right>%s<th>%s<td>%s<td align=right>%s<td>%s"
items = summary_table.items(); items.sort(num_cmp)
for (ch, entries) in items:
for (module, lines, desc) in entries:
totallines += lines
files = link(module+'.py', '.py')
if os.path.exists(module+'.txt'):
files += ' ' + link(module+'.txt', '.txt')
tbl += [fmt % (ch, link(module+'.html', module),
files, lines, desc)]
tbl += [fmt % ('', '', '', totallines, ''), "</table>"]
## Now read the tblfile, and replace the first table with tbl
old = open(tblfile).read()
new = re.sub("(?s)(<table border=1>)(.*)(</table>)",
r'\1' + '\n'.join(tbl) + r'\3', old, 1)
open(tblfile, 'w').write(new)
def num_cmp(x, y):
def num(x):
nums = re.findall('[0-9]+', x or '')
if nums: return int(nums[0])
return x
return cmp(num(x[0]), num(y[0]))
### Above is general (more or less); below is specific to my files.
def comment(text): return i(color("green", text))
replacements = [
(r'&', '&amp;'),
(r'<', '&lt;'),
(r'>', '&gt;'),
(r'(?ms)^#+[#_]{10,} *\n', '<hr>'),
(r"""('[^']*?'|"[^"]*?")""", comment(g1)),
(r'(?s)(""".*?"""|' + r"'''.*?''')", comment(g1)),
(r'(#.*)', color("cc33cc", g1)),
(r'(?m)(^[a-zA-Z][a-zA-Z_0-9, ]+)(\s+=\s+)', hilite(g1) + g2),
(r'(?m)(^\s*)(def\s+)(%s)' % id, g1 + b(g2) + hilite(g3)),
(r'(?m)(^\s*)(class\s+)(%s)' % id, g1 + b(g2) + hilite(g3)),
(r'(from\s+)([a-z]+)(\s+import)', importer),
(r'(import\s+)([a-z, ]+)(\s|\n|$|,)', importer),
]
if __name__ == '__main__':
import sys, glob
files = []
for arg in sys.argv[1:]:
files.extend(glob.glob(arg))
convert_files(files)
## ENHANCEMENTS:
## Can get confused with """ and '''; not a problem in practice.
## Maybe we should create an index
## Probably should switch to Doxygen

106
spell.py Normal file
View File

@ -0,0 +1,106 @@
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
Copyright (c) 2007-2016 Peter Norvig
MIT license: www.opensource.org/licenses/mit-license.php
"""
################ Spelling Corrector
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('big.txt').read()))
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return WORDS[word] / N
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
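# A quick count (added for exposition; it follows directly from the definition above):
# for a word of length n, edits1 builds n deletes, n-1 transposes, 26*n replaces and
# 26*(n+1) inserts, i.e. 54*n + 25 strings in all before set() removes duplicates.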
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
################ Test Code
def unit_tests():
assert correction('speling') == 'spelling' # insert
assert correction('korrectud') == 'corrected' # replace 2
assert correction('bycycle') == 'bicycle' # replace
assert correction('inconvient') == 'inconvenient' # insert 2
assert correction('arrainged') == 'arranged' # delete
assert correction('peotry') =='poetry' # transpose
assert correction('peotryy') =='poetry' # transpose + delete
assert correction('word') == 'word' # known
assert correction('quintessential') == 'quintessential' # unknown
assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
assert Counter(words('This is a test. 123; A TEST this is.')) == (
Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2}))
assert len(WORDS) == 32192
assert sum(WORDS.values()) == 1115504
assert WORDS.most_common(10) == [
('the', 79808),
('of', 40024),
('and', 38311),
('to', 28765),
('in', 22020),
('a', 21124),
('that', 12512),
('he', 12401),
('was', 11410),
('it', 10681)]
assert WORDS['the'] == 79808
assert P('quintessential') == 0
assert 0.07 < P('the') < 0.08
return 'unit_tests pass'
def spelltest(tests, verbose=False):
"Run correction(wrong) on all (right, wrong) pairs; report results."
import time
start = time.clock()
good, unknown = 0, 0
n = len(tests)
for right, wrong in tests:
w = correction(wrong)
good += (w == right)
if w != right:
unknown += (right not in WORDS)
if verbose:
print('correction({}) => {} ({}); expected {} ({})'
.format(wrong, w, WORDS[w], right, WORDS[right]))
dt = time.clock() - start
print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
.format(good / n, n, unknown / n, n / dt))
def Testset(lines):
"Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
return [(right, wrong)
for (right, wrongs) in (line.split(':') for line in lines)
for wrong in wrongs.split()]
if __name__ == '__main__':
print(unit_tests())
spelltest(Testset(open('spell-testset1.txt')))
spelltest(Testset(open('spell-testset2.txt')))

201
sudoku.py Normal file
View File

@ -0,0 +1,201 @@
## Solve Every Sudoku Puzzle
## See http://norvig.com/sudoku.html
## Throughout this program we have:
## r is a row, e.g. 'A'
## c is a column, e.g. '3'
## s is a square, e.g. 'A3'
## d is a digit, e.g. '9'
## u is a unit, e.g. ['A1','B1','C1','D1','E1','F1','G1','H1','I1']
## grid is a grid, e.g. 81 non-blank chars, e.g. starting with '.18...7...
## values is a dict of possible values, e.g. {'A1':'12349', 'A2':'8', ...}
def cross(A, B):
"Cross product of elements in A and elements in B."
return [a+b for a in A for b in B]
digits = '123456789'
rows = 'ABCDEFGHI'
cols = digits
squares = cross(rows, cols)
unitlist = ([cross(rows, c) for c in cols] +
[cross(r, cols) for r in rows] +
[cross(rs, cs) for rs in ('ABC','DEF','GHI') for cs in ('123','456','789')])
units = dict((s, [u for u in unitlist if s in u])
for s in squares)
peers = dict((s, set(sum(units[s],[]))-set([s]))
for s in squares)
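## Worked example (added for exposition; it matches the unit tests below): every square
## belongs to exactly 3 units (its row, its column and its box) and has 20 distinct
## peers, so len(units['C2']) == 3 and len(peers['C2']) == 20.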
################ Unit Tests ################
def test():
"A set of tests that must pass."
assert len(squares) == 81
assert len(unitlist) == 27
assert all(len(units[s]) == 3 for s in squares)
assert all(len(peers[s]) == 20 for s in squares)
assert units['C2'] == [['A2', 'B2', 'C2', 'D2', 'E2', 'F2', 'G2', 'H2', 'I2'],
['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9'],
['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3']]
assert peers['C2'] == set(['A2', 'B2', 'D2', 'E2', 'F2', 'G2', 'H2', 'I2',
'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9',
'A1', 'A3', 'B1', 'B3'])
print 'All tests pass.'
################ Parse a Grid ################
def parse_grid(grid):
"""Convert grid to a dict of possible values, {square: digits}, or
return False if a contradiction is detected."""
## To start, every square can be any digit; then assign values from the grid.
values = dict((s, digits) for s in squares)
for s,d in grid_values(grid).items():
if d in digits and not assign(values, s, d):
return False ## (Fail if we can't assign d to square s.)
return values
def grid_values(grid):
"Convert grid into a dict of {square: char} with '0' or '.' for empties."
chars = [c for c in grid if c in digits or c in '0.']
assert len(chars) == 81
return dict(zip(squares, chars))
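## Illustrative example (added): for grid1, defined near the bottom of this file,
## grid_values(grid1)['A3'] == '3' and grid_values(grid1)['A1'] == '0' (an empty square).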
################ Constraint Propagation ################
def assign(values, s, d):
"""Eliminate all the other values (except d) from values[s] and propagate.
Return values, except return False if a contradiction is detected."""
other_values = values[s].replace(d, '')
if all(eliminate(values, s, d2) for d2 in other_values):
return values
else:
return False
def eliminate(values, s, d):
"""Eliminate d from values[s]; propagate when values or places <= 2.
Return values, except return False if a contradiction is detected."""
if d not in values[s]:
return values ## Already eliminated
values[s] = values[s].replace(d,'')
## (1) If a square s is reduced to one value d2, then eliminate d2 from the peers.
if len(values[s]) == 0:
return False ## Contradiction: removed last value
elif len(values[s]) == 1:
d2 = values[s]
if not all(eliminate(values, s2, d2) for s2 in peers[s]):
return False
## (2) If a unit u is reduced to only one place for a value d, then put it there.
for u in units[s]:
dplaces = [s for s in u if d in values[s]]
if len(dplaces) == 0:
return False ## Contradiction: no place for this value
elif len(dplaces) == 1:
# d can only be in one place in unit; assign it there
if not assign(values, dplaces[0], d):
return False
return values
################ Display as 2-D grid ################
def display(values):
"Display these values as a 2-D grid."
width = 1+max(len(values[s]) for s in squares)
line = '+'.join(['-'*(width*3)]*3)
for r in rows:
print ''.join(values[r+c].center(width)+('|' if c in '36' else '')
for c in cols)
if r in 'CF': print line
print
################ Search ################
def solve(grid): return search(parse_grid(grid))
def search(values):
"Using depth-first search and propagation, try all possible values."
if values is False:
return False ## Failed earlier
if all(len(values[s]) == 1 for s in squares):
return values ## Solved!
    ## Choose the unfilled square s with the fewest possibilities
n,s = min((len(values[s]), s) for s in squares if len(values[s]) > 1)
return some(search(assign(values.copy(), s, d))
for d in values[s])
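## Note (added for exposition): the min() over (len(values[s]), s) pairs above is the
## "minimum remaining values" heuristic: a square left with '49' (2 possibilities)
## is tried before one left with '1379' (4 possibilities).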
################ Utilities ################
def some(seq):
"Return some element of seq that is true."
for e in seq:
if e: return e
return False
def from_file(filename, sep='\n'):
"Parse a file into a list of strings, separated by sep."
return file(filename).read().strip().split(sep)
def shuffled(seq):
"Return a randomly shuffled copy of the input sequence."
seq = list(seq)
random.shuffle(seq)
return seq
################ System test ################
import time, random
def solve_all(grids, name='', showif=0.0):
"""Attempt to solve a sequence of grids. Report results.
When showif is a number of seconds, display puzzles that take longer.
When showif is None, don't display any puzzles."""
def time_solve(grid):
start = time.clock()
values = solve(grid)
t = time.clock()-start
## Display puzzles that take long enough
if showif is not None and t > showif:
display(grid_values(grid))
if values: display(values)
print '(%.2f seconds)\n' % t
return (t, solved(values))
times, results = zip(*[time_solve(grid) for grid in grids])
N = len(grids)
if N > 1:
print "Solved %d of %d %s puzzles (avg %.2f secs (%d Hz), max %.2f secs)." % (
sum(results), N, name, sum(times)/N, N/sum(times), max(times))
def solved(values):
"A puzzle is solved if each unit is a permutation of the digits 1 to 9."
def unitsolved(unit): return set(values[s] for s in unit) == set(digits)
return values is not False and all(unitsolved(unit) for unit in unitlist)
def random_puzzle(N=17):
"""Make a random puzzle with N or more assignments. Restart on contradictions.
Note the resulting puzzle is not guaranteed to be solvable, but empirically
about 99.8% of them are solvable. Some have multiple solutions."""
values = dict((s, digits) for s in squares)
for s in shuffled(squares):
if not assign(values, s, random.choice(values[s])):
break
ds = [values[s] for s in squares if len(values[s]) == 1]
if len(ds) >= N and len(set(ds)) >= 8:
return ''.join(values[s] if len(values[s])==1 else '.' for s in squares)
return random_puzzle(N) ## Give up and make a new puzzle
grid1 = '003020600900305001001806400008102900700000008006708200002609500800203009005010300'
grid2 = '4.....8.5.3..........7......2.....6.....8.4......1.......6.3.7.5..2.....1.4......'
hard1 = '.....6....59.....82....8....45........3........6..3.54...325..6..................'
if __name__ == '__main__':
test()
solve_all(from_file("easy50.txt", '========'), "easy", None)
solve_all(from_file("top95.txt"), "hard", None)
solve_all(from_file("hardest.txt"), "hardest", None)
solve_all([random_puzzle() for _ in range(99)], "random", 100.0)
## References used:
## http://www.scanraid.com/BasicStrategies.htm
## http://www.sudokudragon.com/sudokustrategy.htm
## http://www.krazydad.com/blog/2005/09/29/an-index-of-sudoku-strategies/
## http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/sudoku/

73
testaccum.py Normal file
View File

@ -0,0 +1,73 @@
from __future__ import division
import re
from accum import *
acc_re = re.compile("[[](.+):(.+) for (.+) in (.+)[]]")
def expand_accumulations(program_text):
"""Replace any accumulation displays in program_text with calls to
accumulation. Used to simulate a hypothetical Python interpreter that
    actually handles accumulation displays. This one is rather poor: it
    won't match across lines, it won't match nested accumulation displays,
    and it doesn't handle multiple 'for' clauses or 'if' clauses."""
def _(matchobj):
(acc, exp, x, it) = matchobj.groups()
return "accumulation(%s, lambda %s: (%s), %s)" % (acc, x, exp, it)
return acc_re.sub(_, program_text)
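# Illustrative example (added; not in the original file): the rewrite performed above
# turns "[Sum: x*x for x in data]" into a call of the form
# "accumulation(Sum, lambda x: ( x*x), data)" (whitespace aside), which test1() then evals.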
def test1(acc_display, expected):
"Eval an accumulation display and see if it gets the expected answer."
print acc_display
result = eval(expand_accumulations(acc_display))
assert result == expected, ('Got %s; expected %s' % (result, expected))
print ' ==> %s' % result
#### Initialize some data
temp = [70, 70, 71, 74, 76, 76, 72, 76, 77, 77, 77, 78,
78, 79, 79, 79, 78, 80, 82, 83, 83, 81, 84, 83]
data = temp
def f(x): return 2 * x
votes = {'Arnie': 48, 'Gray': 45, 'Tom': 13, 'Cruz': 32, 'Peter': 3}
candidates = votes.keys()
def test():
print 'temp = ', temp
print 'data = temp'
print 'votes = ', votes
print 'candidates = ', candidates
print
#### Test some accumulation displays
test1("[Max: temp[hour] for hour in range(24)]",
max([temp[hour] for hour in range(24)]))
test1("[Min: temp[hour] for hour in range(24)]",
min([temp[hour] for hour in range(24)]))
test1("[Sum: x*x for x in data]",
sum([x*x for x in data]))
test1("[Mean: f(x) for x in data]",
sum([f(x) for x in data])/len(data))
test1("[Median: f(x) for x in data]",
156.0)
test1("[Mode: f(x) for x in data]",
166)
test1("[Argmax: votes[c] for c in candidates]",
'Arnie')
test1("[Argmin: votes[c] for c in candidates]",
'Peter')
test1("[Some: temp[hour] > 75 for hour in range(24)]",
          len([hour for hour in range(24) if temp[hour] > 75]) > 0)
test1("[Every: temp[hour] > 75 for hour in range(24)]",
len([h for h in range(24) if temp[h] > 75]) == 24)
test1("[Top(10): temp[hour] for hour in range(24)]",
[84, 83, 83, 83, 82, 81, 80, 79, 79, 79])
test1("[Join(', '): votes[c] for c in candidates]",
', '.join([str(votes[c]) for c in candidates]))
test1("[SortBy: abs(x) for x in (-2, -4, 3, 1)]",
[1, -2, 3, -4])
test1("[SortBy(reverse=True): abs(x) for x in (-2, -4, 3, 1)]",
[-4, 3, -2, 1])
if __name__ == "__main__":
test()