"""
Code to support http://norvig.com/mayzner.html
Read files in the Google Books ngram format, and convert them to a simpler format.
The original format looks like this:
word \t year \t word_count \t book_count
word_POS \t year \t word_count \t book_count
for example,
accreted_VERB 1846 7 4
accreted_VERB 1847 1 1
accreted_VERB 1848 1 1
The function 'read_year_file' will convert a file of this form into a dict of
{WORD: count} pairs, where the WORD is uppercased, and the count is the total
over all years (you have the option to specify a starting year) and all
capitalizations. Then 'read_dict' and 'write_dict' convert between a dict and
an external file format that looks like this:
ACCRETED 9
"""
from __future__ import division
from collections import Counter, defaultdict
#### Read files in Books-Ngram format; convert to a dict
def read_year_file(filename, dic=None):
    """Read a file of 'word \t year \t word_count \t book_count' lines and
    convert it to a dict {WORD: totalcount}.

    Words are uppercased; a part-of-speech suffix such as '_VERB' is
    stripped; only all-alphabetic words are counted.  Pass an existing
    dict `dic` to accumulate counts across several files.
    """
    if dic is None:
        dic = {}
    # `open` (rather than the Python-2-only builtin `file`) plus a context
    # manager guarantees the handle is closed even if a line is malformed.
    with open(filename) as f:
        for line in f:
            word, year, c1, c2 = line.split('\t')
            if '_' in word:
                word = word[:word.index('_')]   # strip POS tag, e.g. 'run_VERB'
            if word.isalpha():
                word = word.upper()
                dic[word] = dic.get(word, 0) + int(c1)
    return dic
#### Read and write files of the form 'WORD \t count \n'
def write_dict(dic, filename):
    """Write a {word: count} dict as sorted 'word \t count' lines to filename.

    Uses a context manager so the file is closed (and flushed) even on error;
    the original `return out.close()` just returned None.
    """
    with open(filename, 'w') as out:
        for key in sorted(dic):
            out.write('%s\t%s\n' % (key, dic[key]))
def read_dict(filename, sep='\t'):
    """Read 'word<sep>count' lines from filename and return {word: int(count)}.

    Uses a context manager so the file handle is closed promptly (the
    original `file(...)` left it open until garbage collection).
    """
    with open(filename) as f:
        pairs = (line.split(sep) for line in f)
        # int() tolerates the trailing newline left on the count field.
        return {word: int(count) for (word, count) in pairs}
#### Convert a bunch of year files into dict file format.
def convert_files(filenames, mincount=1e5):
    """Convert each Books-Ngram year file into a 'WORD \t count' dict file,
    keeping only words whose total count is at least mincount, and print
    progress reports along the way.  (Python 2 only: print statements,
    dict.itervalues.)"""
    def report(filename, D, adj):
        # Progress line: vocabulary size, token total, and a timestamp.
        import time
        N = len(D)
        W = sum(v for v in D.itervalues())  # NOTE(review): Python-2-only itervalues
        # NOTE(review): W (token total) is printed under 'words' and N
        # (vocabulary size) under 'tokens' -- the labels appear swapped.
        print '%s: %s %s words (%s tokens) at %s' % (
            filename, adj, format(W, ',d'), format(N, ',d'),
            time.strftime("%H:%M:%S", time.gmtime()))
    for f in filenames:
        report(f, {}, 'starting')
        D = read_year_file(f)
        report(f, D, 'total')
        # Iterate over a copied key list so entries can be deleted safely.
        for key in list(D):
            if D[key] < mincount:
                del D[key]
        # Output name uses the LAST character of the input filename
        # (inputs are expected to be named like 'book?' -- see glob below).
        write_dict(D, 'WORD-' + f[-1].upper())
        report(f, D, 'popular')
def load(filename='top-words.txt'):
    """Populate the module-level globals from a 'word \t count' file:
    D = the {word: count} dict, W = number of distinct words,
    M = total number of word tokens."""
    global D, W, M
    D = read_dict(filename)
    W, M = len(D), sum(D.values())
#### Compute letter counts and save as HTML files.
def histogram(items):
    "Sum (key, val) pairs into a Counter mapping each key to its total val."
    totals = Counter()
    for key, val in items:
        totals[key] += val
    return totals
def end(name):
    "Return the closing-tag name for `name`, e.g. 'b' -> '/b'."
    return '/%s' % name
# NOTE(review): `keywords` is not defined anywhere in this file -- presumably
# it formatted kwds as ' key="value"' HTML attribute pairs; confirm against
# the upstream source before using this function.
def tag(name, **kwds): return '<' + name + keywords(kwds) + '>'
# NOTE(review): the HTML markup in this function's string literals appears to
# have been stripped during extraction -- the literal below is a broken
# multi-line string (syntax error). It originally built a <tr>...</tr> table
# row from `cells`; restore the markup from the upstream source.
def row(cells, **kwds):
    return '
' + ''
def ngram_tables(dic, N, pos=[0, 1, 2, 3, 4, -5, -4, -3, -2, -1]):
    """Return three dicts of letter N-grams of length N: counts, counts1, counts2.
    counts is a dict of {'AB': 123} that counts how often 'AB' occurs.
    counts1[i] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i.
    counts2[i][j] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i."""
    # NOTE(review): this function appears truncated -- it sets up the counters
    # but never fills or returns them; it reads the global D rather than its
    # `dic` parameter; and it uses a mutable default argument (pos).
    L = len(max(D, key=len))   # length of the longest word in the global dict D
    counts = Counter()
    counts1 = [Counter() for _ in range(L)]
    counts2 = [[Counter() for i in range(L)]]
def counter(pairs):
    "Build a Counter from an iterable of (value, count) pairs."
    totals = Counter()
    for value, count in pairs:
        totals[value] += count
    return totals
def ngrams(word, N):
    "Return the list of all contiguous length-N substrings of word, in order."
    return [word[start:start + N] for start in range(len(word) - N + 1)]
import glob
#convert_files(glob.glob('book?'))
#DB = [[letter_counts() for length in range(length)] for length in range(maxlen)]
## Unused ???
def letter_counts(wc):
    """From the word_counts dictionary wc, create a dictionary of
    {(s, i, L): count} where s is a 1- or 2-letter gram, i is its starting
    position, and L is the length of the word in which it appears
    (the pieces of each word come from pieces()).

    Uses .items() rather than the Python-2-only .iteritems(), so this
    runs under both Python 2 and Python 3.
    """
    result = defaultdict(int)
    for (word, count) in wc.items():
        for p in pieces(word):
            result[p] += count
    return result
def pieces(word):
    "Yield the 1- and 2-letter grams of word in (s, i, L) format."
    length = len(word)
    for start in range(length):
        yield word[start], start, length
        if start + 1 < length:
            yield word[start:start + 2], start, length
def getcount(counts, s, pos, length):
    """The count for letter sequence s (one or two letters) starting at
    position pos of words of length length. If pos or length is the builtin
    `all` (abused here as a wildcard sentinel), sum over that dimension."""
    if length == all:
        # NOTE(review): all_lengths is not defined anywhere in this file --
        # presumably the collection of word lengths seen; confirm upstream.
        return sum(getcount(counts, s, pos, L) for L in all_lengths)
    elif pos == all:
        return sum(getcount(counts, s, i, length) for i in range(length))
    else:
        return counts[s, pos, length]
# Python 2 print statements bracketing the (commented-out) letter-count
# computation.  NOTE(review): word_counts is not defined in this file.
print 'start'
#wc = word_counts('count_100K.txt')
#counts = letter_counts(wc)
print 'end'
def test():
    # NOTE(review): this function appears truncated -- it builds a sample
    # word-count dict but performs no assertions and returns None.
    D = {'the': 100, 'of': 70, 'and': 60, 'to': 50, 'a': 40}
def num(ch):
    "Map a letter of either case to its alphabet position: 'a'/'A' -> 0 ... 'z'/'Z' -> 25."
    lowercase = 'abcdefghijklmnopqrstuvwxyz'
    return lowercase.index(ch.lower())
def stats(D, NS = (1, 2, 3, 4, 5, 6)):
    """Print a running table: after every 5000 words of D (most frequent
    first), show how many distinct n-grams have been seen for each n in NS,
    with the percentage of all 26**n possible n-grams.  (Python 2 only.)"""
    counts = {n: Counter() for n in NS}
    print 'words ' + ' '.join(' %d-grams ' % n for n in NS)
    for (i, word) in enumerate(sortedby(D), 1):
        for n in NS:
            for ng in ngrams(word, n):
                counts[n][ng] += 1
        if i % 5000 == 0 or i == len(D):
            # Python 2 print with trailing comma: stay on the same line.
            print "%4dK" % (i/1000),
            for n in NS:
                c = len(counts[n])
                # c as a percentage of all possible n-grams of length n
                field = "%5d (%d%%)" % (c, int(round(c*100/(26**n))))
                print '%12s' % field,
            print
# English letters in descending frequency order, and the same set alphabetized.
letters = 'ETAOINSRHLDCUMFPGWYBVKXJQZ'
alphabet = ''.join(sorted(letters))
from itertools import cycle, izip  # NOTE(review): izip is Python 2 only (zip on Py3)
colors = 'ygobp'  # color cycle for the letter bar charts
def bar(text, color, count, N, pixels, height=16):
    """One colored bar (HTML) of width proportional to count/N.
    NOTE(review): the HTML markup inside the string literal appears to have
    been stripped during extraction -- the literal below is broken."""
    width = int(round(pixels * count / N))
    if width < 2: width = 3   # enforce a minimum visible bar width
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    return '
%s' % (
    title, color, height, width, -width+2, text) # -int(width/2+5)
def letter_bar(LC, N=None, factor='', pixels=700):
    """Concatenate one colored bar per letter (in frequency order `letters`),
    each scaled by its count in LC relative to total N."""
    if N is None:
        N = sum(LC.values())
    #divisor = {'':1., 'K':1e3, 'M':1e6, 'B':1e9}[factor]
    segments = [bar(ltr.lower(), color, LC[ltr], N, pixels)
                for (ltr, color) in izip(letters, cycle(colors))]
    return ''.join(segments)
def singleton(x):
    "Wrap x in a one-element list."
    return [x]
# Character positions examined: 0-6 from the start, -7..-1 from the end of a word.
positions = [0, 1, 2, 3, 4, 5, 6, -7, -6, -5, -4, -3, -2, -1]
def substr(word, pos, length):
    """Return the substring of word with the given length that starts at
    position pos (pos >= 0), or ends at position pos counted from the end
    (pos < 0); return None when it does not fit inside the word."""
    n = len(word)
    if pos >= 0:
        return word[pos:pos + length] if pos + length <= n else None
    stop = n + pos + 1
    return word[stop - length:stop] if length - 1 - pos <= n else None
def lettercount(D, pos):
    """Histogram the letters at position pos over the words of D (weighted by
    count) and render them as a labelled bar row.  (Python 2 only: print
    statement.)  NOTE(review): the HTML in the returned literal appears
    stripped/broken."""
    LC = histogram((substr(w, pos, 1), D[w]) for w in D)
    del LC[None]   # substr yields None when pos falls outside the word
    print LC
    # Label: 1-based for positive positions, negative positions count from the end.
    pos_name = (str(pos)+'+' if isinstance(pos, tuple) else
                pos if pos < 0 else
                pos+1)
    return '\n
\n%-3s %s' % (pos_name, letter_bar(LC))
def ngramcount(D, n=2):
    "Count letter n-grams over all words of D, weighting each by its word count."
    pairs = ((gram, D[word]) for word in D for gram in ngrams(word, n))
    return histogram(pairs)
def twograms(D2):
    """Render the 26x26 table of bigram counts in D2 (rows/columns over the
    alphabet).  NOTE(review): the HTML table markup in the string literals
    appears to have been stripped -- the final literal is broken."""
    N = sum(D2.values())
    header = ''
    rows = [tr([cell(A+B, D2, N) for A in alphabet]) for B in alphabet]
    return '\n'.join([header] + rows + ['
'])
# NOTE(review): dead code -- `cell` is immediately redefined below, so this
# first version is never used.  The HTML markup in its string literal also
# appears stripped (1 format specifier vs. 5 arguments).
def cell(text, D2, N, height=16, maxwidth=25, scale=27):
    count = D2.get(text, 0)
    width = int(round(maxwidth * count * scale * 1. / N))
    if width < 1: width = 1
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    return ' %s' % (
        title, height, width, -width+2, text)
def cell(text, D2, N, height=16, maxwidth=25, scale=27):
    """One table cell for bigram `text`: a bar whose width is proportional to
    its count in D2 relative to total N.  NOTE(review): the HTML markup in the
    string literal appears stripped (1 specifier vs. 4 arguments)."""
    count = D2.get(text, 0)
    width = int(round(maxwidth * count * scale * 1. / N))
    if width < 1: width = 1   # enforce a minimum visible width
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    return ' | %s' % (
        title, height, width, text)
# NOTE(review): originally wrapped `cells` in an HTML <tr> row; the markup in
# the literal appears stripped, leaving a broken multi-line string.
def tr(cells):
    return ' |
' + ''.join(cells)
def comma(n):
    "Format n with thousands separators, e.g. 1234567 -> '1,234,567'."
    return format(n, ',')
def ngram_stats(D, n, k=5):
    """One summary row for the n-grams of D: distinct n-gram count, token
    total, csv/html file names, and the top k n-grams.  NOTE(review): the
    HTML table markup in the literal appears stripped (6 specifiers vs. 8
    arguments)."""
    DN = ngramcount(D, n)
    topk = ', '.join(sortedby(DN)[:k])
    return '
%d-grams | %s | %s | counts-%d.csv | counts-%d.html | %s' % (
    n, comma(len(DN)), comma(sum(DN.values())), n, n, n, n, topk)
#### Tables
def sortedby(D):
    "Return D's keys sorted by descending count (ties keep original order)."
    return sorted(D, key=D.get, reverse=True)
ANY = '*'                    # wildcard marker used in column keys
wordlengths = range(1, 10)   # word lengths 1..9 covered by the CSV tables
def col(*args):
    "Package the arguments as a column-key tuple."
    return args
def columns(n, wordlengths=wordlengths):
    """Enumerate the column keys for the n-gram CSV: overall, per word length,
    per (length, start..end) slot, per start position over any length, and
    per end-anchored position.  See colname() for the rendered form."""
    lengths = [k for k in wordlengths if k >= n]
    cols = [col(ANY, ANY)]
    cols += [col(k, ANY) for k in lengths]
    cols += [col(k, start, start + n - 1)
             for k in lengths for start in range(1, 2 + k - n)]
    cols += [col(ANY, start, start + n - 1) for start in wordlengths]
    cols += [col(ANY, -k, -k + n - 1) for k in reversed(lengths) if -k + n - 1 < 0]
    return cols
def colname(col):
    "Render a column key: ('k', 'any') -> 'k/any'; (k, i, j) -> 'k/i:j'."
    if len(col) == 2:
        return '%s/%s' % col
    return '%s/%d:%d' % col
def csvline(first, rest):
    "Tab-join `first` with the string form of every item in `rest`."
    fields = [first]
    fields.extend(str(item) for item in rest)
    return '\t'.join(fields)
def makecsv(n, D=D):
    """Write ngrams<n>.csv: one row per n-gram, one column per
    (word-length, position) key (see columns/colname), with counts weighted
    by the word counts in D.  Returns the nested count dict.
    (Python 2 only: `file` builtin and `print >>`.)
    NOTE(review): the default argument D=D requires the global D to exist
    when this def is executed -- it is a NameError until load() has run.
    """
    out = file('ngrams%d.csv' % n, 'w')
    cols = columns(n)
    Dng = defaultdict(lambda: defaultdict(int))
    for w in D:
        # start is 1-based: position of the n-gram within the word.
        for (start, ng) in enumerate(ngrams(w, n), 1):
            entry = Dng[ng]
            N = D[w]
            wlen = len(w)
            entry[ANY, ANY] += N
            entry[wlen, ANY] += N
            if start <= 9:
                entry[wlen, start, start+n-1] += N
                entry[ANY, start, start+n-1] += N
            # Position counted from the end of the word (1-based).
            from_end = wlen-start+1
            if from_end <= 9:
                entry[ANY, -from_end, -from_end+n-1] += N
            # enumerate ngrams from word and increment counts for each one
    print >> out, csvline('%d-gram' % n, map(colname, cols))
    # Rows ordered by descending overall count of each n-gram.
    for ng in sorted(Dng, key=lambda ng: -Dng[ng][(ANY, ANY)]):
        print >> out, csvline(ng, [Dng[ng].get(col, 0) for col in cols])
    out.close()
    return Dng
### Tests
"""
>>> for w in words:
print '%-6s %6.2f B (%4.2f%%) ' % (w.lower(), D[w]/1e9, D[w]*100./N, int(round(D[w]*4000./N)))
...
the 53.10 B (7.14%)
of 30.97 B (4.16%)
and 22.63 B (3.04%)
to 19.35 B (2.60%)
in 16.89 B (2.27%)
a 15.31 B (2.06%)
is 8.38 B (1.13%)
that 8.00 B (1.08%)
for 6.55 B (0.88%)
it 5.74 B (0.77%)
as 5.70 B (0.77%)
was 5.50 B (0.74%)
with 5.18 B (0.70%)
be 4.82 B (0.65%)
by 4.70 B (0.63%)
on 4.59 B (0.62%)
not 4.52 B (0.61%)
he 4.11 B (0.55%)
i 3.88 B (0.52%)
this 3.83 B (0.51%)
are 3.70 B (0.50%)
or 3.67 B (0.49%)
his 3.61 B (0.49%)
from 3.47 B (0.47%)
at 3.41 B (0.46%)
which 3.14 B (0.42%)
but 2.79 B (0.38%)
have 2.78 B (0.37%)
an 2.73 B (0.37%)
had 2.62 B (0.35%)
they 2.46 B (0.33%)
you 2.34 B (0.31%)
were 2.27 B (0.31%)
their 2.15 B (0.29%)
one 2.15 B (0.29%)
all 2.06 B (0.28%)
we 2.06 B (0.28%)
can 1.67 B (0.22%)
her 1.63 B (0.22%)
has 1.63 B (0.22%)
there 1.62 B (0.22%)
been 1.62 B (0.22%)
if 1.56 B (0.21%)
more 1.55 B (0.21%)
when 1.52 B (0.20%)
will 1.49 B (0.20%)
would 1.47 B (0.20%)
who 1.46 B (0.20%)
so 1.45 B (0.19%)
no 1.40 B (0.19%)
>>> for n in sorted(H):
print '%2d %9.2f M (%6.3f%%) %d' % (n, H[n]/1e6, H[n]*100./NN, H[n]*3000./NN, n)
...
1 22301.22 M ( 2.998%) 1
2 131293.85 M (17.651%) 2
3 152568.38 M (20.511%) 3
4 109988.33 M (14.787%) 4
5 79589.32 M (10.700%) 5
6 62391.21 M ( 8.388%) 6
7 59052.66 M ( 7.939%) 7
8 44207.29 M ( 5.943%) 8
9 33006.93 M ( 4.437%) 9
10 22883.84 M ( 3.076%) 10
11 13098.06 M ( 1.761%) 11
12 7124.15 M ( 0.958%) 12
13 3850.58 M ( 0.518%) 13
14 1653.08 M ( 0.222%) 14
15 565.24 M ( 0.076%) 15
16 151.22 M ( 0.020%) 16
17 72.81 M ( 0.010%) 17
18 28.62 M ( 0.004%) 18
19 8.51 M ( 0.001%) 19
20 6.35 M ( 0.001%) 20
21 0.13 M ( 0.000%) 21
22 0.81 M ( 0.000%) 22
23 0.32 M ( 0.000%) 23
>>> NL = sum(LC.values())
>>> for L in sorted(LC, key=lambda L: -LC[L]):
print '%s %8.1f B (%5.2f%%) ' % (L, LC[L]/1e9, LC[L]*100./NL, LC[L]*3000./NL)
...
E 445.2 B (12.49%)
T 330.5 B ( 9.28%)
A 286.5 B ( 8.04%)
O 272.3 B ( 7.64%)
I 269.7 B ( 7.57%)
N 257.8 B ( 7.23%)
S 232.1 B ( 6.51%)
R 223.8 B ( 6.28%)
H 180.1 B ( 5.05%)
L 145.0 B ( 4.07%)
D 136.0 B ( 3.82%)
C 119.2 B ( 3.34%)
U 97.3 B ( 2.73%)
M 89.5 B ( 2.51%)
F 85.6 B ( 2.40%)
P 76.1 B ( 2.14%)
G 66.6 B ( 1.87%)
W 59.7 B ( 1.68%)
Y 59.3 B ( 1.66%)
B 52.9 B ( 1.48%)
V 37.5 B ( 1.05%)
K 19.3 B ( 0.54%)
X 8.4 B ( 0.23%)
J 5.7 B ( 0.16%)
Q 4.3 B ( 0.12%)
Z 3.2 B ( 0.09%)
>>> D2 = ngramcount(D, 2)
>>> for ng in sorted(D2, key=lambda L: -D2[L])[:50]: print '%s %8.1f B (%5.2f%%) ' % (ng, D2[ng]/1e9, D2[ng]*100./N2, D2[ng]*15000./N2)
def doit(k=25):
counts = [sortedby(ngramcount(D, n))[:k] for n in range(2, 10)]
for i in range(k):
print (' '.join(count[i] for count in counts)).lower()
"""
# (stray '|' removed -- extraction artifact after the closing docstring)