""" Code to support http://norvig.com/mayzner.html Read files in the Google Books ngram format, and convert them to a simpler format. The original format looks like this: word \t year \t word_count \t book_count word_POS \t year \t word_count \t book_count for example, accreted_VERB 1846 7 4 accreted_VERB 1847 1 1 accreted_VERB 1848 1 1 The function 'read_year_file' will convert a file of this form into a dict of {WORD: count} pairs, where the WORD is uppercased, and the count is the total over all years (you have the option to specify a starting year) and all capitalizations. Then 'read_dict' and 'write_dict' convert between a dict and an external file format that looks like this: ACCRETED 9 """ from __future__ import division from collections import Counter, defaultdict #### Read files in Books-Ngram format; convert to a dict def read_year_file(filename, dic=None): """Read a file of 'word year word_count book_count' lines and convert to a dict {WORD: totalcount}. Uppercase all words, and only include all-alphabetic words.""" if dic is None: dic = {} for line in file(filename): word, year, c1, c2 = line.split('\t') if '_' in word: word = word[:word.index('_')] if word.isalpha(): word = word.upper() dic[word] = dic.get(word, 0) + int(c1) return dic #### Read and write files of the form 'WORD \t count \n' def write_dict(dic, filename): "Write a {word:count} dict as 'word \t count' lines in filename." out = file(filename, 'w') for key in sorted(dic): out.write('%s\t%s\n' % (key, dic[key])) return out.close() def read_dict(filename, sep='\t'): "Read 'word \t count' lines from file and make them into a dict of {word:count}." pairs = (line.split(sep) for line in file(filename)) return {word: int(count) for (word, count) in pairs} #### Convert a bunch of year files into dict file format. 
def convert_files(filenames, mincount=1e5):
    # Convert each Books-Ngram year file into a 'WORD \t count' dict file,
    # keeping only words with at least `mincount` total occurrences.
    # Python 2 only (print statements, dict.itervalues).
    def report(filename, D, adj):
        # Progress line: dict size and total token count with a timestamp.
        import time
        N = len(D)                          # number of distinct words
        W = sum(v for v in D.itervalues())  # total token count (Py2 itervalues)
        # NOTE(review): W (total tokens) is printed in the "words" slot and N
        # (distinct words) in the "tokens" slot -- the labels look swapped;
        # confirm intended output.
        print '%s: %s %s words (%s tokens) at %s' % (
            filename, adj, format(W, ',d'), format(N, ',d'),
            time.strftime("%H:%M:%S", time.gmtime()))
    for f in filenames:
        report(f, {}, 'starting')
        D = read_year_file(f)
        report(f, D, 'total')
        for key in list(D):  # list() copy: we delete from D while iterating
            if D[key] < mincount:
                del D[key]
        # Output filename is keyed by the input's LAST character, e.g. a file
        # named 'booka' is written as 'WORD-A'.
        write_dict(D, 'WORD-' + f[-1].upper())
        report(f, D, 'popular')

def load(filename='top-words.txt'):
    "Load file of 'word \t count' lines into D (a dict), W (length of D) and M (total number of words)."
    # Populates module-level globals used by later analysis functions.
    global D, W, M
    D = read_dict(filename)
    W = len(D)
    M = sum(D.values())

#### Compute letter counts and save as HTML files.

def histogram(items):
    "Return a Counter of the number of times each key occurs in (key, val) pairs."
    C = Counter()
    for (key, val) in items:
        C[key] += val
    return C

def end(name):
    # Name of a closing HTML tag for `name` (without angle brackets).
    return '/' + name

def tag(name, **kwds):
    # NOTE(review): `keywords` is not defined anywhere in this file; calling
    # tag() would raise NameError.  A helper that renders kwds as HTML
    # attributes appears to be missing -- confirm against the original source.
    return '<' + name + keywords(kwds) + '>'

def row(cells, **kwds):
    # NOTE(review): returns the empty string; the HTML table-row markup seems
    # to have been stripped from this file.  `cells` and `kwds` are unused.
    return '' + ''

def ngram_tables(dic, N, pos=[0, 1, 2, 3, 4, -5, -4, -3, -2, -1]):
    """Return three dicts of letter N-grams of length N: counts, counts1, counts2.
    counts is a dict of {'AB': 123} that counts how often 'AB' occurs.
    counts1[i] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i.
    counts2[i][j] is a dict of {'AB': 123} that counts how often 'AB' occurs at position i."""
    # NOTE(review): this function looks unfinished -- it reads the global D
    # (not the `dic` argument), never uses N or pos, builds the three count
    # structures but never fills or returns them.  Also note the mutable
    # default argument `pos`.
    L = len(max(D, key=len))
    counts = Counter()
    counts1 = [Counter() for _ in range(L)]
    counts2 = [[Counter() for i in range(L)]]

def counter(pairs):
    "Make a Counter from an iterable of (value, count) pairs."
    c = Counter()
    for (value, count) in pairs:
        c[value] += count
    return c

def ngrams(word, N):
    # All contiguous substrings of word with length exactly N, left to right.
    return [word[i:i+N] for i in range(len(word)+1-N)]

import glob

#convert_files(glob.glob('book?'))

#DB = [[letter_counts() for length in range(length)] for length in range(maxlen)]

## Unused ???
def letter_counts(wc):
    """From word_counts dictionary wc, Create a dictionary of {(s, i, L): count}
    where s is a letter n-gram, i is the starting position, and L is the length
    of the word in which it appears."""
    # Python 2 only (dict.iteritems).
    result = defaultdict(int)
    for (word, count) in wc.iteritems():
        for p in pieces(word):
            result[p] += count
    return result

def pieces(word):
    "Yield the 1- and 2-letter grams in (s, i, L) format."
    L = len(word)
    for i in range(L):
        yield (word[i], i, L)
        if i+1 < L:
            yield (word[i:i+2], i, L)

def getcount(counts, s, pos, length):
    """The count for letter sequence s (one or two letters) starting at position i
    of words of length length. If any argument is all, sum them up."""
    # The builtin `all` is (ab)used as a sentinel meaning "sum over every value".
    if length == all:
        # NOTE(review): `all_lengths` is not defined anywhere in this file;
        # this branch would raise NameError -- confirm where it was meant to
        # come from.
        return sum(getcount(counts, s, pos, L) for L in all_lengths)
    elif pos == all:
        return sum(getcount(counts, s, i, length) for i in range(length))
    else:
        return counts[s, pos, length]

# Module-level scratch code (Python 2 print statements); the real work between
# 'start' and 'end' is commented out.
print 'start'
#wc = word_counts('count_100K.txt')
#counts = letter_counts(wc)
print 'end'

def test():
    # NOTE(review): looks like an unfinished test stub -- D is assigned but
    # never used, and nothing is asserted.
    D = {'the': 100, 'of': 70, 'and': 60, 'to': 50, 'a': 40}

def num(ch):
    "Translate 'a' or 'A' to 0, ... 'z' or 'Z' to 25."
    return 'abcdefghijklmnopqrstuvwxyz'.index(ch.lower())

def stats(D, NS = (1, 2, 3, 4, 5, 6)):
    # Print, every 5000 words (most frequent first), how many distinct n-grams
    # of each size in NS have been seen, as a count and as a percentage of the
    # 26**n possible n-grams.  Python 2 only (print statements with trailing
    # commas for same-line output).
    counts = {n: Counter() for n in NS}
    print 'words ' + ' '.join(' %d-grams ' % n for n in NS)
    for (i, word) in enumerate(sortedby(D), 1):
        for n in NS:
            for ng in ngrams(word, n):
                counts[n][ng] += 1
        if i % 5000 == 0 or i == len(D):
            print "%4dK" % (i/1000),
            for n in NS:
                c = len(counts[n])
                field = "%5d (%d%%)" % (c, int(round(c*100/(26**n))))
                print '%12s' % field,
            print

# English letters in decreasing frequency order, and the same set sorted A-Z.
letters = 'ETAOINSRHLDCUMFPGWYBVKXJQZ'
alphabet = ''.join(sorted(letters))

from itertools import cycle, izip  # izip is Python 2 only

# Bar colors, cycled across the letter bars.
colors = 'ygobp'

def bar(text, color, count, N, pixels, height=16):
    # One horizontal bar scaled to count/N of `pixels` wide.
    width = int(round(pixels * count / N))
    if width < 2: width = 3   # enforce a minimum visible width
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    # NOTE(review): the HTML/SVG markup appears stripped from this format
    # string -- it has one %s slot but six arguments, so calling bar() would
    # raise TypeError; confirm against the original source.
    return '%s' % (
        title, color, height, width, -width+2, text)  # -int(width/2+5)

def letter_bar(LC, N=None, factor='', pixels=700):
    # A bar per letter (frequency order), colors cycling through `colors`.
    # `factor` is currently unused (see commented-out divisor table).
    if N is None: N = sum(LC.values())
    #divisor = {'':1., 'K':1e3, 'M':1e6, 'B':1e9}[factor]
    return ''.join(
        bar(L.lower(), color, LC[L], N, pixels)
        for (L, color) in izip(letters, cycle(colors)))

def singleton(x):
    # Wrap x in a one-element list.
    return [x]

# Word positions of interest: first seven from the start, last seven from the end.
positions = [0, 1, 2, 3, 4, 5, 6, -7, -6, -5, -4, -3, -2, -1]

def substr(word, pos, length):
    """Return the substr of word of given length starting/ending at pos; or None."""
    # Non-negative pos counts from the start; negative pos means the substring
    # ENDS at that position from the end of the word.
    W = len(word)
    if pos >= 0 and pos+length <= W:
        return word[pos:pos+length]
    elif pos < 0 and abs(pos)+length-1 <= W:
        return word[W+pos+1-length:W+pos+1]
    else:
        return None

def lettercount(D, pos):
    # Counter of single letters at position `pos` over all words in D,
    # weighted by word count; words too short for `pos` map to None and are
    # dropped.  Python 2 print statement.
    LC = histogram((substr(w, pos, 1), D[w]) for w in D)
    del LC[None]
    print LC
    # Display 1-based positions for non-negative pos; '+' suffix for tuples.
    pos_name = (str(pos)+'+' if isinstance(pos, tuple) else pos if pos < 0 else pos+1)
    # NOTE(review): the returned string looks like it once contained HTML
    # table markup that has been stripped from this source -- confirm.
    return '\n\n%-3s %s' % (pos_name, letter_bar(LC))

def ngramcount(D, n=2):
    # Counter of all n-grams over words in D, weighted by word count.
    return histogram((ng, D[w]) for w in D for ng in ngrams(w, n))

def twograms(D2):
    # 26x26 grid of cells for all two-letter combinations AB; rows are keyed
    # by the SECOND letter.  NOTE(review): `header` is empty and the joined
    # markup appears stripped from this source -- confirm.
    N = sum(D2.values())
    header = ''
    rows = [tr([cell(A+B, D2, N) for A in alphabet]) for B in alphabet]
    return '\n'.join([header] + rows + [''])

def cell(text, D2, N, height=16, maxwidth=25, scale=27):
    # NOTE(review): this definition is immediately shadowed by the second
    # `cell` below (only the later one takes effect); also the format string
    # has one %s slot but six arguments (HTML markup appears stripped).
    count = D2.get(text, 0)
    width = int(round(maxwidth * count * scale * 1. / N))
    if width < 1: width = 1
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    return '%s' % (
        title, height, width, -width+2, text)

def cell(text, D2, N, height=16, maxwidth=25, scale=27):
    # One grid cell for `text`, width proportional to its share of N.
    # NOTE(review): one %s slot but five arguments -- markup stripped; would
    # raise TypeError if called.
    count = D2.get(text, 0)
    width = int(round(maxwidth * count * scale * 1. / N))
    if width < 1: width = 1
    title = '{}: {:.3f}%; {:,}'.format(text, count*100./N, count)
    return '%s' % (
        title, height, width, text)

def tr(cells):
    # NOTE(review): row markup appears stripped -- as written this just
    # concatenates the cells.
    return '' + ''.join(cells)

def comma(n):
    "Format n with thousands separators, e.g. 1234567 -> '1,234,567'."
    return '{:,}'.format(n)

def ngram_stats(D, n, k=5):
    # Summary line for n-grams of D: distinct count, total count, links to the
    # csv/html files, and the top k n-grams.
    DN = ngramcount(D, n)
    topk = ', '.join(sortedby(DN)[:k])
    # NOTE(review): six format slots but eight arguments -- HTML table markup
    # appears stripped from this string; would raise TypeError if called.
    return '%d-grams%s%scounts-%d.csvcounts-%d.html%s' % (
        n, comma(len(DN)), comma(sum(DN.values())), n, n, n, n, topk)

#### Tables

def sortedby(D):
    # Keys of D sorted by decreasing count.
    return sorted(D, key=lambda x: -D[x])

ANY = '*'   # wildcard marker used in column keys: "any length" / "any position"

wordlengths = range(1, 10)  # word lengths considered in the tables (1..9)

def col(*args):
    # A column key is just the tuple of its arguments.
    return args

def columns(n, wordlengths=wordlengths):
    # All column keys for n-gram tables: (length|ANY, position-range|ANY)
    # combinations, including negative (from-the-end) positions.
    lengths = [k for k in wordlengths if k >= n]
    return ([col(ANY, ANY)]
            + [col(k, ANY) for k in lengths]
            + [col(k, start, start+n-1) for k in lengths for start in range(1, 2+k-n)]
            + [col(ANY, start, start+n-1) for start in wordlengths]
            + [col(ANY, -k, -k+n-1) for k in reversed(lengths) if -k+n-1 < 0])

def colname(col):
    # Render a column key, e.g. ('*', '*') -> '*/*', (5, 1, 2) -> '5/1:2'.
    fmt = '%s/%s' if (len(col) == 2) else '%s/%d:%d'
    return fmt % col

def csvline(first, rest):
    # Tab-separated line (Py2: map returns a list).
    return '\t'.join([first] + map(str, rest))

def makecsv(n, D=D):
    # Write 'ngrams<n>.csv' with one row per n-gram and one column per
    # (length, position) combination, sorted by total count.  Python 2 only
    # (file(), print >> out).  NOTE(review): the default D=D binds the global
    # D at definition time, so load() must have run before this module line.
    out = file('ngrams%d.csv' % n, 'w')
    cols = columns(n)
    Dng = defaultdict(lambda: defaultdict(int))
    for w in D:
        # enumerate ngrams from word and increment counts for each one
        for (start, ng) in enumerate(ngrams(w, n), 1):
            entry = Dng[ng]
            N = D[w]
            wlen = len(w)
            entry[ANY, ANY] += N
            entry[wlen, ANY] += N
            if start <= 9:   # positions beyond 9 fall outside the table
                entry[wlen, start, start+n-1] += N
                entry[ANY, start, start+n-1] += N
            from_end = wlen-start+1
            if from_end <= 9:
                entry[ANY, -from_end, -from_end+n-1] += N
    print >> out, csvline('%d-gram' % n, map(colname, cols))
    for ng in sorted(Dng, key=lambda ng: -Dng[ng][(ANY, ANY)]):
        print >> out, csvline(ng, [Dng[ng].get(col, 0) for col in cols])
    out.close()
    return Dng

### Tests

# Transcript of interactive exploration, preserved as a module-level string.
"""
>>> for w in words: print '%-6s %6.2f B (%4.2f%%) ' % (w.lower(), D[w]/1e9, D[w]*100./N, int(round(D[w]*4000./N)))
...
the     53.10 B (7.14%)
of      30.97 B (4.16%)
and     22.63 B (3.04%)
to      19.35 B (2.60%)
in      16.89 B (2.27%)
a       15.31 B (2.06%)
is       8.38 B (1.13%)
that     8.00 B (1.08%)
for      6.55 B (0.88%)
it       5.74 B (0.77%)
as       5.70 B (0.77%)
was      5.50 B (0.74%)
with     5.18 B (0.70%)
be       4.82 B (0.65%)
by       4.70 B (0.63%)
on       4.59 B (0.62%)
not      4.52 B (0.61%)
he       4.11 B (0.55%)
i        3.88 B (0.52%)
this     3.83 B (0.51%)
are      3.70 B (0.50%)
or       3.67 B (0.49%)
his      3.61 B (0.49%)
from     3.47 B (0.47%)
at       3.41 B (0.46%)
which    3.14 B (0.42%)
but      2.79 B (0.38%)
have     2.78 B (0.37%)
an       2.73 B (0.37%)
had      2.62 B (0.35%)
they     2.46 B (0.33%)
you      2.34 B (0.31%)
were     2.27 B (0.31%)
their    2.15 B (0.29%)
one      2.15 B (0.29%)
all      2.06 B (0.28%)
we       2.06 B (0.28%)
can      1.67 B (0.22%)
her      1.63 B (0.22%)
has      1.63 B (0.22%)
there    1.62 B (0.22%)
been     1.62 B (0.22%)
if       1.56 B (0.21%)
more     1.55 B (0.21%)
when     1.52 B (0.20%)
will     1.49 B (0.20%)
would    1.47 B (0.20%)
who      1.46 B (0.20%)
so       1.45 B (0.19%)
no       1.40 B (0.19%)

>>> for n in sorted(H): print '%2d %9.2f M (%6.3f%%) %d' % (n, H[n]/1e6, H[n]*100./NN, H[n]*3000./NN, n)
...
 1  22301.22 M ( 2.998%) 1
 2 131293.85 M (17.651%) 2
 3 152568.38 M (20.511%) 3
 4 109988.33 M (14.787%) 4
 5  79589.32 M (10.700%) 5
 6  62391.21 M ( 8.388%) 6
 7  59052.66 M ( 7.939%) 7
 8  44207.29 M ( 5.943%) 8
 9  33006.93 M ( 4.437%) 9
10  22883.84 M ( 3.076%) 10
11  13098.06 M ( 1.761%) 11
12   7124.15 M ( 0.958%) 12
13   3850.58 M ( 0.518%) 13
14   1653.08 M ( 0.222%) 14
15    565.24 M ( 0.076%) 15
16    151.22 M ( 0.020%) 16
17     72.81 M ( 0.010%) 17
18     28.62 M ( 0.004%) 18
19      8.51 M ( 0.001%) 19
20      6.35 M ( 0.001%) 20
21      0.13 M ( 0.000%) 21
22      0.81 M ( 0.000%) 22
23      0.32 M ( 0.000%) 23

>>> NL = sum(LC.values())
>>> for L in sorted(LC, key=lambda L: -LC[L]): print '%s %8.1f B (%5.2f%%) ' % (L, LC[L]/1e9, LC[L]*100./NL, LC[L]*3000./NL)
...
E    445.2 B (12.49%)
T    330.5 B ( 9.28%)
A    286.5 B ( 8.04%)
O    272.3 B ( 7.64%)
I    269.7 B ( 7.57%)
N    257.8 B ( 7.23%)
S    232.1 B ( 6.51%)
R    223.8 B ( 6.28%)
H    180.1 B ( 5.05%)
L    145.0 B ( 4.07%)
D    136.0 B ( 3.82%)
C    119.2 B ( 3.34%)
U     97.3 B ( 2.73%)
M     89.5 B ( 2.51%)
F     85.6 B ( 2.40%)
P     76.1 B ( 2.14%)
G     66.6 B ( 1.87%)
W     59.7 B ( 1.68%)
Y     59.3 B ( 1.66%)
B     52.9 B ( 1.48%)
V     37.5 B ( 1.05%)
K     19.3 B ( 0.54%)
X      8.4 B ( 0.23%)
J      5.7 B ( 0.16%)
Q      4.3 B ( 0.12%)
Z      3.2 B ( 0.09%)

>>> D2 = ngramcount(D, 2)
>>> for ng in sorted(D2, key=lambda L: -D2[L])[:50]: print '%s %8.1f B (%5.2f%%) ' % (ng, D2[ng]/1e9, D2[ng]*100./N2, D2[ng]*15000./N2)

def doit(k=25):
    counts = [sortedby(ngramcount(D, n))[:k] for n in range(2, 10)]
    for i in range(k):
        print (' '.join(count[i] for count in counts)).lower()
"""