Python 3 fixes

This commit is contained in:
cclauss
2018-10-17 01:48:46 +02:00
parent 1697ee1c43
commit 6a8e87e17b
13 changed files with 140 additions and 116 deletions

View File

@@ -23,6 +23,7 @@ an external file format that looks like this:
"""
from __future__ import division
from __future__ import print_function
from collections import Counter, defaultdict
#### Read files in Books-Ngram format; convert to a dict
@@ -31,7 +32,7 @@ def read_year_file(filename, dic=None):
"""Read a file of 'word year word_count book_count' lines and convert to a dict
{WORD: totalcount}. Uppercase all words, and only include all-alphabetic words."""
if dic is None: dic = {}
for line in file(filename):
for line in open(filename):
word, year, c1, c2 = line.split('\t')
if '_' in word:
word = word[:word.index('_')]
@@ -44,14 +45,14 @@ def read_year_file(filename, dic=None):
def write_dict(dic, filename):
    """Write a {word: count} dict to filename, one 'word<TAB>count' line per key.

    Keys are written in sorted order. The file is opened for writing (any
    existing content is replaced) and is closed even if a write fails.
    Returns None.
    """
    # `open` replaces the Python 2-only `file` builtin; the `with` block
    # guarantees the handle is closed on both success and error.
    with open(filename, 'w') as out:
        for key in sorted(dic):
            out.write('%s\t%s\n' % (key, dic[key]))
def read_dict(filename, sep='\t'):
    """Read 'word<sep>count' lines from filename into a {word: count} dict.

    Each line is split on `sep` and must yield exactly two fields; the second
    is converted with int() (which tolerates the trailing newline). A line
    that does not split into two fields, or whose count is not an integer,
    raises ValueError.
    """
    # `open` replaces the Python 2-only `file` builtin; the dict is built
    # inside the `with` block so the lazy generator is consumed before the
    # file is closed.
    with open(filename) as lines:
        pairs = (line.split(sep) for line in lines)
        return {word: int(count) for (word, count) in pairs}
#### Convert a bunch of year files into dict file format.
@@ -61,9 +62,9 @@ def convert_files(filenames, mincount=1e5):
import time
N = len(D)
W = sum(v for v in D.itervalues())
print '%s: %s %s words (%s tokens) at %s' % (
print('%s: %s %s words (%s tokens) at %s' % (
filename, adj, format(W, ',d'), format(N, ',d'),
time.strftime("%H:%M:%S", time.gmtime()))
time.strftime("%H:%M:%S", time.gmtime())))
for f in filenames:
report(f, {}, 'starting')
D = read_year_file(f)
@@ -155,10 +156,10 @@ def getcount(counts, s, pos, length):
return counts[s, pos, length]
# Progress markers around the (currently disabled) count-loading steps.
print('start')
#wc = word_counts('count_100K.txt')
#counts = letter_counts(wc)
print('end')
@@ -172,18 +173,18 @@ def num(ch):
def stats(D, NS = (1, 2, 3, 4, 5, 6)):
    """Print a running table of how many distinct n-grams have been seen.

    Words are taken in the order produced by sortedby(D). For each n in NS a
    Counter of the n-grams returned by ngrams(word, n) is maintained. After
    every 5000 words, and again after the last word, one row is printed: the
    number of words processed so far (in thousands), then for each n the
    number of distinct n-grams seen and that number as a percentage of 26**n.
    Returns None.
    """
    counts = {n: Counter() for n in NS}
    print('words ' + ' '.join(' %d-grams ' % n for n in NS))
    for (i, word) in enumerate(sortedby(D), 1):
        for n in NS:
            for ng in ngrams(word, n):
                counts[n][ng] += 1
        if i % 5000 == 0 or i == len(D):
            # %d truncates the quotient, so this shows whole thousands.
            print("%4dK" % (i/1000), end=' ')
            for n in NS:
                c = len(counts[n])
                field = "%5d (%d%%)" % (c, int(round(c*100/(26**n))))
                print('%12s' % field, end=' ')
            print()  # end the row
letters = 'ETAOINSRHLDCUMFPGWYBVKXJQZ'
alphabet = ''.join(sorted(letters))
@@ -224,7 +225,7 @@ def substr(word, pos, length):
def lettercount(D, pos):
LC = histogram((substr(w, pos, 1), D[w]) for w in D)
del LC[None]
print LC
print(LC)
pos_name = (str(pos)+'+' if isinstance(pos, tuple) else
pos if pos < 0 else
pos+1)
@@ -293,7 +294,7 @@ def csvline(first, rest):
return '\t'.join([first] + map(str, rest))
def makecsv(n, D=D):
out = file('ngrams%d.csv' % n, 'w')
out = open('ngrams%d.csv' % n, 'w')
cols = columns(n)
Dng = defaultdict(lambda: defaultdict(int))
for w in D:
@@ -310,9 +311,9 @@ def makecsv(n, D=D):
if from_end <= 9:
entry[ANY, -from_end, -from_end+n-1] += N
# enumerate ngrams from word and increment counts for each one
print >> out, csvline('%d-gram' % n, map(colname, cols))
print(csvline('%d-gram' % n, map(colname, cols)), file=out)
for ng in sorted(Dng, key=lambda ng: -Dng[ng][(ANY, ANY)]):
print >> out, csvline(ng, [Dng[ng].get(col, 0) for col in cols])
print(csvline(ng, [Dng[ng].get(col, 0) for col in cols]), file=out)
out.close()
return Dng