Add subdirectories
Add /ipynb/ and /py/ subdirectories to keep the home page neater.
This commit is contained in:
106
py/spell.py
Normal file
106
py/spell.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
|
||||
|
||||
Copyright (c) 2007-2016 Peter Norvig
|
||||
MIT license: www.opensource.org/licenses/mit-license.php
|
||||
"""
|
||||
|
||||
################ Spelling Corrector
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
def words(text): return re.findall(r'\w+', text.lower())
|
||||
|
||||
WORDS = Counter(words(open('big.txt').read()))
|
||||
|
||||
def P(word, N=sum(WORDS.values())):
|
||||
"Probability of `word`."
|
||||
return WORDS[word] / N
|
||||
|
||||
def correction(word):
|
||||
"Most probable spelling correction for word."
|
||||
return max(candidates(word), key=P)
|
||||
|
||||
def candidates(word):
|
||||
"Generate possible spelling corrections for word."
|
||||
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
|
||||
|
||||
def known(words):
|
||||
"The subset of `words` that appear in the dictionary of WORDS."
|
||||
return set(w for w in words if w in WORDS)
|
||||
|
||||
def edits1(word):
|
||||
"All edits that are one edit away from `word`."
|
||||
letters = 'abcdefghijklmnopqrstuvwxyz'
|
||||
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
|
||||
deletes = [L + R[1:] for L, R in splits if R]
|
||||
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
|
||||
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
|
||||
inserts = [L + c + R for L, R in splits for c in letters]
|
||||
return set(deletes + transposes + replaces + inserts)
|
||||
|
||||
def edits2(word):
|
||||
"All edits that are two edits away from `word`."
|
||||
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
|
||||
|
||||
################ Test Code
|
||||
|
||||
def unit_tests():
|
||||
assert correction('speling') == 'spelling' # insert
|
||||
assert correction('korrectud') == 'corrected' # replace 2
|
||||
assert correction('bycycle') == 'bicycle' # replace
|
||||
assert correction('inconvient') == 'inconvenient' # insert 2
|
||||
assert correction('arrainged') == 'arranged' # delete
|
||||
assert correction('peotry') =='poetry' # transpose
|
||||
assert correction('peotryy') =='poetry' # transpose + delete
|
||||
assert correction('word') == 'word' # known
|
||||
assert correction('quintessential') == 'quintessential' # unknown
|
||||
assert words('This is a TEST.') == ['this', 'is', 'a', 'test']
|
||||
assert Counter(words('This is a test. 123; A TEST this is.')) == (
|
||||
Counter({'123': 1, 'a': 2, 'is': 2, 'test': 2, 'this': 2}))
|
||||
assert len(WORDS) == 32192
|
||||
assert sum(WORDS.values()) == 1115504
|
||||
assert WORDS.most_common(10) == [
|
||||
('the', 79808),
|
||||
('of', 40024),
|
||||
('and', 38311),
|
||||
('to', 28765),
|
||||
('in', 22020),
|
||||
('a', 21124),
|
||||
('that', 12512),
|
||||
('he', 12401),
|
||||
('was', 11410),
|
||||
('it', 10681)]
|
||||
assert WORDS['the'] == 79808
|
||||
assert P('quintessential') == 0
|
||||
assert 0.07 < P('the') < 0.08
|
||||
return 'unit_tests pass'
|
||||
|
||||
def spelltest(tests, verbose=False):
|
||||
"Run correction(wrong) on all (right, wrong) pairs; report results."
|
||||
import time
|
||||
start = time.clock()
|
||||
good, unknown = 0, 0
|
||||
n = len(tests)
|
||||
for right, wrong in tests:
|
||||
w = correction(wrong)
|
||||
good += (w == right)
|
||||
if w != right:
|
||||
unknown += (right not in WORDS)
|
||||
if verbose:
|
||||
print('correction({}) => {} ({}); expected {} ({})'
|
||||
.format(wrong, w, WORDS[w], right, WORDS[right]))
|
||||
dt = time.clock() - start
|
||||
print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
|
||||
.format(good / n, n, unknown / n, n / dt))
|
||||
|
||||
def Testset(lines):
|
||||
"Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs."
|
||||
return [(right, wrong)
|
||||
for (right, wrongs) in (line.split(':') for line in lines)
|
||||
for wrong in wrongs.split()]
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(unit_tests())
|
||||
spelltest(Testset(open('spell-testset1.txt')))
|
||||
spelltest(Testset(open('spell-testset2.txt')))
|
||||
Reference in New Issue
Block a user