From c4f9ae331a2bcde931e6c8a0020b421b0f0789e1 Mon Sep 17 00:00:00 2001 From: Peter Norvig Date: Tue, 23 Feb 2021 12:00:02 -0800 Subject: [PATCH] Add files via upload --- ipynb/SpellingBee.ipynb | 138 +++++++++++++++++++++++++++++----------- 1 file changed, 101 insertions(+), 37 deletions(-) diff --git a/ipynb/SpellingBee.ipynb b/ipynb/SpellingBee.ipynb index 502551d..8eca526 100644 --- a/ipynb/SpellingBee.ipynb +++ b/ipynb/SpellingBee.ipynb @@ -63,10 +63,13 @@ "source": [ "Word = str # Type for a word\n", "\n", - "def valid_words(text) -> List[Word]:\n", - " \"\"\"Words with at least 4 letters, no 'S', and no more than 7 distinct letters.\"\"\"\n", - " return [w for w in text.upper().split() \n", - " if len(w) >= 4 and 'S' not in w and len(set(w)) <= 7]\n", + "def valid(word) -> bool:\n", + " \"\"\"Does word have at least 4 letters, no 'S', and no more than 7 distinct letters?\"\"\"\n", + " return len(word) >= 4 and 'S' not in word and len(set(word)) <= 7\n", + "\n", + "def valid_words(text, valid=valid) -> List[Word]: \n", + " \"\"\"All the valid words in text.\"\"\"\n", + " return [w for w in text.upper().split() if valid(w)]\n", "\n", "def pangram_bonus(word) -> int: \n", " \"\"\"Does a word get a bonus for having 7 distinct letters (some maybe more than once)?\"\"\"\n", @@ -356,20 +359,20 @@ { "data": { "text/plain": [ - "[Honeycomb(letters='ACEIORT', center='A'),\n", - " Honeycomb(letters='ACEIORT', center='C'),\n", - " Honeycomb(letters='ACEIORT', center='E'),\n", - " Honeycomb(letters='ACEIORT', center='I'),\n", - " Honeycomb(letters='ACEIORT', center='O'),\n", - " Honeycomb(letters='ACEIORT', center='R'),\n", - " Honeycomb(letters='ACEIORT', center='T'),\n", - " Honeycomb(letters='AEGLMPX', center='A'),\n", + "[Honeycomb(letters='AEGLMPX', center='A'),\n", " Honeycomb(letters='AEGLMPX', center='E'),\n", " Honeycomb(letters='AEGLMPX', center='G'),\n", " Honeycomb(letters='AEGLMPX', center='L'),\n", " Honeycomb(letters='AEGLMPX', center='M'),\n", " Honeycomb(letters='AEGLMPX', center='P'),\n", - " Honeycomb(letters='AEGLMPX', center='X')]" + " Honeycomb(letters='AEGLMPX', center='X'),\n", + " Honeycomb(letters='ACEIORT', center='A'),\n", + " Honeycomb(letters='ACEIORT', center='C'),\n", + " Honeycomb(letters='ACEIORT', center='E'),\n", + " Honeycomb(letters='ACEIORT', center='I'),\n", + " Honeycomb(letters='ACEIORT', center='O'),\n", + " Honeycomb(letters='ACEIORT', center='R'),\n", + " Honeycomb(letters='ACEIORT', center='T')]" ] }, "execution_count": 13, @@ -536,8 +539,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 9.77 ms, sys: 35 µs, total: 9.8 ms\n", - "Wall time: 9.8 ms\n" + "CPU times: user 9.27 ms, sys: 37 µs, total: 9.31 ms\n", + "Wall time: 9.3 ms\n" ] }, { @@ -579,7 +582,7 @@ } ], "source": [ - ".01 * 55902 / 60" + "55902 * 10/1000 / 60" ] }, { @@ -757,8 +760,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 1.81 s, sys: 2.31 ms, total: 1.82 s\n", - "Wall time: 1.82 s\n" + "CPU times: user 1.81 s, sys: 3 ms, total: 1.81 s\n", + "Wall time: 1.81 s\n" ] }, { @@ -840,7 +843,7 @@ "metadata": {}, "outputs": [], "source": [ - "def best_honeycomb(words) -> Honeycomb: \n", + "def best_honeycomb2(words) -> Honeycomb: \n", " \"\"\"Return a honeycomb with highest game score on these words.\"\"\"\n", " points_table = tabulate_points(words)\n", " best, best_score = None, 0\n", @@ -864,8 +867,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 406 ms, sys: 1.13 ms, total: 407 ms\n", - "Wall time: 406 ms\n" + "CPU times: user 408 ms, sys: 1.74 ms, total: 410 ms\n", + "Wall time: 409 ms\n" ] }, { @@ -880,7 +883,7 @@ } ], "source": [ - "%time best_honeycomb(enable1)" + "%time best_honeycomb2(enable1)" ] }, { @@ -987,7 +990,7 @@ { "data": { "text/plain": [ - "[('has S', 103913), ('valid', 44585), ('> 7', 23400), ('< 4', 922)]" + "Counter({'< 4': 922, 'valid': 44585, 'has S': 103913, '> 7': 23400})" ] }, "execution_count": 33, @@ -1000,7 +1003,7 @@ " '< 4' if len(w) < 4 else \n", " '> 7' if len(set(w)) > 7 else \n", " 'valid'\n", - " for w in open('enable1.txt').read().upper().split()).most_common()" + " for w in valid_words(open('enable1.txt').read(), lambda w: True))" ] }, { @@ -1075,7 +1078,68 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The best honeycomb is also the highest scoring letter subset on its own (although it only gets 832 of the 3,898 total points from using all seven letters)." + "The best honeycomb is also the highest scoring letter subset on its own (although it only gets 832 of the 3,898 total points from using all seven letters).\n", + "\n", + "### How many honeycombs does `best_honeycomb2` consider?\n", + "\n", + "We know that `best_honeycomb` considers 7,986 × 7 = 55,902 honeycombs. How many does `best_honeycomb2` consider? We can answer that by wrapping `Honeycomb` with a decorator that counts calls:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(Honeycomb(letters='AEGINRT', center='R'), 8084)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def call_counter(fn):\n", + " \"Return a function that calls fn, and increments a counter on each call.\"\n", + " def wrapped(*args, **kwds):\n", + " wrapped.call_counter += 1\n", + " return fn(*args, **kwds)\n", + " wrapped.call_counter = 0\n", + " return wrapped\n", + " \n", + "Honeycomb = call_counter(Honeycomb)\n", + "\n", + "best_honeycomb2(enable1), Honeycomb.call_counter" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(8084 - 7986) / 7" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That means that most pangrams are only considered once; for only 14 pangrams do we consider all seven centers." ] }, { @@ -1089,7 +1153,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1131,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1156,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1338,7 +1402,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -1347,14 +1411,14 @@ "(98141, 44585)" ] }, - "execution_count": 39, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "enable1s = [w for w in open('enable1.txt').read().upper().split() \n", - " if len(w) >= 4 and len(set(w)) <= 7]\n", + "enable1s = valid_words(open('enable1.txt').read(), \n", + " lambda w: len(w) >= 4 and len(set(w)) <= 7)\n", "\n", "len(enable1s), len(enable1)" ] @@ -1368,7 +1432,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -1664,10 +1728,10 @@ "\n", "Thanks to a series of ideas, we were able to achieve a substantial reduction in the number of honeycombs that need to be examined (a factor of 400), the run time needed for `game_score` (a factor of about 200), and the overall run time (a factor of about 70,000).\n", "\n", - "- **Enumeration (10 hours (estimate) run time; 3,364,900 honeycombs)**
Try every possible honeycomb.\n", - "- **Pangram Lettersets (10 minutes (estimate) run time; 55,902 honeycombs)**
Try just the honeycombs that are pangram lettersets (with every center).\n", - "- **Points Table (under 2 seconds run time; 55,902 honeycombs)**
Precompute the score for each letterset, and sum the 64 letter subsets of each honeycomb.\n", - "- **Branch and Bound (under 1/2 second run time; 8,084 honeycombs)**
Try every center only for lettersets that score better than the best score so far.\n", + "- **Enumeration (3,364,900 honeycombs; 10 hours (estimate) run time)**
Try every possible honeycomb.\n", + "- **Pangram Lettersets (55,902 honeycombs; 10 minutes (estimate) run time)**
Try just the honeycombs that are pangram lettersets (with every center).\n", + "- **Points Table (55,902 honeycombs; under 2 seconds run time)**
Precompute the score for each letterset, and sum the 64 letter subsets of each honeycomb.\n", + "- **Branch and Bound (8,084 honeycombs; under 1/2 second run time)**
Try every center only for lettersets that score better than the best score so far.\n", "\n", "\n", "\n",