diff --git a/ipynb/xkxd-part3.ipynb b/ipynb/xkxd-part3.ipynb
deleted file mode 100644
index e2388b4..0000000
--- a/ipynb/xkxd-part3.ipynb
+++ /dev/null
@@ -1,428 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def partition(covers):\n",
-    "    \"Partition the keys of covers into connected components.\"\n",
-    "    # covers:    {w: {r,...}}\n",
-    "    # invcovers: {r: {w,...}}\n",
-    "    invcovers = invert_multimap(covers)\n",
-    "    components = []\n",
-    "    for w in covers:\n",
-    "        if not any(w in c for c in components):\n",
-    "            components.append(connected(w, covers, invcovers, set()))\n",
-    "    return components\n",
-    "\n",
-    "def connected(w, covers, invcovers, result):\n",
-    "    \"Add w and everything transitively connected to it (via shared coverage) to result.\"\n",
-    "    if w not in result:\n",
-    "        result.add(w)\n",
-    "        for r in covers[w]:\n",
-    "            for w2 in invcovers[r]:\n",
-    "                connected(w2, covers, invcovers, result)\n",
-    "    return result\n",
-    "\n",
-    "for (W, L, legend) in ALL:\n",
-    "    covers = eliminate_dominated(regex_covers(W, L))\n",
-    "    invcovers = invert_multimap(covers)\n",
-    "    start = list(covers)[2]\n",
-    "    P = connected(start, covers, invcovers, set())\n",
-    "    print(legend, len(P), len(covers), len(covers)-len(P))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finding Shorter Regexes: Trying Multiple Times\n",
-    "----\n",
-    " \n",
-    "Why run just two versions of `findregex`? Why not run 1000 variations, and then pick the best solution? Of course, I don't want to write 1000 different functions by hand; I want an automated way of varying each run. I can think of three easy things to vary:\n",
-    " \n",
-    "* The number '4' in the `score` function. That is, vary the tradeoff between the number of winners matched and the number of characters.\n",
-    "* The tie-breaker. In case of a tie, Python's `max` function always picks the first one. Let's make it choose a different 'best' regex from among all those that tie.\n",
-    "* The greediness. Don't be so greedy (picking the best) every time. Occasionally pick a not-quite-best component, and see if that works out better.\n",
-    " \n",
-    "The first of these is easy; we just use the `random.choice` function to choose an integer, `K`, to serve as the tradeoff factor.\n",
-    "\n",
-    "The second is easy too. We could write an alternative to the `max` function, say `max_random_tiebreaker`. That would work, but an easier approach is to build the tiebreaker into the `score` function. In addition to awarding points for matching winners and subtracting points for the number of characters, we will add in a tiebreaker: a random number between 0 and 1. Since all the scores are otherwise integers, this will not change the order of the scores, but it will break ties.\n",
-    "\n",
-    "The third we can accomplish by allowing the random factor to be larger than 1 (allowing us to pick a component that is not the shortest) or even larger than `K` (allowing us to pick a component that does not cover the most winners).\n",
-    " \n",
-    "I will factor out the function `greedy_search` to do a single computation of a covering regex, while keeping the name `findregex` for the top-level function that now calls `greedy_search` 1000 times and chooses the best (shortest length) result."
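As a quick, self-contained illustration of the tiebreaker idea (the `pick_best` helper and the toy components below are invented for this example; they are not part of the notebook's code): adding a random number between 0 and 1 to otherwise-integer scores cannot reorder distinct scores, but it does let `max` pick a different member of a tied group on different runs.

    import random

    def pick_best(components, base_score):
        # Pick a max-scoring component, breaking ties at random (illustration only).
        return max(components, key=lambda c: base_score(c) + random.uniform(0., 1.))

    # 'a.c' and 'x.z' tie on the integer score -len(c); 'foo|bar|baz' scores far lower,
    # so the sub-1 random bonus can reorder the tied pair but never promote the bad one.
    print({pick_best(['a.c', 'x.z', 'foo|bar|baz'], lambda c: -len(c)) for _ in range(50)})

Over 50 runs the printed set will almost surely contain both tied components, which is exactly the kind of variation we want across the 1000 `greedy_search` calls.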
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "def findregex(winners, losers, tries=1000):\n",
-    "    \"Find a regex that matches all winners but no losers (sets of strings).\"\n",
-    "    # Repeatedly call greedy_search the given number of times; pick the shortest result\n",
-    "    covers = regex_covers(winners, losers)\n",
-    "    results = [greedy_search(winners, covers) for _ in range(tries)]\n",
-    "    return min(results, key=len)\n",
-    "\n",
-    "def greedy_search(winners, covers):\n",
-    "    # On each iteration, add the 'best' component in covers to 'result',\n",
-    "    # remove winners covered by best, and remove from 'pool' any components\n",
-    "    # that no longer match any remaining winners.\n",
-    "    winners = set(winners) # Copy input so as not to modify it.\n",
-    "    pool = set(covers)\n",
-    "    result = []\n",
-    "    \n",
-    "    def matches(regex, strings): return {w for w in covers[regex] if w in strings}\n",
-    "    \n",
-    "    K = random.choice((2, 3, 4, 4, 5, 6))\n",
-    "    T = random.choice((1., 1.5, 2., K+1., K+2.))\n",
-    "    def score(c): \n",
-    "        return K * len(matches(c, winners)) - len(c) + random.uniform(0., T)\n",
-    "    \n",
-    "    while winners:\n",
-    "        best = max(pool, key=score)\n",
-    "        result.append(best)\n",
-    "        winners -= covers[best]\n",
-    "        pool -= {c for c in pool if covers[c].isdisjoint(winners)}\n",
-    "    return OR(result)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def factorial1(n):\n",
-    "    if (n <= 1):\n",
-    "        return 1\n",
-    "    else:\n",
-    "        return n * factorial1(n-1)\n",
-    "\n",
-    "def factorial2(n, partial_solution=1):\n",
-    "    if (n <= 1):\n",
-    "        return partial_solution\n",
-    "    else:\n",
-    "        return factorial2(n-1, n * partial_solution)\n",
-    "    \n",
-    "assert factorial1(6) == factorial2(6) == 720"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def findregex(winners, losers, calls=100000):\n",
-    "    \"Find the shortest disjunction of regex components that covers winners but not losers.\"\n",
-    "    covers = regex_covers(winners, losers)\n",
-    "    best = '^(' + OR(winners) + ')$'\n",
-    "    state = Struct(best=best, calls=calls)\n",
-    "    return bb_search('', covers, state).best\n",
-    "\n",
-    "def bb_search(regex, covers, state):\n",
-    "    \"\"\"Recursively build a shortest regex from the components in covers.\"\"\"\n",
-    "    if state.calls > 0:\n",
-    "        state.calls -= 1\n",
-    "        regex, covers = simplify_covers(regex, covers)\n",
-    "        if not covers:\n",
-    "            state.best = min(regex, state.best, key=len)\n",
-    "        elif len(OR2(regex, min(covers, key=len))) < len(state.best):\n",
-    "            # Try with and without the greedy-best component\n",
-    "            def score(c): return 4 * len(covers[c]) - len(c)\n",
-    "            best = max(covers, key=score)\n",
-    "            covered = covers[best]\n",
-    "            covers.pop(best)\n",
-    "            bb_search(OR2(regex, best), {c:covers[c]-covered for c in covers}, state)\n",
-    "            bb_search(regex, covers, state)\n",
-    "    return state\n",
-    "\n",
-    "class Struct(object):\n",
-    "    \"A mutable structure with specified fields and values.\"\n",
-    "    def __init__(self, **kwds): vars(self).update(kwds)\n",
-    "    def __repr__(self): return '<%s>' % vars(self)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def findregex(winners, losers, calls=100000):\n",
-    "    \"Find the shortest disjunction of regex components that covers winners but not losers.\"\n",
-    "    covers = regex_covers(winners, losers)\n",
-    "    solution = '^(' + OR(winners) + ')$'\n",
-    "    solution, calls = bb_search('', covers, solution, calls)\n",
-    "    return solution\n",
-    "\n",
-    "def bb_search(regex, covers, solution, calls):\n",
-    "    \"\"\"Recursively build a shortest regex from the components in covers.\"\"\"\n",
-    "    if calls > 0:\n",
-    "        calls -= 1\n",
-    "        regex, covers = simplify_covers(regex, covers)\n",
-    "        if not covers: # Solution is complete\n",
-    "            solution = min(regex, solution, key=len)\n",
-    "        elif len(OR2(regex, min(covers, key=len))) < len(solution):\n",
-    "            # Try with and without the greedy-best component\n",
-    "            def score(c): return 4 * len(covers[c]) - len(c)\n",
-    "            r = max(covers, key=score) # Best component\n",
-    "            covered = covers[r] # Set of winners covered by r\n",
-    "            covers.pop(r)\n",
-    "            solution, calls = bb_search(OR2(regex, r), \n",
-    "                                        {c:covers[c]-covered for c in covers}, \n",
-    "                                        solution, calls)\n",
-    "            solution, calls = bb_search(regex, covers, solution, calls)\n",
-    "    return solution, calls"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def findregex(winners, losers, calls=100000):\n",
-    "    \"Find the shortest disjunction of regex components that covers winners but not losers.\"\n",
-    "    global SOLUTION, CALLS\n",
-    "    SOLUTION = '^(' + OR(winners) + ')$'\n",
-    "    CALLS = calls\n",
-    "    return bb_search(None, regex_covers(winners, losers))\n",
-    "\n",
-    "def bb_search(regex, covers):\n",
-    "    \"\"\"Recursively build a shortest regex from the components in covers.\"\"\"\n",
-    "    global SOLUTION, CALLS\n",
-    "    CALLS -= 1\n",
-    "    regex, covers = simplify_covers(regex, covers)\n",
-    "    if not covers: # Solution is complete\n",
-    "        SOLUTION = min(regex, SOLUTION, key=len)\n",
-    "    elif CALLS >= 0 and len(OR(regex, min(covers, key=len))) < len(SOLUTION):\n",
-    "        # Try with and without the greedy-best component\n",
-    "        def score(c): return 4 * len(covers[c]) - len(c)\n",
-    "        r = max(covers, key=score) # Best component\n",
-    "        covered = covers[r] # Set of winners covered by r\n",
-    "        covers.pop(r)\n",
-    "        bb_search(OR(regex, r), {c:covers[c]-covered for c in covers})\n",
-    "        bb_search(regex, covers)\n",
-    "    return SOLUTION\n",
-    "    \n",
-    "def OR(*regexes):\n",
-    "    \"OR together regexes. Ignore 'None' components.\"\n",
-    "    return '|'.join(r for r in regexes if r is not None)\n",
-    "\n",
-    "\n",
-    "def invert_multimap(multimap):\n",
-    "    result = collections.defaultdict(list)\n",
-    "    for key in multimap:\n",
-    "        for val in multimap[key]:\n",
-    "            result[val].append(key)\n",
-    "    return result"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "## For debugging\n",
-    "\n",
-    "def findregex(winners, losers, calls=100000):\n",
-    "    \"Find the shortest disjunction of regex components that covers winners but not losers.\"\n",
-    "    solution = '^(' + OR(winners) + ')$'\n",
-    "    covers = regex_covers(winners, losers)\n",
-    "    b = BranchBound(solution, calls)\n",
-    "    b.search(None, covers)\n",
-    "    print(b.calls, 'calls', len(b.solution), 'len')\n",
-    "    return b.solution\n",
-    "\n",
-    "\n",
-    "def triage_covers(partial, covers):\n",
-    "    \"Simplify covers by eliminating dominated regexes, and picking ones that uniquely cover a winner.\"\n",
-    "    previous = None\n",
-    "    while covers != previous:\n",
-    "        previous = covers\n",
-    "        # Eliminate regexes that are dominated by another regex\n",
-    "        covers = eliminate_dominated(covers) # covers = {regex: {winner,...}}\n",
-    "        coverers = invert_multimap(covers) # coverers = {winner: {regex,...}}\n",
-    "        # For winners covered by only one component, move that component into the partial solution\n",
-    "        singletons = {coverers[w][0] for w in coverers if len(coverers[w]) == 1}\n",
-    "        if singletons:\n",
-    "            partial = OR(partial, OR(singletons))\n",
-    "            covered = {w for c in singletons for w in covers[c]}\n",
-    "            covers = {c:covers[c]-covered for c in covers if c not in singletons}\n",
-    "    return partial, covers\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    ", and to , who suggested looking at [WFSTs](http://www.openfst.org/twiki/bin/view/FST/WebHome)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "def regex_covers(winners, losers):\n",
-    "    \"\"\"Generate regex components and return a dict of {regex: {winner...}}.\n",
-    "    Each regex matches at least one winner and no loser.\"\"\"\n",
-    "    losers_str = '\\n'.join(losers)\n",
-    "    wholes = {'^'+winner+'$' for winner in winners}\n",
-    "    parts = {d for w in wholes for p in subparts(w) for d in dotify(p)}\n",
-    "    chars = set(cat(winners))\n",
-    "    pairs = {A+'.'+rep_char+B for A in chars for B in chars for rep_char in '+*?'}\n",
-    "    reps = {r for p in parts for r in repetitions(p)}\n",
-    "    pool = wholes | parts | pairs | reps \n",
-    "    searchers = [re.compile(c, re.MULTILINE).search for c in pool]\n",
-    "    covers = {r: set(filter(searcher, winners)) \n",
-    "              for (r, searcher) in zip(pool, searchers)\n",
-    "              if not searcher(losers_str)}\n",
-    "    covers = eliminate_dominated(covers)\n",
-    "    return covers\n",
-    "    # return add_character_class_components(covers)  # unreachable after the return above; applied explicitly below\n",
-    "\n",
-    "def add_character_class_components(covers):\n",
-    "    for (B, Ms, E) in combine_splits(covers):\n",
-    "        N = len(Ms)\n",
-    "        or_size = N*len(B+'.'+E) + N-1 # N=3 => 'B1E|B2E|B3E'\n",
-    "        class_size = len(B+'[]'+E) + N # N=3 => 'B[123]E'\n",
-    "        winners = {w for m in Ms for w in Ms[m]}\n",
-    "        if class_size < or_size:\n",
-    "            covers[B + make_char_class(Ms) + E] = winners\n",
-    "    return covers\n",
-    "\n",
-    "def split3(word):\n",
-    "    \"Splits a word into 3 parts, all ways, with middle part having 0 or 1 character.\"\n",
-    "    return [(word[:i], word[i:i+L], word[i+L:]) \n",
-    "            for i in range(len(word)+1) for L in (0, 1)\n",
-    "            if not word[i:i+L].startswith(('.', '+', '*', '?'))]\n",
-    "\n",
-    "def combine_splits(covers):\n",
-    "    \"Convert covers = {BME: {w...}} into a list of (B, {M: {w...}}, E) triples.\"\n",
-    "    table = collections.defaultdict(dict) # table = {(B, E): {M: {w...}}}\n",
-    "    for r in covers:\n",
-    "        for (B, M, E) in split3(r):\n",
-    "            table[B, E][M] = covers[r]\n",
-    "    return [(B, Ms, E) for ((B, E), Ms) in table.items()\n",
-    "            if len(Ms) > 1]\n",
-    "\n",
-    "def make_char_class(chars):\n",
-    "    chars = set(chars)\n",
-    "    return '[%s]%s' % (cat(chars), ('?' if '' in chars else ''))\n",
-    "\n",
-    "covers = regex_covers(boys, girls)\n",
-    "old = set(covers)\n",
-    "print(len(covers))\n",
-    "covers = add_character_class_components(covers)\n",
-    "print(len(covers))\n",
-    "print(set(covers) - old)\n",
-    "\n",
-    "print(combine_splits({'..a': {1,2,3}, '..b': {4,5,6}, '..c': {7}}))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Consider the two components `'..a'` and `'..b'`. If we wanted to cover all the winners that both of these match, we could use `'..a|..b'`, or we could share the common prefix and introduce a *character class* to get `'..[ab]'`. Since the former is 7 characters and the latter is only 6, the latter would be preferred. It would be an even bigger win to replace `'..az|..bz|..cz'` with `'..[abc]z'`; that reduces the count from 14 to 8. Similarly, replacing `'..az|..bz|..z'` with `'..[ab]?z'` saves 5 characters.\n",
-    "\n",
-    "There seem to be potential savings with character classes. But how do we know which characters from which components to combine into classes? To keep things from getting out of control, I'm going to look only at components that are left after we eliminate dominated ones. That is not an ideal approach—there may well be some components that are dominated on their own, but could be part of an optimal solution when combined with other components into a character class. But I'm going to keep it simple."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "Searching: Better Bounds\n",
-    "----\n",
-    "\n",
-    "Branch and bound prunes the search tree whenever it is on a branch that is guaranteed to result in a solution that is no better than the best solution found so far. Currently we estimate the best possible solution along the current branch by taking the length of the partial solution and adding the length of the shortest component in `covers`. We do that because we know for sure that we need at least one more component, but we don't know for sure how many components we'll need (nor how long each of them will be). So our estimate often severely underestimates the true answer, which means we don't cut off the search in some places where we could, if only we had a better estimate.\n",
-    " \n",
-    "Here's one way to get a better bound. We'll define the following quantities:\n",
-    "\n",
-    "+ *P* = the length of the partial solution, plus the \"|\", if needed. So if the partial solution is `None`, then *P* will be zero; otherwise *P* is the length plus 1.\n",
-    "+ *S* = the length of the shortest regex component in `covers`.\n",
-    "+ *W* = the number of winners still in `covers`.\n",
-    "+ *C* = the largest number of winners covered by any regex in `covers`.\n",
-    "\n",
-    "The current estimate is *P* + *S*. But covering the remaining *W* winners takes at least ceil(*W* / *C*) components, each of length at least *S*, so a better estimate is *P* + *S* × ceil(*W* / *C*)."
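To make the arithmetic of the new bound concrete, here is a small stand-alone sketch (the `lower_bound` helper and the toy covers dict are invented for illustration; the `BranchBound.search` cell below computes the same quantities inline):

    import math

    def lower_bound(partial, covers):
        # Lower bound on the length of any completed solution, per the estimate above.
        P = 0 if not partial else len(partial) + 1       # partial solution plus '|'
        S = len(min(covers, key=len))                    # shortest remaining component
        W = len({w for r in covers for w in covers[r]})  # winners still uncovered
        C = max(len(covers[r]) for r in covers)          # most winners any one component covers
        return P + S * math.ceil(W / C)

    # Toy numbers: no partial solution, shortest component 'a.c' (S=3), W=3 winners left,
    # and at most C=2 covered by any one component: old bound = 3, new bound = 3 * ceil(3/2) = 6.
    print(lower_bound(None, {'a.c': {'wa', 'wb'}, 'x..z': {'wc'}}))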
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "import math\n",
-    "\n",
-    "class BranchBound(object):\n",
-    "    \"Hold state information for a branch and bound search.\"\n",
-    "    def __init__(self, solution, calls):\n",
-    "        self.solution, self.calls = solution, calls\n",
-    "    \n",
-    "    def search(self, covers, partial=None):\n",
-    "        \"Recursively extend partial regex until it matches all winners in covers.\"\n",
-    "        if self.calls <= 0: \n",
-    "            return self.solution\n",
-    "        self.calls -= 1\n",
-    "        covers, partial = simplify_covers(covers, partial)\n",
-    "        if not covers: # Nothing left to cover; solution is complete\n",
-    "            self.solution = min(partial, self.solution, key=len)\n",
-    "        else:\n",
-    "            P = 0 if not partial else len(partial) + 1\n",
-    "            S = len(min(covers, key=len))\n",
-    "            C = max(len(covers[r]) for r in covers)\n",
-    "            W = len(set(w for r in covers for w in covers[r]))\n",
-    "            if P + S * math.ceil(W / C) < len(self.solution):\n",
-    "                # Try with and without the greedy-best component\n",
-    "                def score(r): return 4 * len(covers[r]) - len(r)\n",
-    "                r = max(covers, key=score) # Best component\n",
-    "                covered = covers[r] # Set of winners covered by r\n",
-    "                covers.pop(r)\n",
-    "                self.search({c:covers[c]-covered for c in covers}, OR(partial, r))\n",
-    "                self.search(covers, partial)\n",
-    "        return self.solution"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.5.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}