updated contents from Atlas repo

2014-10-14 14:26:55 -03:00
parent 40688c038d
commit 981d5bc473
157 changed files with 71134 additions and 1 deletions
--- a/dicts/container_perftest.py
+++ b/dicts/container_perftest.py
@@ -0,0 +1,73 @@
+"""
+Container ``in`` operator performance test
+
+Usage: container_perftest.py [-v] <container_type>
+
+Reads ``selected.arr`` and ``not_selected.arr`` from the current directory.
+"""
+import sys
+import timeit
+
+# Setup code for ``timeit``, completed by ``str.format``. It loads ``size``
+# doubles from ``selected.arr`` into the haystack container, then builds the
+# needles: 500 doubles from ``not_selected.arr`` plus every
+# ``size // 500``-th item of the selected values.
+SETUP = '''
+import array
+selected = array.array('d')
+with open('selected.arr', 'rb') as fp:
+    selected.fromfile(fp, {size})
+if {container_type} is dict:
+    haystack = dict.fromkeys(selected, 1)
+else:
+    haystack = {container_type}(selected)
+if {verbose}:
+    print(type(haystack), end='  ')
+    print('haystack: %10d' % len(haystack), end='  ')
+needles = array.array('d')
+with open('not_selected.arr', 'rb') as fp:
+    needles.fromfile(fp, 500)
+needles.extend(selected[::{size}//500])
+if {verbose}:
+    print(' needles: %10d' % len(needles), end='  ')
+'''
+
+# Timed statement: counts the needles that are present in the haystack.
+TEST = '''
+found = 0
+for n in needles:
+    if n in haystack:
+        found += 1
+if {verbose}:
+    print('  found: %10d' % found)
+'''
+
+
+def test(container_type, verbose, max_exponent=7):
+    """Time ``in`` lookups on haystacks of 10**3 .. 10**max_exponent items.
+
+    Prints one ``|size|seconds`` row per size; seconds is the fastest of
+    5 single executions of ``TEST``.
+
+    `container_type` is source text naming the container (e.g. ``'dict'``)
+    that gets pasted into ``SETUP``; `verbose` turns on the progress prints
+    inside the generated code.
+    """
+    for exponent in range(3, max_exponent + 1):
+        size = 10**exponent
+        setup = SETUP.format(container_type=container_type,
+                             size=size, verbose=verbose)
+        # called ``stmt`` so it does not shadow this function's own name
+        stmt = TEST.format(verbose=verbose)
+        timings = timeit.repeat(stmt=stmt, setup=setup, repeat=5, number=1)
+        print('|{:{}d}|{:f}'.format(size, max_exponent + 1, min(timings)))
+
+
+if __name__ == '__main__':
+    verbose = '-v' in sys.argv
+    if verbose:
+        sys.argv.remove('-v')
+    if len(sys.argv) != 2:
+        print('Usage: %s <container_type>' % sys.argv[0])
+    else:
+        test(sys.argv[1], verbose)
--- a/dicts/container_perftest_datagen.py
+++ b/dicts/container_perftest_datagen.py
@@ -0,0 +1,50 @@
+"""
+Generate data for container performance test
+
+Writes two binary files of distinct doubles to the current directory:
+``not_selected.arr`` and ``selected.arr``.
+"""
+
+import random
+import array
+
+MAX_EXPONENT = 7
+HAYSTACK_LEN = 10 ** MAX_EXPONENT
+NEEDLES_LEN = 10 ** (MAX_EXPONENT - 1)
+SAMPLE_LEN = HAYSTACK_LEN + NEEDLES_LEN // 2
+
+
+def random_value():
+    """Return the reciprocal of a random float taken from (0.0, 1.0].
+
+    ``random.random()`` can return exactly 0.0, so it is subtracted from 1
+    to get a divisor that is never zero.
+    """
+    return 1 / (1 - random.random())
+
+
+sample = {random_value() for _ in range(SAMPLE_LEN)}
+print('initial sample: %d elements' % len(sample))
+
+# complete sample, in case duplicate random numbers were discarded
+while len(sample) < SAMPLE_LEN:
+    sample.add(random_value())
+
+print('complete sample: %d elements' % len(sample))
+
+sample = array.array('d', sample)
+random.shuffle(sample)
+
+# the first NEEDLES_LEN // 2 shuffled values go to not_selected.arr
+not_selected = sample[:NEEDLES_LEN // 2]
+print('not selected: %d samples' % len(not_selected))
+print('  writing not_selected.arr')
+with open('not_selected.arr', 'wb') as fp:
+    not_selected.tofile(fp)
+
+# the remaining HAYSTACK_LEN values go to selected.arr
+selected = sample[NEEDLES_LEN // 2:]
+print('selected: %d samples' % len(selected))
+print('  writing selected.arr')
+with open('selected.arr', 'wb') as fp:
+    selected.tofile(fp)
--- a/dicts/dialcodes.py
+++ b/dicts/dialcodes.py
@@ -0,0 +1,37 @@
+# BEGIN DIALCODES
+# dial codes of the top 10 most populous countries
+DIAL_CODES = [
+        (86, 'China'),
+        (91, 'India'),
+        (1, 'United States'),
+        (62, 'Indonesia'),
+        (55, 'Brazil'),
+        (92, 'Pakistan'),
+        (880, 'Bangladesh'),
+        (234, 'Nigeria'),
+        (7, 'Russia'),
+        (81, 'Japan'),
+    ]
+
+# Build the same mapping from the pairs in three different orders.
+d1 = dict(DIAL_CODES)  # <1>
+print('d1:', d1.keys())
+pairs_by_code = sorted(DIAL_CODES)
+d2 = dict(pairs_by_code)  # <2>
+print('d2:', d2.keys())
+pairs_by_country = sorted(DIAL_CODES, key=lambda pair: pair[1])
+d3 = dict(pairs_by_country)  # <3>
+print('d3:', d3.keys())
+# dict equality ignores the order in which the items were added
+assert d1 == d2 == d3  # <4>
+# END DIALCODES
+# Sample output of one run follows.
+# NOTE(review): the key order shown depends on the interpreter that produced
+# it; presumably an old CPython -- confirm before relying on it.
+"""
+# BEGIN DIALCODES_OUTPUT
+d1: dict_keys([880, 1, 86, 55, 7, 234, 91, 92, 62, 81])
+d2: dict_keys([880, 1, 91, 86, 81, 55, 234, 7, 92, 62])
+d3: dict_keys([880, 81, 1, 86, 55, 7, 234, 91, 92, 62])
+# END DIALCODES_OUTPUT
+"""
--- a/dicts/dict_perftest.py
+++ b/dicts/dict_perftest.py
@@ -0,0 +1,42 @@
+"""
+Dict performance test
+
+Times the ``in`` operator on dicts of growing size, using the data files
+``selected.arr`` and ``not_selected.arr`` found in the current directory.
+"""
+
+import timeit
+
+# Setup code run by ``timeit`` before each timing; ``{size}`` is filled in with
+# the number of doubles to load from ``selected.arr`` as dict keys.
+SETUP = '''
+import array
+selected = array.array('d')
+with open('selected.arr', 'rb') as fp:
+    selected.fromfile(fp, {size})
+haystack = dict((n, n.as_integer_ratio()) for n in selected)
+print('haystack: %10d' % len(haystack), end='  ')
+needles = array.array('d')
+with open('not_selected.arr', 'rb') as fp:
+    needles.fromfile(fp, 500)
+needles.extend(selected[:500])
+# print(' needles: %10d' % len(needles), end='  ')
+'''
+
+# Timed statement: counts the needles that are keys of the haystack.
+TEST = '''
+found = 0
+for n in needles:
+    if n in haystack:
+        found += 1
+# print('  found: %10d' % found)
+'''
+
+MAX_EXPONENT = 7
+# one ``|size|seconds`` row per haystack size, from 10**3 up to 10**MAX_EXPONENT
+for exponent in range(3, MAX_EXPONENT + 1):
+    size = 10**exponent
+    timings = timeit.repeat(stmt=TEST, setup=SETUP.format(size=size),
+                            repeat=5, number=1)
+    best = min(timings)  # fastest of the 5 single runs
+    print('|{:{}d}|{:f}'.format(size, MAX_EXPONENT + 1, best))
--- a/dicts/hashdiff.py
+++ b/dicts/hashdiff.py
@@ -0,0 +1,42 @@
+"""Show which bits differ between the hashes of two objects."""
+
+import sys
+
+MAX_BITS = len(format(sys.maxsize, 'b'))
+print('%s-bit Python build' % (MAX_BITS + 1))
+
+# hashes can be negative: one bit more than ``sys.maxsize`` uses is needed
+# to show them in two's complement
+HASH_BITS = MAX_BITS + 1
+HASH_MASK = (1 << HASH_BITS) - 1
+
+
+def _hash_bits(obj):
+    """Return ``hash(obj)`` as a string of exactly HASH_BITS binary digits."""
+    # masking turns a negative hash into its unsigned two's complement value,
+    # so no '-' sign ends up among the digits
+    return '{:0{}b}'.format(hash(obj) & HASH_MASK, HASH_BITS)
+
+
+def hash_diff(o1, o2):
+    """Return a 4-line report comparing the hash bits of `o1` and `o2`.
+
+    Lines 1 and 3 show the repr of each object followed by its hash bits;
+    line 2 puts a ``!`` under every bit that differs, followed by the count
+    of differing bits; line 4 is a dashed separator.
+    """
+    h1 = _hash_bits(o1)
+    h2 = _hash_bits(o2)
+    diff = ''.join('!' if b1 != b2 else ' ' for b1, b2 in zip(h1, h2))
+    count = '!= {}'.format(diff.count('!'))
+    width = max(len(repr(o1)), len(repr(o2)), 8)
+    sep = '-' * (width * 2 + HASH_BITS)
+    return '{!r:{width}} {}\n{:{width}} {} {}\n{!r:{width}} {}\n{}'.format(
+        o1, h1, ' ' * width, diff, count, o2, h2, sep, width=width)
+
+
+if __name__ == '__main__':
+    print(hash_diff(1, 1.0))
+    print(hash_diff(1.0, 1.0001))
+    print(hash_diff(1.0001, 1.0002))
+    print(hash_diff(1.0002, 1.0003))
--- a/dicts/index.py
+++ b/dicts/index.py
@@ -0,0 +1,29 @@
+# adapted from Alex Martelli's example in "Re-learning Python"
+# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
+# (slide 41) Ex: lines-by-word file index
+
+# BEGIN INDEX
+"""Build an index mapping word -> list of occurrences"""
+
+import sys
+import re
+
+# raw string: '\w' is not a valid escape sequence in a plain str literal
+WORD_RE = re.compile(r'\w+')
+
+# word -> list of (line_no, column_no) pairs, both counted from 1
+index = {}
+# the file to index is named by the first command-line argument
+with open(sys.argv[1], encoding='utf-8') as fp:
+    for line_no, line in enumerate(fp, 1):
+        for match in WORD_RE.finditer(line):
+            word = match.group()
+            column_no = match.start()+1
+            location = (line_no, column_no)
+            # get the list for word, creating it on first sight, and append
+            index.setdefault(word, []).append(location)  # <1>
+
+# print in alphabetical order
+for word in sorted(index, key=str.upper):
+    print(word, index[word])
+# END INDEX
--- a/dicts/index0.py
+++ b/dicts/index0.py
@@ -0,0 +1,31 @@
+# adapted from Alex Martelli's example in "Re-learning Python"
+# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
+# (slide 41) Ex: lines-by-word file index
+
+# BEGIN INDEX0
+"""Build an index mapping word -> list of occurrences"""
+
+import sys
+import re
+
+# raw string: '\w' is not a valid escape sequence in a plain str literal
+WORD_RE = re.compile(r'\w+')
+
+# word -> list of (line_no, column_no) pairs, both counted from 1
+index = {}
+# the file to index is named by the first command-line argument
+with open(sys.argv[1], encoding='utf-8') as fp:
+    for line_no, line in enumerate(fp, 1):
+        for match in WORD_RE.finditer(line):
+            word = match.group()
+            column_no = match.start()+1
+            location = (line_no, column_no)
+            # this is ugly; coded like this to make a point
+            occurrences = index.get(word, [])  # <1>
+            occurrences.append(location)       # <2>
+            index[word] = occurrences          # <3>
+
+# print in alphabetical order
+for word in sorted(index, key=str.upper):
+    print(word, index[word])
+# END INDEX0
--- a/dicts/index_alex.py
+++ b/dicts/index_alex.py
@@ -0,0 +1,26 @@
+# adapted from Alex Martelli's example in "Re-learning Python"
+# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
+# (slide 41) Ex: lines-by-word file index
+
+
+"""Build a map word -> list-of-line-numbers"""
+
+import sys
+import re
+
+# raw string: '\W' is not a valid escape sequence in a plain str literal
+NONWORD_RE = re.compile(r'\W+')
+
+# word -> list of 1-based numbers of the lines where it occurs
+idx = {}
+# the file to index is named by the first command-line argument
+with open(sys.argv[1], encoding='utf-8') as fp:
+    for n, line in enumerate(fp, 1):
+        for word in NONWORD_RE.split(line):
+            # split() may yield '' (e.g. for the trailing newline): skip it
+            if word:
+                idx.setdefault(word, []).append(n)
+
+# print in alphabetical order
+for word in sorted(idx, key=str.upper):
+    print(word, idx[word])
--- a/dicts/index_default.py
+++ b/dicts/index_default.py
@@ -0,0 +1,29 @@
+# adapted from Alex Martelli's example in "Re-learning Python"
+# http://www.aleax.it/Python/accu04_Relearn_Python_alex.pdf
+# (slide 41) Ex: lines-by-word file index
+
+# BEGIN INDEX_DEFAULT
+"""Build an index mapping word -> list of occurrences"""
+
+import sys
+import re
+import collections
+
+# raw string: '\w' is not a valid escape sequence in a plain str literal
+WORD_RE = re.compile(r'\w+')
+
+# word -> list of (line_no, column_no) pairs; missing words start as []
+index = collections.defaultdict(list)     # <1>
+# the file to index is named by the first command-line argument
+with open(sys.argv[1], encoding='utf-8') as fp:
+    for line_no, line in enumerate(fp, 1):
+        for match in WORD_RE.finditer(line):
+            word = match.group()
+            column_no = match.start()+1
+            location = (line_no, column_no)
+            index[word].append(location)  # <2>
+
+# print in alphabetical order
+for word in sorted(index, key=str.upper):
+    print(word, index[word])
+# END INDEX_DEFAULT
--- a/dicts/set_perftest.py
+++ b/dicts/set_perftest.py
@@ -0,0 +1,55 @@
+"""
+Set performance test
+
+Times membership tests of the same needles against a dict, a set and a
+list, using ``selected.arr`` and ``not_selected.arr`` from the current
+directory.
+"""
+
+import timeit
+
+# Setup code run by ``timeit``: ``{type}`` is the callable that builds the
+# haystack and ``{size}`` the number of doubles loaded from ``selected.arr``.
+SETUP = '''
+import array
+selected = array.array('d')
+with open('selected.arr', 'rb') as fp:
+    selected.fromfile(fp, {size})
+haystack = {type}(selected)
+# print('haystack: %10d' % len(haystack), end='  ')
+needles = array.array('d')
+with open('not_selected.arr', 'rb') as fp:
+    needles.fromfile(fp, 500)
+needles.extend(selected[:500])
+needles = set(needles)
+# print(' needles: %10d' % len(needles), end='  ')
+'''
+
+# (name, statement) pairs; each statement checks that 500 needles are found
+tests = [
+('FOR_LOOP_TEST', '''
+found = 0
+for n in needles:
+    if n in haystack:
+        found += 1
+assert found == 500
+'''),
+('SET_&_TEST', '''
+found = len(needles & haystack)
+assert found == 500
+'''
+)]
+
+MAX_EXPONENT = 7
+for collection_type in 'dict.fromkeys set list'.split():
+    # only a set haystack gets the second test, which uses the & operator
+    available_tests = tests if collection_type == 'set' else tests[:1]
+    for test_name, test in available_tests:
+        print('*' * 25, collection_type, test_name)
+        for exponent in range(3, MAX_EXPONENT + 1):
+            size = 10**exponent
+            setup_code = SETUP.format(type=collection_type, size=size)
+            timings = timeit.repeat(stmt=test, setup=setup_code,
+                                    repeat=5, number=1)
+            best = min(timings)  # fastest of the 5 single runs
+            print('|{:{}d}|{:9.6f}'.format(size, MAX_EXPONENT + 1, best))
--- a/dicts/strkeydict.py
+++ b/dicts/strkeydict.py
@@ -0,0 +1,88 @@
+"""StrKeyDict always converts non-string keys to `str`
+
+Tests for item retrieval using `d[key]` notation::
+
+    >>> d = StrKeyDict([('2', 'two'), ('4', 'four')])
+    >>> d['2']
+    'two'
+    >>> d[4]
+    'four'
+    >>> d[1]
+    Traceback (most recent call last):
+      ...
+    KeyError: '1'
+
+Tests for the `in` operator::
+
+    >>> 2 in d
+    True
+    >>> 1 in d
+    False
+
+Test for item assignment using non-string key::
+
+    >>> d[0] = 'zero'
+    >>> d['0']
+    'zero'
+
+Tests for update using a `dict` or a sequence of pairs::
+
+    >>> d.update({6:'six', '8':'eight'})
+    >>> sorted(d.keys())
+    ['0', '2', '4', '6', '8']
+    >>> d.update([(10, 'ten'), ('12', 'twelve')])
+    >>> sorted(d.keys())
+    ['0', '10', '12', '2', '4', '6', '8']
+    >>> d.update([1, 3, 5])  # doctest: +IGNORE_EXCEPTION_DETAIL
+    Traceback (most recent call last):
+      ...
+    TypeError: 'int' object is not iterable
+
+Tests for deletion using non-string keys::
+
+    >>> del d[0]
+    >>> d.pop(10)
+    'ten'
+    >>> sorted(d.keys())
+    ['12', '2', '4', '6', '8']
+
+"""
+# BEGIN STRKEYDICT
+
+import collections
+import collections.abc
+
+
+class StrKeyDict(collections.UserDict):  # <1>
+    """Mapping that stores every key as `str` and accepts any key on lookup."""
+
+    def __missing__(self, key):  # <2>
+        """Look up `str(key)` when a non-`str` key is missing."""
+        if isinstance(key, str):
+            raise KeyError(key)
+        return self[str(key)]
+
+    def __contains__(self, key):
+        return str(key) in self.data  # <3>
+
+    def __setitem__(self, key, item):
+        self.data[str(key)] = item   # <4>
+
+    def __delitem__(self, key):
+        # keys are stored as str, so deletion (and therefore `pop`) must
+        # convert the key like the other methods do
+        del self.data[str(key)]
+
+    def update(self, iterable=None, **kwds):
+        """Add items from a mapping or an iterable of pairs, then from `kwds`."""
+        if iterable is not None:
+            if isinstance(iterable, collections.abc.Mapping):  # <5>
+                pairs = iterable.items()
+            else:
+                pairs = iterable  # <6>
+            for key, value in pairs:
+                self[key] = value  # <7>
+        if kwds:
+            self.update(kwds)  # <8>
+
+# END STRKEYDICT
--- a/dicts/strkeydict0.py
+++ b/dicts/strkeydict0.py
@@ -0,0 +1,58 @@
+"""StrKeyDict0 converts non-string keys to `str` on lookup
+
+# BEGIN STRKEYDICT0_TESTS
+
+Tests for item retrieval using `d[key]` notation::
+
+    >>> d = StrKeyDict0([('2', 'two'), ('4', 'four')])
+    >>> d['2']
+    'two'
+    >>> d[4]
+    'four'
+    >>> d[1]
+    Traceback (most recent call last):
+      ...
+    KeyError: '1'
+
+Tests for item retrieval using `d.get(key)` notation::
+
+    >>> d.get('2')
+    'two'
+    >>> d.get(4)
+    'four'
+    >>> d.get(1, 'N/A')
+    'N/A'
+
+Tests for the `in` operator::
+
+    >>> 2 in d
+    True
+    >>> 1 in d
+    False
+
+# END STRKEYDICT0_TESTS
+"""
+
+# BEGIN STRKEYDICT0
+
+class StrKeyDict0(dict):  # <1>
+    """`dict` that retries a failed lookup with the key converted to `str`."""
+
+    def __missing__(self, key):
+        if isinstance(key, str):  # <2>
+            raise KeyError(key)
+        return self[str(key)]  # <3>
+
+    def get(self, key, default=None):
+        # dict.get never calls __missing__, so delegate to self[key] to give
+        # get() the same key conversion as the d[key] notation
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def __contains__(self, key):
+        # look in self.keys(): `str(key) in self` would call this method again
+        return key in self.keys() or str(key) in self.keys()  # <4>
+
+# END STRKEYDICT0