updated chapter 4 and appendix-A files

This commit is contained in:
Luciano Ramalho 2020-01-22 22:52:23 -03:00
parent a1d6c125bf
commit 49f52e29c7
25 changed files with 5219 additions and 10 deletions

28
04-text-byte/charfinder/cf.py Executable file
View File

@ -0,0 +1,28 @@
#!/usr/bin/env python3
import sys
import unicodedata
FIRST, LAST = ord(' '), sys.maxunicode # <1>
def find(*query_words, first=FIRST, last=LAST): # <2>
query = {w.upper() for w in query_words} # <3>
count = 0
for code in range(first, last + 1):
char = chr(code) # <4>
name = unicodedata.name(char, None) # <5>
if name and query.issubset(name.split()): # <6>
print(f'U+{code:04X}\t{char}\t{name}') # <7>
count += 1
print(f'({count} found)')
def main(words):
if words:
find(*words)
else:
print('Please provide words to find.')
if __name__ == '__main__':
main(sys.argv[1:])

View File

@ -0,0 +1,36 @@
Doctests for ``cf.py``
======================
How to run the tests
----------------------
Run the ``doctest`` module from the command line::
$ python3 -m doctest cf_tests.rst
Tests
-----
Import functions for testing::
>>> from cf import find, main
Test ``find`` with single result::
>>> find("sign", "registered") # doctest:+NORMALIZE_WHITESPACE
U+00AE ® REGISTERED SIGN
(1 found)
Test ``find`` with two results::
>>> find("chess", "queen", last=0xFFFF) # doctest:+NORMALIZE_WHITESPACE
U+2655 ♕ WHITE CHESS QUEEN
U+265B ♛ BLACK CHESS QUEEN
(2 found)
Test ``main`` with no words::
>>> main([])
Please provide words to find.

View File

@ -0,0 +1,2 @@
#!/bin/bash
python3 -m doctest cf_tests.rst $1

View File

@ -1,4 +1,4 @@
# BEGIN NUMERICS_DEMO
# tag::NUMERICS_DEMO[]
import unicodedata
import re
@ -15,4 +15,4 @@ for char in sample:
format(unicodedata.numeric(char), '5.2f'), # <6>
unicodedata.name(char), # <7>
sep='\t')
# END NUMERICS_DEMO
# end::NUMERICS_DEMO[]

View File

@ -1,4 +1,4 @@
# BEGIN RE_DEMO
# tag::RE_DEMO[]
import re
re_numbers_str = re.compile(r'\d+') # <1>
@ -18,4 +18,4 @@ print(' bytes:', re_numbers_bytes.findall(text_bytes)) # <7>
print('Words')
print(' str :', re_words_str.findall(text_str)) # <8>
print(' bytes:', re_words_bytes.findall(text_bytes)) # <9>
# END RE_DEMO
# end::RE_DEMO[]

View File

@ -28,7 +28,7 @@ Handling a string with Greek and Latin accented characters:
"""
# BEGIN SHAVE_MARKS
# tag::SHAVE_MARKS[]
import unicodedata
import string
@ -39,9 +39,9 @@ def shave_marks(txt):
shaved = ''.join(c for c in norm_txt
if not unicodedata.combining(c)) # <2>
return unicodedata.normalize('NFC', shaved) # <3>
# END SHAVE_MARKS
# end::SHAVE_MARKS[]
# BEGIN SHAVE_MARKS_LATIN
# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt):
"""Remove all diacritic marks from Latin base characters"""
norm_txt = unicodedata.normalize('NFD', txt) # <1>
@ -56,9 +56,9 @@ def shave_marks_latin(txt):
latin_base = c in string.ascii_letters
shaved = ''.join(keepers)
return unicodedata.normalize('NFC', shaved) # <5>
# END SHAVE_MARKS_LATIN
# end::SHAVE_MARKS_LATIN[]
# BEGIN ASCIIZE
# tag::ASCIIZE[]
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""", # <1>
"""'f"*^<''""---~>""")
@ -84,4 +84,4 @@ def asciize(txt):
no_marks = shave_marks_latin(dewinize(txt)) # <5>
no_marks = no_marks.replace('ß', 'ss') # <6>
return unicodedata.normalize('NFKC', no_marks) # <7>
# END ASCIIZE
# end::ASCIIZE[]

12
04-text-byte/skin.py Executable file
View File

@ -0,0 +1,12 @@
from unicodedata import name
SKIN1 = 0x1F3FB # EMOJI MODIFIER FITZPATRICK TYPE-1-2 # <1>
SKINS = [chr(i) for i in range(SKIN1, SKIN1 + 5)] # <2>
THUMB = '\U0001F44d' # THUMBS UP SIGN 👍
examples = [THUMB] # <3>
examples.extend(THUMB + skin for skin in SKINS) # <4>
for example in examples:
print(example, end='\t') # <5>
print(' + '.join(name(char) for char in example)) # <6>

View File

@ -0,0 +1,6 @@
# REGIONAL INDICATOR SYMBOLS
RIS_A = '\U0001F1E6' # LETTER A
RIS_U = '\U0001F1FA' # LETTER U
print(RIS_A + RIS_U) # AU: Australia
print(RIS_U + RIS_A) # UA: Ukraine
print(RIS_A + RIS_A) # AA: no such country

View File

@ -0,0 +1,28 @@
from unicodedata import name
zwg_sample = """
1F468 200D 1F9B0 |man: red hair |E11.0
1F9D1 200D 1F91D 200D 1F9D1 |people holding hands |E12.0
1F3CA 1F3FF 200D 2640 FE0F |woman swimming: dark skin tone |E4.0
1F469 1F3FE 200D 2708 FE0F |woman pilot: medium-dark skin tone |E4.0
1F468 200D 1F469 200D 1F467 |family: man, woman, girl |E2.0
1F3F3 FE0F 200D 26A7 FE0F |transgender flag |E13.0
1F469 200D 2764 FE0F 200D 1F48B 200D 1F469 |kiss: woman, woman |E2.0
"""
markers = {'\u200D': 'ZWG', # ZERO WIDTH JOINER
'\uFE0F': 'V16', # VARIATION SELECTOR-16
}
for line in zwg_sample.strip().split('\n'):
code, descr, version = (s.strip() for s in line.split('|'))
chars = [chr(int(c, 16)) for c in code.split()]
print(''.join(chars), version, descr, sep='\t', end='')
while chars:
char = chars.pop(0)
if char in markers:
print(' + ' + markers[char], end='')
else:
ucode = f'U+{ord(char):04X}'
print(f'\n\t{char}\t{ucode}\t{name(char)}', end='')
print()

55
appendix-A/arcfour.py Normal file
View File

@ -0,0 +1,55 @@
"""RC4 compatible algorithm"""
def arcfour(key, in_bytes, loops=20):
kbox = bytearray(256) # create key box
for i, car in enumerate(key): # copy key and vector
kbox[i] = car
j = len(key)
for i in range(j, 256): # repeat until full
kbox[i] = kbox[i-j]
# [1] initialize sbox
sbox = bytearray(range(256))
# repeat sbox mixing loop, as recommened in CipherSaber-2
# http://ciphersaber.gurus.com/faq.html#cs2
j = 0
for k in range(loops):
for i in range(256):
j = (j + sbox[i] + kbox[i]) % 256
sbox[i], sbox[j] = sbox[j], sbox[i]
# main loop
i = 0
j = 0
out_bytes = bytearray()
for car in in_bytes:
i = (i + 1) % 256
# [2] shuffle sbox
j = (j + sbox[i]) % 256
sbox[i], sbox[j] = sbox[j], sbox[i]
# [3] compute t
t = (sbox[i] + sbox[j]) % 256
k = sbox[t]
car = car ^ k
out_bytes.append(car)
return out_bytes
def test():
from time import time
clear = bytearray(b'1234567890' * 100000)
t0 = time()
cipher = arcfour(b'key', clear)
print('elapsed time: %.2fs' % (time() - t0))
result = arcfour(b'key', cipher)
assert result == clear, '%r != %r' % (result, clear)
print('elapsed time: %.2fs' % (time() - t0))
print('OK')
if __name__ == '__main__':
test()

View File

@ -0,0 +1,46 @@
import sys
import time
from concurrent import futures
from random import randrange
from arcfour import arcfour
JOBS = 12
SIZE = 2**18
KEY = b"'Twas brillig, and the slithy toves\nDid gyre"
STATUS = '{} workers, elapsed time: {:.2f}s'
def arcfour_test(size, key):
in_text = bytearray(randrange(256) for i in range(size))
cypher_text = arcfour(key, in_text)
out_text = arcfour(key, cypher_text)
assert in_text == out_text, 'Failed arcfour_test'
return size
def main(workers=None):
if workers:
workers = int(workers)
t0 = time.time()
with futures.ProcessPoolExecutor(workers) as executor:
actual_workers = executor._max_workers
to_do = []
for i in range(JOBS, 0, -1):
size = SIZE + int(SIZE / JOBS * (i - JOBS/2))
job = executor.submit(arcfour_test, size, KEY)
to_do.append(job)
for future in futures.as_completed(to_do):
res = future.result()
print('{:.1f} KB'.format(res/2**10))
print(STATUS.format(actual_workers, time.time() - t0))
if __name__ == '__main__':
if len(sys.argv) == 2:
workers = int(sys.argv[1])
else:
workers = None
main(workers)

3771
appendix-A/bottle.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,76 @@
"""
Container ``in`` operator performance test
"""
import sys
import timeit
SETUP = '''
import array
selected = array.array('d')
with open('selected.arr', 'rb') as fp:
selected.fromfile(fp, {size})
if {test_type} is dict:
haystack = dict.fromkeys(selected, 1)
else:
haystack = {test_type}(selected)
if {verbose}:
print(type(haystack), end=' ')
print('haystack: %10d' % len(haystack), end=' ')
needles = array.array('d')
with open('not_selected.arr', 'rb') as fp:
needles.fromfile(fp, 500)
needles.extend(selected[::{size}//500])
if {verbose}:
print(' needles: %10d' % len(needles), end=' ')
'''
SETUP_NEEDLE_SET = '''
needle_set = set(needles)
if {verbose}:
print(' needle_set: %10d' % len(needle_set), end=' ')
'''
TEST = '''
found = 0
for n in needles:
if n in haystack:
found += 1
if {verbose}:
print(' found: %10d' % found)
'''
TEST_INTERSECT = '''
found = len(needle_set & haystack)
if {verbose}:
print(' found: %10d' % found)
'''
def test(test_type, verbose):
MAX_EXPONENT = 7
if test_type == 'intersect':
test_type = 'set'
setup_template = SETUP + SETUP_NEEDLE_SET
test_template = TEST_INTERSECT
else:
setup_template = SETUP
test_template = TEST
for n in range(3, MAX_EXPONENT + 1):
size = 10**n
setup = setup_template.format(test_type=test_type,
size=size, verbose=verbose)
test = test_template.format(verbose=verbose)
tt = timeit.repeat(stmt=test, setup=setup, repeat=5, number=1)
print('|{:{}d}|{:f}'.format(size, MAX_EXPONENT + 1, min(tt)))
if __name__=='__main__':
if '-v' in sys.argv:
sys.argv.remove('-v')
verbose = True
else:
verbose = False
if len(sys.argv) != 2:
print('Usage: %s <test_type>' % sys.argv[0])
else:
test(sys.argv[1], verbose)

View File

@ -0,0 +1,37 @@
"""
Generate data for container performance test
"""
import random
import array
MAX_EXPONENT = 7
HAYSTACK_LEN = 10 ** MAX_EXPONENT
NEEDLES_LEN = 10 ** (MAX_EXPONENT - 1)
SAMPLE_LEN = HAYSTACK_LEN + NEEDLES_LEN // 2
needles = array.array('d')
sample = {1/random.random() for i in range(SAMPLE_LEN)}
print('initial sample: %d elements' % len(sample))
# complete sample, in case duplicate random numbers were discarded
while len(sample) < SAMPLE_LEN:
sample.add(1/random.random())
print('complete sample: %d elements' % len(sample))
sample = array.array('d', sample)
random.shuffle(sample)
not_selected = sample[:NEEDLES_LEN // 2]
print('not selected: %d samples' % len(not_selected))
print(' writing not_selected.arr')
with open('not_selected.arr', 'wb') as fp:
not_selected.tofile(fp)
selected = sample[NEEDLES_LEN // 2:]
print('selected: %d samples' % len(selected))
print(' writing selected.arr')
with open('selected.arr', 'wb') as fp:
selected.tofile(fp)

87
appendix-A/emoji_families.py Executable file
View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3.8
import sys
import io
from itertools import product
from unicodedata import name, unidata_version
from bottle import route, run, response, template
ZWJ = '\u200D' # ZERO WIDTH JOINER
ADULTS = '\N{MAN}\N{WOMAN}'
CHILDREN = '\N{BOY}\N{GIRL}'
adults = list(ADULTS) + list(product(ADULTS, repeat=2))
children = list(CHILDREN) + list(product(CHILDREN, repeat=2))
def matrix():
for kids in children:
row = []
suffix = ZWJ + ZWJ.join(kids)
for p in adults:
row.append(ZWJ.join(p)+suffix)
yield row
def report():
out = io.StringIO()
print('Python', '.'.join(str(n) for n in sys.version_info[:3]), file=out)
print('Unicode', unidata_version, file=out)
print(end='\t', file=out)
for parents in adults:
print('+'.join(parents), end='\t', file=out)
print(file=out)
for kids, row in zip(children, matrix()):
print('+'.join(kids), end='\t', file=out)
for cell in row:
print(cell, end='\t', file=out)
print(file=out)
return out.getvalue()
TABLE = """
<html>
<style>
table {text-align: center;
border-collapse: collapse;}
td {border: 1px solid gray;}
th {padding: 0.5ex;}
</style>
<table ">
<tr>
<th></th>
% for parents in adults:
<th>{{'+'.join(parents)}}</th>
% end
</tr>
% for kids, row in zip(children, matrix()):
<tr>
<th>{{'+'.join(kids)}}</th>
% for cell in row:
<td>{{cell}}</td>
% end
</tr>
% end
</table>
</html>
"""
@route('/')
def index():
response.content_type = 'text/html; charset=utf-8'
return template(TABLE, **globals())
if __name__ == '__main__':
run(host='localhost', port=8080)

149
appendix-A/flags2_common.py Normal file
View File

@ -0,0 +1,149 @@
"""Utilities for second set of flag examples.
"""
import os
import time
import sys
import string
import argparse
from collections import namedtuple
from enum import Enum
Result = namedtuple('Result', 'status data')
HTTPStatus = Enum('Status', 'ok not_found error')
POP20_CC = ('CN IN US ID BR PK NG BD RU JP '
'MX PH VN ET EG DE IR TR CD FR').split()
DEFAULT_CONCUR_REQ = 1
MAX_CONCUR_REQ = 1
SERVERS = {
'REMOTE': 'http://flupy.org/data/flags',
'LOCAL': 'http://localhost:8001/flags',
'DELAY': 'http://localhost:8002/flags',
'ERROR': 'http://localhost:8003/flags',
}
DEFAULT_SERVER = 'LOCAL'
DEST_DIR = 'downloads/'
COUNTRY_CODES_FILE = 'country_codes.txt'
def save_flag(img, filename):
path = os.path.join(DEST_DIR, filename)
with open(path, 'wb') as fp:
fp.write(img)
def initial_report(cc_list, actual_req, server_label):
if len(cc_list) <= 10:
cc_msg = ', '.join(cc_list)
else:
cc_msg = 'from {} to {}'.format(cc_list[0], cc_list[-1])
print('{} site: {}'.format(server_label, SERVERS[server_label]))
msg = 'Searching for {} flag{}: {}'
plural = 's' if len(cc_list) != 1 else ''
print(msg.format(len(cc_list), plural, cc_msg))
plural = 's' if actual_req != 1 else ''
msg = '{} concurrent connection{} will be used.'
print(msg.format(actual_req, plural))
def final_report(cc_list, counter, start_time):
elapsed = time.time() - start_time
print('-' * 20)
msg = '{} flag{} downloaded.'
plural = 's' if counter[HTTPStatus.ok] != 1 else ''
print(msg.format(counter[HTTPStatus.ok], plural))
if counter[HTTPStatus.not_found]:
print(counter[HTTPStatus.not_found], 'not found.')
if counter[HTTPStatus.error]:
plural = 's' if counter[HTTPStatus.error] != 1 else ''
print('{} error{}.'.format(counter[HTTPStatus.error], plural))
print('Elapsed time: {:.2f}s'.format(elapsed))
def expand_cc_args(every_cc, all_cc, cc_args, limit):
codes = set()
A_Z = string.ascii_uppercase
if every_cc:
codes.update(a+b for a in A_Z for b in A_Z)
elif all_cc:
with open(COUNTRY_CODES_FILE) as fp:
text = fp.read()
codes.update(text.split())
else:
for cc in (c.upper() for c in cc_args):
if len(cc) == 1 and cc in A_Z:
codes.update(cc+c for c in A_Z)
elif len(cc) == 2 and all(c in A_Z for c in cc):
codes.add(cc)
else:
msg = 'each CC argument must be A to Z or AA to ZZ.'
raise ValueError('*** Usage error: '+msg)
return sorted(codes)[:limit]
def process_args(default_concur_req):
server_options = ', '.join(sorted(SERVERS))
parser = argparse.ArgumentParser(
description='Download flags for country codes. '
'Default: top 20 countries by population.')
parser.add_argument('cc', metavar='CC', nargs='*',
help='country code or 1st letter (eg. B for BA...BZ)')
parser.add_argument('-a', '--all', action='store_true',
help='get all available flags (AD to ZW)')
parser.add_argument('-e', '--every', action='store_true',
help='get flags for every possible code (AA...ZZ)')
parser.add_argument('-l', '--limit', metavar='N', type=int,
help='limit to N first codes', default=sys.maxsize)
parser.add_argument('-m', '--max_req', metavar='CONCURRENT', type=int,
default=default_concur_req,
help='maximum concurrent requests (default={})'
.format(default_concur_req))
parser.add_argument('-s', '--server', metavar='LABEL',
default=DEFAULT_SERVER,
help='Server to hit; one of {} (default={})'
.format(server_options, DEFAULT_SERVER))
parser.add_argument('-v', '--verbose', action='store_true',
help='output detailed progress info')
args = parser.parse_args()
if args.max_req < 1:
print('*** Usage error: --max_req CONCURRENT must be >= 1')
parser.print_usage()
sys.exit(1)
if args.limit < 1:
print('*** Usage error: --limit N must be >= 1')
parser.print_usage()
sys.exit(1)
args.server = args.server.upper()
if args.server not in SERVERS:
print('*** Usage error: --server LABEL must be one of',
server_options)
parser.print_usage()
sys.exit(1)
try:
cc_list = expand_cc_args(args.every, args.all, args.cc, args.limit)
except ValueError as exc:
print(exc.args[0])
parser.print_usage()
sys.exit(1)
if not cc_list:
cc_list = sorted(POP20_CC)
return args, cc_list
def main(download_many, default_concur_req, max_concur_req):
args, cc_list = process_args(default_concur_req)
actual_req = min(args.max_req, max_concur_req, len(cc_list))
initial_report(cc_list, actual_req, args.server)
base_url = SERVERS[args.server]
t0 = time.time()
counter = download_many(cc_list, base_url, args.verbose, actual_req)
assert sum(counter.values()) == len(cc_list), \
'some downloads are unaccounted for'
final_report(cc_list, counter, t0)

View File

@ -0,0 +1,87 @@
"""Download flags of countries (with error handling).
Sequential version
Sample run::
$ python3 flags2_sequential.py -s DELAY b
DELAY site: http://localhost:8002/flags
Searching for 26 flags: from BA to BZ
1 concurrent connection will be used.
--------------------
17 flags downloaded.
9 not found.
Elapsed time: 13.36s
"""
import collections
import requests
import tqdm
from flags2_common import main, save_flag, HTTPStatus, Result
DEFAULT_CONCUR_REQ = 1
MAX_CONCUR_REQ = 1
# BEGIN FLAGS2_BASIC_HTTP_FUNCTIONS
def get_flag(base_url, cc):
url = '{}/{cc}/{cc}.gif'.format(base_url, cc=cc.lower())
resp = requests.get(url)
if resp.status_code != 200:
resp.raise_for_status()
return resp.content
def download_one(cc, base_url, verbose=False):
try:
image = get_flag(base_url, cc)
except requests.exceptions.HTTPError as exc:
res = exc.response
if res.status_code == 404:
status = HTTPStatus.not_found
msg = 'not found'
else:
raise
else:
save_flag(image, cc.lower() + '.gif')
status = HTTPStatus.ok
msg = 'OK'
if verbose:
print(cc, msg)
return Result(status, cc)
# END FLAGS2_BASIC_HTTP_FUNCTIONS
# BEGIN FLAGS2_DOWNLOAD_MANY_SEQUENTIAL
def download_many(cc_list, base_url, verbose, max_req):
counter = collections.Counter()
cc_iter = sorted(cc_list)
if not verbose:
cc_iter = tqdm.tqdm(cc_iter)
for cc in cc_iter:
try:
res = download_one(cc, base_url, verbose)
except requests.exceptions.HTTPError as exc:
error_msg = 'HTTP error {res.status_code} - {res.reason}'
error_msg = error_msg.format(res=exc.response)
except requests.exceptions.ConnectionError as exc:
error_msg = 'Connection error'
else:
error_msg = ''
status = res.status
if error_msg:
status = HTTPStatus.error
counter[status] += 1
if verbose and error_msg:
print('*** Error for {}: {}'.format(cc, error_msg))
return counter
# END FLAGS2_DOWNLOAD_MANY_SEQUENTIAL
if __name__ == '__main__':
main(download_many, DEFAULT_CONCUR_REQ, MAX_CONCUR_REQ)

20
appendix-A/hashdiff.py Normal file
View File

@ -0,0 +1,20 @@
import sys
MAX_BITS = len(format(sys.maxsize, 'b'))
print('%s-bit Python build' % (MAX_BITS + 1))
def hash_diff(o1, o2):
h1 = '{:>0{}b}'.format(hash(o1), MAX_BITS)
h2 = '{:>0{}b}'.format(hash(o2), MAX_BITS)
diff = ''.join('!' if b1 != b2 else ' ' for b1, b2 in zip(h1, h2))
count = '!= {}'.format(diff.count('!'))
width = max(len(repr(o1)), len(repr(o2)), 8)
sep = '-' * (width * 2 + MAX_BITS)
return '{!r:{width}} {}\n{:{width}} {} {}\n{!r:{width}} {}\n{}'.format(
o1, h1, ' ' * width, diff, count, o2, h2, sep, width=width)
if __name__ == '__main__':
print(hash_diff(1, 1.0))
print(hash_diff(1.0, 1.0001))
print(hash_diff(1.0001, 1.0002))
print(hash_diff(1.0002, 1.0003))

263
appendix-A/isis2json.py Normal file
View File

@ -0,0 +1,263 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# isis2json.py: convert ISIS and ISO-2709 files to JSON
#
# Copyright (C) 2010 BIREME/PAHO/WHO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 2.1 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
############################
# tag::ISIS2JSON[]
# this script works with Python or Jython (versions >=2.5 and <3)
import sys
import argparse
from uuid import uuid4
import os
try:
import json
except ImportError:
if os.name == 'java': # running Jython
from com.xhaus.jyson import JysonCodec as json
else:
import simplejson as json
SKIP_INACTIVE = True
DEFAULT_QTY = 2**31
ISIS_MFN_KEY = 'mfn'
ISIS_ACTIVE_KEY = 'active'
SUBFIELD_DELIMITER = '^'
INPUT_ENCODING = 'cp1252'
def iter_iso_records(iso_file_name, isis_json_type): # <1>
from iso2709 import IsoFile
from subfield import expand
iso = IsoFile(iso_file_name)
for record in iso:
fields = {}
for field in record.directory:
field_key = str(int(field.tag)) # remove leading zeroes
field_occurrences = fields.setdefault(field_key, [])
content = field.value.decode(INPUT_ENCODING, 'replace')
if isis_json_type == 1:
field_occurrences.append(content)
elif isis_json_type == 2:
field_occurrences.append(expand(content))
elif isis_json_type == 3:
field_occurrences.append(dict(expand(content)))
else:
raise NotImplementedError('ISIS-JSON type %s conversion '
'not yet implemented for .iso input' % isis_json_type)
yield fields
iso.close()
def iter_mst_records(master_file_name, isis_json_type): # <2>
try:
from bruma.master import MasterFactory, Record
except ImportError:
print('IMPORT ERROR: Jython 2.5 and Bruma.jar '
'are required to read .mst files')
raise SystemExit
mst = MasterFactory.getInstance(master_file_name).open()
for record in mst:
fields = {}
if SKIP_INACTIVE:
if record.getStatus() != Record.Status.ACTIVE:
continue
else: # save status only there are non-active records
fields[ISIS_ACTIVE_KEY] = (record.getStatus() ==
Record.Status.ACTIVE)
fields[ISIS_MFN_KEY] = record.getMfn()
for field in record.getFields():
field_key = str(field.getId())
field_occurrences = fields.setdefault(field_key, [])
if isis_json_type == 3:
content = {}
for subfield in field.getSubfields():
subfield_key = subfield.getId()
if subfield_key == '*':
content['_'] = subfield.getContent()
else:
subfield_occurrences = content.setdefault(subfield_key, [])
subfield_occurrences.append(subfield.getContent())
field_occurrences.append(content)
elif isis_json_type == 1:
content = []
for subfield in field.getSubfields():
subfield_key = subfield.getId()
if subfield_key == '*':
content.insert(0, subfield.getContent())
else:
content.append(SUBFIELD_DELIMITER + subfield_key +
subfield.getContent())
field_occurrences.append(''.join(content))
else:
raise NotImplementedError('ISIS-JSON type %s conversion '
'not yet implemented for .mst input' % isis_json_type)
yield fields
mst.close()
def write_json(input_gen, file_name, output, qty, skip, id_tag, # <3>
gen_uuid, mongo, mfn, isis_json_type, prefix,
constant):
start = skip
end = start + qty
if id_tag:
id_tag = str(id_tag)
ids = set()
else:
id_tag = ''
for i, record in enumerate(input_gen):
if i >= end:
break
if not mongo:
if i == 0:
output.write('[')
elif i > start:
output.write(',')
if start <= i < end:
if id_tag:
occurrences = record.get(id_tag, None)
if occurrences is None:
msg = 'id tag #%s not found in record %s'
if ISIS_MFN_KEY in record:
msg = msg + (' (mfn=%s)' % record[ISIS_MFN_KEY])
raise KeyError(msg % (id_tag, i))
if len(occurrences) > 1:
msg = 'multiple id tags #%s found in record %s'
if ISIS_MFN_KEY in record:
msg = msg + (' (mfn=%s)' % record[ISIS_MFN_KEY])
raise TypeError(msg % (id_tag, i))
else: # ok, we have one and only one id field
if isis_json_type == 1:
id = occurrences[0]
elif isis_json_type == 2:
id = occurrences[0][0][1]
elif isis_json_type == 3:
id = occurrences[0]['_']
if id in ids:
msg = 'duplicate id %s in tag #%s, record %s'
if ISIS_MFN_KEY in record:
msg = msg + (' (mfn=%s)' % record[ISIS_MFN_KEY])
raise TypeError(msg % (id, id_tag, i))
record['_id'] = id
ids.add(id)
elif gen_uuid:
record['_id'] = unicode(uuid4())
elif mfn:
record['_id'] = record[ISIS_MFN_KEY]
if prefix:
# iterate over a fixed sequence of tags
for tag in tuple(record):
if str(tag).isdigit():
record[prefix+tag] = record[tag]
del record[tag] # this is why we iterate over a tuple
# with the tags, and not directly on the record dict
if constant:
constant_key, constant_value = constant.split(':')
record[constant_key] = constant_value
output.write(json.dumps(record).encode('utf-8'))
output.write('\n')
if not mongo:
output.write(']\n')
def main(): # <4>
# create the parser
parser = argparse.ArgumentParser(
description='Convert an ISIS .mst or .iso file to a JSON array')
# add the arguments
parser.add_argument(
'file_name', metavar='INPUT.(mst|iso)',
help='.mst or .iso file to read')
parser.add_argument(
'-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
metavar='OUTPUT.json',
help='the file where the JSON output should be written'
' (default: write to stdout)')
parser.add_argument(
'-c', '--couch', action='store_true',
help='output array within a "docs" item in a JSON document'
' for bulk insert to CouchDB via POST to db/_bulk_docs')
parser.add_argument(
'-m', '--mongo', action='store_true',
help='output individual records as separate JSON dictionaries, one'
' per line for bulk insert to MongoDB via mongoimport utility')
parser.add_argument(
'-t', '--type', type=int, metavar='ISIS_JSON_TYPE', default=1,
help='ISIS-JSON type, sets field structure: 1=string, 2=alist,'
' 3=dict (default=1)')
parser.add_argument(
'-q', '--qty', type=int, default=DEFAULT_QTY,
help='maximum quantity of records to read (default=ALL)')
parser.add_argument(
'-s', '--skip', type=int, default=0,
help='records to skip from start of .mst (default=0)')
parser.add_argument(
'-i', '--id', type=int, metavar='TAG_NUMBER', default=0,
help='generate an "_id" from the given unique TAG field number'
' for each record')
parser.add_argument(
'-u', '--uuid', action='store_true',
help='generate an "_id" with a random UUID for each record')
parser.add_argument(
'-p', '--prefix', type=str, metavar='PREFIX', default='',
help='concatenate prefix to every numeric field tag'
' (ex. 99 becomes "v99")')
parser.add_argument(
'-n', '--mfn', action='store_true',
help='generate an "_id" from the MFN of each record'
' (available only for .mst input)')
parser.add_argument(
'-k', '--constant', type=str, metavar='TAG:VALUE', default='',
help='Include a constant tag:value in every record (ex. -k type:AS)')
'''
# TODO: implement this to export large quantities of records to CouchDB
parser.add_argument(
'-r', '--repeat', type=int, default=1,
help='repeat operation, saving multiple JSON files'
' (default=1, use -r 0 to repeat until end of input)')
'''
# parse the command line
args = parser.parse_args()
if args.file_name.lower().endswith('.mst'):
input_gen_func = iter_mst_records # <5>
else:
if args.mfn:
print('UNSUPORTED: -n/--mfn option only available for .mst input.')
raise SystemExit
input_gen_func = iter_iso_records # <6>
input_gen = input_gen_func(args.file_name, args.type) # <7>
if args.couch:
args.out.write('{ "docs" : ')
write_json(input_gen, args.file_name, args.out, args.qty, # <8>
args.skip, args.id, args.uuid, args.mongo, args.mfn,
args.type, args.prefix, args.constant)
if args.couch:
args.out.write('}\n')
args.out.close()
if __name__ == '__main__':
main()
# end::ISIS2JSON[]

24
appendix-A/mem_test.py Normal file
View File

@ -0,0 +1,24 @@
import importlib
import sys
import resource
NUM_VECTORS = 10**7
if len(sys.argv) == 2:
module_name = sys.argv[1].replace('.py', '')
module = importlib.import_module(module_name)
else:
print('Usage: {} <vector-module-to-test>'.format())
sys.exit(1)
fmt = 'Selected Vector2d type: {.__name__}.{.__name__}'
print(fmt.format(module, module.Vector2d))
mem_init = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print('Creating {:,} Vector2d instances'.format(NUM_VECTORS))
vectors = [module.Vector2d(3.0, 4.0) for i in range(NUM_VECTORS)]
mem_final = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print('Initial RAM usage: {:14,}'.format(mem_init))
print(' Final RAM usage: {:14,}'.format(mem_final))

132
appendix-A/schedule2.py Normal file
View File

@ -0,0 +1,132 @@
"""
schedule2.py: traversing OSCON schedule data
>>> import shelve
>>> db = shelve.open(DB_NAME)
>>> if CONFERENCE not in db: load_db(db)
# BEGIN SCHEDULE2_DEMO
>>> DbRecord.set_db(db)
>>> event = DbRecord.fetch('event.33950')
>>> event
<Event 'There *Will* Be Bugs'>
>>> event.venue
<DbRecord serial='venue.1449'>
>>> event.venue.name
'Portland 251'
>>> for spkr in event.speakers:
... print('{0.serial}: {0.name}'.format(spkr))
...
speaker.3471: Anna Martelli Ravenscroft
speaker.5199: Alex Martelli
# END SCHEDULE2_DEMO
>>> db.close()
"""
# BEGIN SCHEDULE2_RECORD
import warnings
import inspect
import osconfeed
DB_NAME = 'data/schedule2_db'
CONFERENCE = 'conference.115'
class Record:
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
def __eq__(self, other):
if isinstance(other, Record):
return self.__dict__ == other.__dict__
else:
return NotImplemented
# END SCHEDULE2_RECORD
# BEGIN SCHEDULE2_DBRECORD
class MissingDatabaseError(RuntimeError):
"""Raised when a database is required but was not set."""
class DbRecord(Record):
__db = None
@staticmethod
def set_db(db):
DbRecord.__db = db
@staticmethod
def get_db():
return DbRecord.__db
@classmethod
def fetch(cls, ident):
db = cls.get_db()
try:
return db[ident]
except TypeError:
if db is None:
msg = "database not set; call '{}.set_db(my_db)'"
raise MissingDatabaseError(msg.format(cls.__name__))
else:
raise
def __repr__(self):
if hasattr(self, 'serial'):
cls_name = self.__class__.__name__
return '<{} serial={!r}>'.format(cls_name, self.serial)
else:
return super().__repr__()
# END SCHEDULE2_DBRECORD
# BEGIN SCHEDULE2_EVENT
class Event(DbRecord):
@property
def venue(self):
key = 'venue.{}'.format(self.venue_serial)
return self.__class__.fetch(key)
@property
def speakers(self):
if not hasattr(self, '_speaker_objs'):
spkr_serials = self.__dict__['speakers']
fetch = self.__class__.fetch
self._speaker_objs = [fetch('speaker.{}'.format(key))
for key in spkr_serials]
return self._speaker_objs
def __repr__(self):
if hasattr(self, 'name'):
cls_name = self.__class__.__name__
return '<{} {!r}>'.format(cls_name, self.name)
else:
return super().__repr__()
# END SCHEDULE2_EVENT
# BEGIN SCHEDULE2_LOAD
def load_db(db):
raw_data = osconfeed.load()
warnings.warn('loading ' + DB_NAME)
for collection, rec_list in raw_data['Schedule'].items():
record_type = collection[:-1]
cls_name = record_type.capitalize()
cls = globals().get(cls_name, DbRecord)
if inspect.isclass(cls) and issubclass(cls, DbRecord):
factory = cls
else:
factory = DbRecord
for record in rec_list:
key = '{}.{}'.format(record_type, record['serial'])
record['serial'] = key
db[key] = factory(**record)
# END SCHEDULE2_LOAD

38
appendix-A/sha_futures.py Normal file
View File

@ -0,0 +1,38 @@
import sys
import time
import hashlib
from concurrent import futures
from random import randrange
JOBS = 12
SIZE = 2**20
STATUS = '{} workers, elapsed time: {:.2f}s'
def sha(size):
data = bytearray(randrange(256) for i in range(size))
algo = hashlib.new('sha256')
algo.update(data)
return algo.hexdigest()
def main(workers=None):
if workers:
workers = int(workers)
t0 = time.time()
with futures.ProcessPoolExecutor(workers) as executor:
actual_workers = executor._max_workers
to_do = (executor.submit(sha, SIZE) for i in range(JOBS))
for future in futures.as_completed(to_do):
res = future.result()
print(res)
print(STATUS.format(actual_workers, time.time() - t0))
if __name__ == '__main__':
if len(sys.argv) == 2:
workers = int(sys.argv[1])
else:
workers = None
main(workers)

203
appendix-A/taxi_sim.py Normal file
View File

@ -0,0 +1,203 @@
"""
Taxi simulator
==============
Driving a taxi from the console::
>>> from taxi_sim import taxi_process
>>> taxi = taxi_process(ident=13, trips=2, start_time=0)
>>> next(taxi)
Event(time=0, proc=13, action='leave garage')
>>> taxi.send(_.time + 7)
Event(time=7, proc=13, action='pick up passenger')
>>> taxi.send(_.time + 23)
Event(time=30, proc=13, action='drop off passenger')
>>> taxi.send(_.time + 5)
Event(time=35, proc=13, action='pick up passenger')
>>> taxi.send(_.time + 48)
Event(time=83, proc=13, action='drop off passenger')
>>> taxi.send(_.time + 1)
Event(time=84, proc=13, action='going home')
>>> taxi.send(_.time + 10)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
StopIteration
Sample run with two cars, random seed 10. This is a valid doctest::
>>> main(num_taxis=2, seed=10)
taxi: 0 Event(time=0, proc=0, action='leave garage')
taxi: 0 Event(time=5, proc=0, action='pick up passenger')
taxi: 1 Event(time=5, proc=1, action='leave garage')
taxi: 1 Event(time=10, proc=1, action='pick up passenger')
taxi: 1 Event(time=15, proc=1, action='drop off passenger')
taxi: 0 Event(time=17, proc=0, action='drop off passenger')
taxi: 1 Event(time=24, proc=1, action='pick up passenger')
taxi: 0 Event(time=26, proc=0, action='pick up passenger')
taxi: 0 Event(time=30, proc=0, action='drop off passenger')
taxi: 0 Event(time=34, proc=0, action='going home')
taxi: 1 Event(time=46, proc=1, action='drop off passenger')
taxi: 1 Event(time=48, proc=1, action='pick up passenger')
taxi: 1 Event(time=110, proc=1, action='drop off passenger')
taxi: 1 Event(time=139, proc=1, action='pick up passenger')
taxi: 1 Event(time=140, proc=1, action='drop off passenger')
taxi: 1 Event(time=150, proc=1, action='going home')
*** end of events ***
See longer sample run at the end of this module.
"""
import random
import collections
import queue
import argparse
import time
DEFAULT_NUMBER_OF_TAXIS = 3
DEFAULT_END_TIME = 180
SEARCH_DURATION = 5
TRIP_DURATION = 20
DEPARTURE_INTERVAL = 5
Event = collections.namedtuple('Event', 'time proc action')
# BEGIN TAXI_PROCESS
def taxi_process(ident, trips, start_time=0):
"""Yield to simulator issuing event at each state change"""
time = yield Event(start_time, ident, 'leave garage')
for i in range(trips):
time = yield Event(time, ident, 'pick up passenger')
time = yield Event(time, ident, 'drop off passenger')
yield Event(time, ident, 'going home')
# end of taxi process
# END TAXI_PROCESS
# BEGIN TAXI_SIMULATOR
class Simulator:
def __init__(self, procs_map):
self.events = queue.PriorityQueue()
self.procs = dict(procs_map)
def run(self, end_time):
"""Schedule and display events until time is up"""
# schedule the first event for each cab
for _, proc in sorted(self.procs.items()):
first_event = next(proc)
self.events.put(first_event)
# main loop of the simulation
sim_time = 0
while sim_time < end_time:
if self.events.empty():
print('*** end of events ***')
break
current_event = self.events.get()
sim_time, proc_id, previous_action = current_event
print('taxi:', proc_id, proc_id * ' ', current_event)
active_proc = self.procs[proc_id]
next_time = sim_time + compute_duration(previous_action)
try:
next_event = active_proc.send(next_time)
except StopIteration:
del self.procs[proc_id]
else:
self.events.put(next_event)
else:
msg = '*** end of simulation time: {} events pending ***'
print(msg.format(self.events.qsize()))
# END TAXI_SIMULATOR
def compute_duration(previous_action):
"""Compute action duration using exponential distribution"""
if previous_action in ['leave garage', 'drop off passenger']:
# new state is prowling
interval = SEARCH_DURATION
elif previous_action == 'pick up passenger':
# new state is trip
interval = TRIP_DURATION
elif previous_action == 'going home':
interval = 1
else:
raise ValueError('Unknown previous_action: %s' % previous_action)
return int(random.expovariate(1/interval)) + 1
def main(end_time=DEFAULT_END_TIME, num_taxis=DEFAULT_NUMBER_OF_TAXIS,
seed=None):
"""Initialize random generator, build procs and run simulation"""
if seed is not None:
random.seed(seed) # get reproducible results
taxis = {i: taxi_process(i, (i+1)*2, i*DEPARTURE_INTERVAL)
for i in range(num_taxis)}
sim = Simulator(taxis)
sim.run(end_time)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Taxi fleet simulator.')
parser.add_argument('-e', '--end-time', type=int,
default=DEFAULT_END_TIME,
help='simulation end time; default = %s'
% DEFAULT_END_TIME)
parser.add_argument('-t', '--taxis', type=int,
default=DEFAULT_NUMBER_OF_TAXIS,
help='number of taxis running; default = %s'
% DEFAULT_NUMBER_OF_TAXIS)
parser.add_argument('-s', '--seed', type=int, default=None,
help='random generator seed (for testing)')
args = parser.parse_args()
main(args.end_time, args.taxis, args.seed)
"""
Sample run from the command line, seed=3, maximum elapsed time=120::
# BEGIN TAXI_SAMPLE_RUN
$ python3 taxi_sim.py -s 3 -e 120
taxi: 0 Event(time=0, proc=0, action='leave garage')
taxi: 0 Event(time=2, proc=0, action='pick up passenger')
taxi: 1 Event(time=5, proc=1, action='leave garage')
taxi: 1 Event(time=8, proc=1, action='pick up passenger')
taxi: 2 Event(time=10, proc=2, action='leave garage')
taxi: 2 Event(time=15, proc=2, action='pick up passenger')
taxi: 2 Event(time=17, proc=2, action='drop off passenger')
taxi: 0 Event(time=18, proc=0, action='drop off passenger')
taxi: 2 Event(time=18, proc=2, action='pick up passenger')
taxi: 2 Event(time=25, proc=2, action='drop off passenger')
taxi: 1 Event(time=27, proc=1, action='drop off passenger')
taxi: 2 Event(time=27, proc=2, action='pick up passenger')
taxi: 0 Event(time=28, proc=0, action='pick up passenger')
taxi: 2 Event(time=40, proc=2, action='drop off passenger')
taxi: 2 Event(time=44, proc=2, action='pick up passenger')
taxi: 1 Event(time=55, proc=1, action='pick up passenger')
taxi: 1 Event(time=59, proc=1, action='drop off passenger')
taxi: 0 Event(time=65, proc=0, action='drop off passenger')
taxi: 1 Event(time=65, proc=1, action='pick up passenger')
taxi: 2 Event(time=65, proc=2, action='drop off passenger')
taxi: 2 Event(time=72, proc=2, action='pick up passenger')
taxi: 0 Event(time=76, proc=0, action='going home')
taxi: 1 Event(time=80, proc=1, action='drop off passenger')
taxi: 1 Event(time=88, proc=1, action='pick up passenger')
taxi: 2 Event(time=95, proc=2, action='drop off passenger')
taxi: 2 Event(time=97, proc=2, action='pick up passenger')
taxi: 2 Event(time=98, proc=2, action='drop off passenger')
taxi: 1 Event(time=106, proc=1, action='drop off passenger')
taxi: 2 Event(time=109, proc=2, action='going home')
taxi: 1 Event(time=110, proc=1, action='going home')
*** end of events ***
# END TAXI_SAMPLE_RUN
"""

View File

@ -0,0 +1,37 @@
import shelve
import pytest
import schedule1 as schedule
@pytest.yield_fixture
def db():
with shelve.open(schedule.DB_NAME) as the_db:
if schedule.CONFERENCE not in the_db:
schedule.load_db(the_db)
yield the_db
def test_record_class():
rec = schedule.Record(spam=99, eggs=12)
assert rec.spam == 99
assert rec.eggs == 12
def test_conference_record(db):
assert schedule.CONFERENCE in db
def test_speaker_record(db):
speaker = db['speaker.3471']
assert speaker.name == 'Anna Martelli Ravenscroft'
def test_event_record(db):
event = db['event.33950']
assert event.name == 'There *Will* Be Bugs'
def test_event_venue(db):
event = db['event.33950']
assert event.venue_serial == 1449

View File

@ -0,0 +1,72 @@
import shelve
import pytest
import schedule2 as schedule
@pytest.yield_fixture
def db():
with shelve.open(schedule.DB_NAME) as the_db:
if schedule.CONFERENCE not in the_db:
schedule.load_db(the_db)
yield the_db
def test_record_attr_access():
rec = schedule.Record(spam=99, eggs=12)
assert rec.spam == 99
assert rec.eggs == 12
def test_record_repr():
rec = schedule.DbRecord(spam=99, eggs=12)
assert 'DbRecord object at 0x' in repr(rec)
rec2 = schedule.DbRecord(serial=13)
assert repr(rec2) == "<DbRecord serial=13>"
def test_conference_record(db):
assert schedule.CONFERENCE in db
def test_speaker_record(db):
speaker = db['speaker.3471']
assert speaker.name == 'Anna Martelli Ravenscroft'
def test_missing_db_exception():
with pytest.raises(schedule.MissingDatabaseError):
schedule.DbRecord.fetch('venue.1585')
def test_dbrecord(db):
schedule.DbRecord.set_db(db)
venue = schedule.DbRecord.fetch('venue.1585')
assert venue.name == 'Exhibit Hall B'
def test_event_record(db):
event = db['event.33950']
assert repr(event) == "<Event 'There *Will* Be Bugs'>"
def test_event_venue(db):
schedule.Event.set_db(db)
event = db['event.33950']
assert event.venue_serial == 1449
assert event.venue == db['venue.1449']
assert event.venue.name == 'Portland 251'
def test_event_speakers(db):
schedule.Event.set_db(db)
event = db['event.33950']
assert len(event.speakers) == 2
anna_and_alex = [db['speaker.3471'], db['speaker.5199']]
assert event.speakers == anna_and_alex
def test_event_no_speakers(db):
schedule.Event.set_db(db)
event = db['event.36848']
assert len(event.speakers) == 0