update from Atlas with major reorg

2015-04-17 21:29:30 -03:00
parent 57902d31b5
commit a786180239
134 changed files with 369 additions and 520 deletions
--- a/attic/concurrency/wikipedia/README.rst
+++ b/attic/concurrency/wikipedia/README.rst
@@ -0,0 +1,138 @@
+====================================
+Configuring a local test environment
+====================================
+
+tl;dr;
+======
+
+This text explains how to configure **nginx** and **vaurien** to build
+a local mirror of the data to run the Wikipedia Picture of the Day
+examples while avoiding network traffic and introducing controlled
+delays and errors for testing, thanks to the **vaurien** proxy.
+
+
+Rationale and overview
+======================
+
+The Wikipedia Picture of the Day examples are designed to demonstrate
+the performance of different approaches to finding and downloading
+images from the Wikipedia. However, we don't want to hit the Wikipedia
+with multiple requests per second while testing, and we want to be
+able to simulate high latency and random network errors.
+
+For this setup I chose **nginx** as the HTTP server because it is very
+fast and easy to configure, and the **vaurien** proxy because it was
+designed by Mozilla to introduce delays and network errors for testing.
+
+The initial fixture data, ``docroot.zip``, contains a directory
+``docroot/Template-POTD/`` with 1096 small text files, each consisting
+of an HTML fragment (just a ``src="..."`` attribute) or an error message
+(for days when no picture was published, like 2013-09-12). These files
+correspond to every day of the years 2012, 2013 and 2014. The year 2012
+was a leap year, that's why there are 1096 files and not 1095.
+
+Once these files are unpacked to the ``docroot/Template-POTD`` directory
+and **nginx** is configured, the ``build_fixture.py`` script can fetch the
+actual images from the Wikipedia for local storage in the directory
+``docroot/wikimedia/``.
+
+When that is done you can configure **nginx** and **vaurien** to experiment
+with the ``daypicts*.py`` examples without hitting the network.
+
+
+Instructions
+============
+
+1. Unpack test data
+-------------------
+
+Unpack the initial data in the ``fixture/`` directory and verify that 1096
+files were created in ``fixture/docroot/Template-POTD/``::
+
+    $ ls  # inside the fixture/ directory
+    README.rst  docroot.zip
+    $ unzip docroot.zip
+    ... many lines omitted...
+    inflating: docroot/Template-POTD/2014-12-29
+    inflating: docroot/Template-POTD/2014-12-30
+    inflating: docroot/Template-POTD/2014-12-31
+    $ ls docroot/Template-POTD/ | wc -w
+    1096
+
+
+2. Install **nginx**
+--------------------
+
+Download and install **nginx**. I used version 1.6.2 -- the latest
+stable version as I write this.
+
+- Download page: http://nginx.org/en/download.html
+
+- Beginner's guide: http://nginx.org/en/docs/beginners_guide.html
+
+
+3. Configure **nginx**
+----------------------
+
+Edit the ``nginx.conf`` file to set the port and document root.
+The file is usually found in ``/usr/local/nginx/conf``, ``/etc/nginx``,
+or ``/usr/local/etc/nginx``.
+
+Most of the content in ``nginx.conf`` is within a block labeled ``http``
+and enclosed in curly braces. Within that block there can be multiple
+blocks labeled ``server``. Add another ``server`` block like this one::
+
+    server {
+        listen       8001;
+
+        location / {
+            root   /full-path-to.../fixture/docroot;
+        }
+    }
+
+After editing ``nginx.conf`` the server must be started (if it's not
+running) or told to reload the configuration file::
+
+    $ nginx  # to start, if necessary
+    $ nginx -s reload  # to reload the configuration
+
+To test the configuration, open the URL below in a browser. Doing so
+will download a small file named ``2014-01-01`` with an HTML fragment::
+
+    http://localhost:8001/Template-POTD/2014-01-01
+
+If the test fails, please double check the procedure just described and
+refer to the **nginx** documentation.
+
+
+Platform-specific instructions
+==============================
+
+Nginx setup on Mac OS X
+-----------------------
+
+Homebrew (copy & paste code at the bottom of http://brew.sh/)::
+
+  $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+  $ brew doctor
+  $ brew install nginx
+
+Download and unpack:
+
+- Docroot is: ``/usr/local/var/www``
+- Configuration file: ``/usr/local/etc/nginx/nginx.conf``
+
+To have launchd start nginx at login::
+
+    ln -sfv /usr/local/opt/nginx/*.plist ~/Library/LaunchAgents
+
+Then to load nginx now::
+
+    launchctl load ~/Library/LaunchAgents/homebrew.mxcl.nginx.plist
+
+Or, if you don't want/need launchctl, you can just run::
+
+    nginx
+
+
+
+Nginx setup on Lubuntu 14.04.1 LTS
+----------------------------------
+
+Docroot is: /usr/share/nginx/html
+
--- a/attic/concurrency/wikipedia/build_fixture.py
+++ b/attic/concurrency/wikipedia/build_fixture.py
@@ -0,0 +1,97 @@
+import sys
+import argparse
+import os
+import urllib
+
+import requests
+
+from daypicts import get_picture_url, get_picture_urls
+from daypicts import validate_date, gen_dates, picture_type
+from daypicts import NoPictureForDate
+from daypicts import REMOTE_PICT_BASE_URL, PICT_EXCEPTIONS
+
+FIXTURE_DOC_DIR = 'fixture/docroot/'
+FIXTURE_TEMPLATE_POTD_DIR = FIXTURE_DOC_DIR + 'Template-POTD/'
+
+def parse_args(argv):
+    """Parse the command line and expand the date argument into days.
+
+    ``argv`` is the argument list without the program name. Returns a
+    ``(dates, args)`` tuple: the ISO dates produced by ``gen_dates`` and
+    the ``argparse`` namespace. When ``validate_date`` raises
+    ``ValueError`` the error and the usage line are printed and the
+    process exits with status 2.
+    """
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument(
+        'date', help='YYYY-MM-DD or YYYY-MM or YYYY: year, month and day')
+    cli.add_argument('-u', '--url_only', action='store_true',
+                     help='get picture URLS only')
+    args = cli.parse_args(argv)
+
+    try:
+        iso_parts = validate_date(args.date)
+    except ValueError as exc:
+        print('error:', exc.args[0])
+        cli.print_usage()
+        sys.exit(2)
+
+    dates = list(gen_dates(iso_parts))
+    if len(dates) != 1:
+        summary = '-> {} days: {}...{}'
+        print(summary.format(len(dates), dates[0], dates[-1]))
+    else:
+        print('-> Date: ', dates[0])
+
+    return dates, args
+
+
+def save_picture_urls(dates, save_path):
+    """Write one snippet file per date into ``save_path``.
+
+    The snippet is a ``src="//..."`` attribute built from the picture
+    URL, or the ``repr`` of ``NoPictureForDate`` when no picture is found.
+    Each date and snippet is also printed.
+    """
+    for iso_date in dates:
+        try:
+            picture_url = get_picture_url(iso_date)
+            snippet = picture_url.replace('http://', 'src="//') + '"'
+        except NoPictureForDate as exc:
+            snippet = repr(exc)
+        print(iso_date, end=' ')
+        print(snippet)
+        with open(os.path.join(save_path, iso_date), 'w') as out:
+            out.write(snippet)
+
+
+def save_pictures(dates, save_path, verbose=False):
+    """Download the pictures for ``dates`` into the fixture tree.
+
+    Each picture is written under ``save_path`` at the path formed by its
+    URL with ``REMOTE_PICT_BASE_URL`` removed and percent-escapes decoded.
+    Returns the list of URLs that were saved (the original always
+    returned an empty list because nothing was ever appended).
+    """
+    # imported here so the submodule is loaded explicitly: a bare
+    # ``import urllib`` does not guarantee ``urllib.parse`` is available
+    from urllib.parse import unquote
+
+    urls_ok = []
+    for date, url in get_picture_urls(dates, verbose):
+        response = requests.get(url)
+        file_path = unquote(
+            os.path.join(save_path, url.replace(REMOTE_PICT_BASE_URL, '')))
+        octets = response.content
+
+        # dates in PICT_EXCEPTIONS are not checked as images,
+        # e.g. http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
+        if date not in PICT_EXCEPTIONS:
+            assert picture_type(octets) is not None, url
+
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, 'wb') as fp:
+            fp.write(octets)
+
+        urls_ok.append(url)
+        print(file_path)
+    return urls_ok
+
+
+def main(argv):
+    """Build test fixture from Wikipedia "POTD" data"""
+    # NOTE: the docstring above is also passed to argparse as the
+    # program description, so its text is user-visible
+
+    # make sure the template directory is there before anything is saved
+    try:
+        os.makedirs(FIXTURE_TEMPLATE_POTD_DIR)
+    except FileExistsError:
+        pass
+
+    dates, args = parse_args(argv)
+
+    if not args.url_only:
+        save_pictures(dates, FIXTURE_DOC_DIR)
+        return
+    save_picture_urls(dates, FIXTURE_TEMPLATE_POTD_DIR)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
--- a/attic/concurrency/wikipedia/daypicts.py
+++ b/attic/concurrency/wikipedia/daypicts.py
@@ -0,0 +1,227 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Note:
+The earliest Pictures of the Day I've found are in this page:
+
+http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/May_2004
+
+However, I have not found Template:POTD/YYYY-MM-DD pages earlier
+than this:
+
+http://en.wikipedia.org/wiki/Template:POTD/2007-01-01
+
+For simplicity, this script only retrieves pictures starting
+from 2007-01-01.
+
+"""
+import sys
+import argparse
+import re
+import time
+import datetime
+import os
+import imghdr
+import warnings
+
+import requests
+
+# directory where get_pictures() stores downloaded files
+SAVE_DIR = 'downloaded/'
+
+# port used only by the commented-out local (127.0.0.1) base URLs below
+HTTP_PORT = 8002
+# the ISO date is appended to this prefix to form the template page URL
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+#POTD_BASE_URL = 'http://127.0.0.1:{}/Template-POTD/'.format(HTTP_PORT)
+
+REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
+#LOCAL_PICT_BASE_URL = 'http://127.0.0.1:{}/'.format(HTTP_PORT)
+LOCAL_PICT_BASE_URL = REMOTE_PICT_BASE_URL
+PICT_BASE_URL = REMOTE_PICT_BASE_URL
+
+# captures the protocol-relative URL of the first src="//upload. ..." attribute
+POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
+# earliest date accepted by validate_date()
+PODT_EARLIEST_TEMPLATE = '2007-01-01'
+
+RE_YEAR = r'([12]\d{3})'
+RE_MONTH = RE_YEAR + r'-([01]\d)'
+RE_DATE = RE_MONTH + r'-([0-3]\d)'
+# strftime/strptime format for YYYY-MM-DD dates
+ISO_DATE_FMT = '%Y-%m-%d'
+
+# dates whose downloaded file is not checked with picture_type()
+PICT_EXCEPTIONS = {
+        '2013-06-15', # .webm movie [1]
+    }
+
+#[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
+
+# raised by get_picture_url() when the page has no POTD_IMAGE_RE match
+class NoPictureForDate(Exception):
+    '''No Picture of the Day found for {iso_date}'''
+
+
+# raised by validate_date() for dates before PODT_EARLIEST_TEMPLATE;
+# being a ValueError, it is handled by parse_args() like any bad date
+class NoPictureTemplateBefore(ValueError):
+    '''Template:POTD did not exist before PODT_EARLIEST_TEMPLATE'''
+
+
+def get_picture_url(iso_date):
+    """Return the picture URL found in the POTD template page of ``iso_date``.
+
+    The page URL is printed before it is fetched. Raises
+    ``NoPictureForDate`` when the page text has no ``POTD_IMAGE_RE`` match.
+    """
+    template_url = POTD_BASE_URL + iso_date
+    print(template_url)
+    html = requests.get(template_url).text
+    match = POTD_IMAGE_RE.search(html)
+    if match is not None:
+        # the captured src is protocol-relative: prepend the scheme
+        return 'http:' + match.group(1)
+    raise NoPictureForDate(iso_date)
+
+
+def validate_date(text):
+    """Validate a ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` string.
+
+    Returns the normalized ISO text with the same precision as the input
+    (e.g. ``'2014-6'`` becomes ``'2014-06'``).
+
+    Raises ``ValueError`` when the text is not one to three integers
+    separated by ``-`` or does not name a real date, and
+    ``NoPictureTemplateBefore`` (a ``ValueError``) when the date is
+    earlier than ``PODT_EARLIEST_TEMPLATE``.
+    """
+    format_msg = 'date must use YYYY, YYYY-MM or YYYY-MM-DD format'
+    try:
+        parts = [int(part) for part in text.split('-')]
+    except ValueError:
+        raise ValueError(format_msg) from None
+    # more than three parts used to reach datetime.date() and fail with a
+    # TypeError, which callers catching ValueError do not handle
+    if len(parts) > 3:
+        raise ValueError(format_msg)
+
+    # pad missing month/day with 1 just to check the date is real;
+    # datetime.date raises ValueError for an impossible month or day
+    padded = parts + [1] * (3 - len(parts))
+    date = datetime.date(*padded)
+
+    # compare numerically and at the precision given: as strings,
+    # '2007' < '2007-01-01' is true, which wrongly rejected the year 2007
+    # and the month 2007-01
+    earliest = [int(part) for part in PODT_EARLIEST_TEMPLATE.split('-')]
+    if parts < earliest[:len(parts)]:
+        raise NoPictureTemplateBefore(PODT_EARLIEST_TEMPLATE)
+
+    # keep 4, 7 or 10 characters according to the number of parts given
+    return date.strftime(ISO_DATE_FMT)[:1 + len(parts) * 3]
+
+
+def gen_month_dates(iso_month):
+    """Yield every day of ``iso_month`` ('YYYY-MM') as a 'YYYY-MM-DD' string."""
+    first_day = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT).date()
+    day = first_day
+    # walk forward one day at a time until the month rolls over
+    while day.month == first_day.month:
+        yield day.strftime(ISO_DATE_FMT)
+        day = day + datetime.timedelta(days=1)
+
+
+def gen_year_dates(iso_year):
+    """Yield every day of ``iso_year`` ('YYYY'), month by month."""
+    for month in range(1, 13):
+        iso_month = iso_year + '-' + format(month, '02d')
+        yield from gen_month_dates(iso_month)
+
+
+def gen_dates(iso_parts):
+    """Yield the days covered by ``iso_parts``.
+
+    A 4-character value is treated as a year, a 7-character value as a
+    month; anything else is yielded unchanged as a single date.
+    """
+    size = len(iso_parts)
+    if size == 7:
+        yield from gen_month_dates(iso_parts)
+    elif size == 4:
+        yield from gen_year_dates(iso_parts)
+    else:
+        yield iso_parts
+
+
+def get_picture_urls(dates, verbose=False):
+    """Look up the picture URL of each date, one after the other.
+
+    Returns a list of ``(date, url)`` pairs; dates for which
+    ``get_picture_url`` raises ``NoPictureForDate`` are left out. Each
+    URL found is printed: in full, or as a running count plus the last
+    path segment when ``verbose`` is true.
+    """
+    found = []
+    for iso_date in dates:
+        try:
+            url = get_picture_url(iso_date)
+        except NoPictureForDate as exc:
+            if verbose:
+                print('*** {!r} ***'.format(exc))
+        else:
+            if verbose:
+                print(format(len(found) + 1, '3d'), end=' ')
+                print(url.split('/')[-1])
+            else:
+                print(url)
+            found.append((iso_date, url))
+    return found
+
+
+def picture_type(octets):
+    """Return the image type of ``octets`` or ``None`` if unrecognized.
+
+    ``imghdr.what`` is tried first; content it does not recognize is
+    reported as ``'svg'`` when it starts with ``<``, has ``<svg`` within
+    the first 200 bytes and ends (ignoring trailing whitespace) with
+    ``</svg>``.
+    """
+    detected = imghdr.what(None, octets)
+    if detected is not None:
+        return detected
+    looks_like_svg = (octets.startswith(b'<') and
+                      b'<svg' in octets[:200] and
+                      octets.rstrip().endswith(b'</svg>'))
+    return 'svg' if looks_like_svg else None
+
+
+def get_pictures(dates, verbose=False):
+    """Download the picture of each date into ``SAVE_DIR/<year>/``.
+
+    URLs come from ``get_picture_urls``. A response whose status is not
+    200 produces a warning and is skipped. Returns the list of URLs
+    whose content was saved; each saved path is printed.
+    """
+    saved_urls = []
+    try:
+        os.makedirs(SAVE_DIR)
+    except FileExistsError:
+        pass
+    for iso_date, url in get_picture_urls(dates, verbose):
+        if PICT_BASE_URL == LOCAL_PICT_BASE_URL:
+            url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
+        response = requests.get(url)
+        if response.status_code != 200:
+            warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
+            continue
+        octets = response.content
+        # dates listed in PICT_EXCEPTIONS skip the image-type check
+        if iso_date not in PICT_EXCEPTIONS:
+            assert picture_type(octets) is not None, url
+        # files are grouped by year; only the base name of the URL is kept
+        year_dir = os.path.join(SAVE_DIR, iso_date.split('-')[0])
+        base_name = os.path.basename(url.replace(PICT_BASE_URL, ''))
+        target = os.path.join(year_dir, base_name)
+        try:
+            os.makedirs(year_dir)
+        except FileExistsError:
+            pass
+        with open(target, 'wb') as out:
+            out.write(octets)
+        saved_urls.append(url)
+        print(target)
+    return saved_urls
+
+
+def parse_args(argv):
+    """Parse the command line and expand the date argument into days.
+
+    ``argv`` is the argument list without the program name. Returns a
+    ``(dates, args)`` tuple: the ISO dates produced by ``gen_dates`` and
+    the ``argparse`` namespace. When ``validate_date`` raises
+    ``ValueError`` the error and the usage line are printed and the
+    process exits with status 2.
+    """
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument(
+        'date', help='YYYY-MM-DD or YYYY-MM or YYYY: year, month and day')
+    cli.add_argument('-q', '--max_qty', type=int,
+                     help='maximum number of items to fetch')
+    cli.add_argument('-u', '--url_only', action='store_true',
+                     help='get picture URLS only')
+    cli.add_argument('-f', '--fixture_save', action='store_true',
+                     help='save data for local test fixture')
+    cli.add_argument('-v', '--verbose', action='store_true',
+                     help='display progress information')
+    args = cli.parse_args(argv)
+
+    try:
+        iso_parts = validate_date(args.date)
+    except ValueError as exc:
+        print('error:', exc.args[0])
+        cli.print_usage()
+        sys.exit(2)
+
+    dates = list(gen_dates(iso_parts))
+    # the date summary is shown only in verbose mode
+    if args.verbose:
+        if len(dates) != 1:
+            summary = '-> {} days: {}...{}'
+            print(summary.format(len(dates), dates[0], dates[-1]))
+        else:
+            print('-> Date: ', dates[0])
+
+    return dates, args
+
+
+def main(argv, get_picture_urls):
+    """Get Wikipedia "Picture of The Day" for date, month or year"""
+    # NOTE: the docstring above is also passed to argparse as the
+    # program description, so its text is user-visible.
+    # ``get_picture_urls`` is the URL-listing strategy supplied by the
+    # calling script; it is used only with --url_only.
+
+    dates, args = parse_args(argv)
+
+    started = time.time()
+    fetch = get_picture_urls if args.url_only else get_pictures
+    urls = fetch(dates, args.verbose)
+    elapsed = time.time() - started
+
+    if args.verbose:
+        print('-> found: {} pictures | elapsed time: {:.2f}s'
+              .format(len(urls), elapsed))
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], get_picture_urls)
--- a/attic/concurrency/wikipedia/daypicts_asyncio.py
+++ b/attic/concurrency/wikipedia/daypicts_asyncio.py
@@ -0,0 +1,62 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import sys
+import asyncio
+import aiohttp
+
+from daypicts import main, NoPictureForDate
+from daypicts import POTD_BASE_URL, POTD_IMAGE_RE
+
+GLOBAL_TIMEOUT = 300  # seconds
+MAX_CONCURRENT_REQUESTS = 30
+
+
+@asyncio.coroutine
+def get_picture_url(iso_date, semaphore):
+    """Coroutine: return the picture URL from the POTD page of ``iso_date``.
+
+    The HTTP request and the reading of the body happen while holding
+    ``semaphore``. Raises ``NoPictureForDate`` when the page text has no
+    ``POTD_IMAGE_RE`` match.
+    """
+    template_url = POTD_BASE_URL+iso_date
+    with (yield from semaphore):
+        response = yield from aiohttp.request('GET', template_url)
+        html = yield from response.text()
+    match = POTD_IMAGE_RE.search(html)
+    if match is not None:
+        # the captured src is protocol-relative: prepend the scheme
+        return 'http:' + match.group(1)
+    raise NoPictureForDate(iso_date)
+
+
+@asyncio.coroutine
+def get_picture_urls(dates, verbose=False):
+    """Coroutine: look up the picture URLs of ``dates`` concurrently.
+
+    One ``get_picture_url`` coroutine is created per date, all sharing a
+    semaphore sized ``MAX_CONCURRENT_REQUESTS``. Results are handled in
+    completion order, with ``GLOBAL_TIMEOUT`` passed to ``as_completed``.
+    Returns the list of URLs found.
+    """
+    limiter = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
+    lookups = [get_picture_url(iso_date, limiter) for iso_date in dates]
+    urls = []
+    # get results as jobs are done
+    for job in asyncio.as_completed(lookups, timeout=GLOBAL_TIMEOUT):
+        try:
+            url = yield from job
+        except NoPictureForDate as exc:
+            if verbose:
+                print('*** {!r} ***'.format(exc))
+        except aiohttp.ClientResponseError as exc:
+            # reported even when not verbose
+            print('****** {!r} ******'.format(exc))
+        else:
+            if verbose:
+                print(format(len(urls) + 1, '3d'), end=' ')
+                print(url.split('/')[-1])
+            else:
+                print(url)
+            urls.append(url)
+    return urls
+
+
+def run_loop(dates, verbose=False):
+    """Run the ``get_picture_urls`` coroutine to completion; return its result."""
+    lookup = get_picture_urls(dates, verbose)
+    return asyncio.get_event_loop().run_until_complete(lookup)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], run_loop)
--- a/attic/concurrency/wikipedia/daypicts_threads.py
+++ b/attic/concurrency/wikipedia/daypicts_threads.py
@@ -0,0 +1,44 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import sys
+from concurrent import futures
+
+from daypicts import main, get_picture_url, NoPictureForDate
+
+GLOBAL_TIMEOUT = 300  # seconds
+MAX_CONCURRENT_REQUESTS = 30
+
+
+def get_picture_urls(dates, verbose=False):
+    """Look up the picture URLs of ``dates`` using a thread pool.
+
+    One ``get_picture_url`` call is submitted per date to a pool of
+    ``MAX_CONCURRENT_REQUESTS`` threads. Results are handled in
+    completion order, with ``GLOBAL_TIMEOUT`` passed to ``as_completed``.
+    Dates that raise ``NoPictureForDate`` are skipped. Returns the list
+    of URLs found.
+    """
+    urls = []
+    count = 0
+
+    # the ``with`` block shuts the executor down on the way out, including
+    # when an exception escapes; the original never released the pool
+    with futures.ThreadPoolExecutor(MAX_CONCURRENT_REQUESTS) as pool:
+        pending = [pool.submit(get_picture_url, date) for date in dates]
+
+        # get results as jobs are done
+        for job in futures.as_completed(pending, timeout=GLOBAL_TIMEOUT):
+            try:
+                url = job.result()
+            except NoPictureForDate as exc:
+                if verbose:
+                    print('*** {!r} ***'.format(exc))
+                continue
+            count += 1
+            if verbose:
+                print(format(count, '3d'), end=' ')
+                print(url.split('/')[-1])
+            else:
+                print(url)
+            urls.append(url)
+    return urls
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], get_picture_urls)
--- a/attic/concurrency/wikipedia/delay.sh
+++ b/attic/concurrency/wikipedia/delay.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Start a vaurien HTTP proxy listening on localhost:8002 that forwards to
+# the backend at localhost:8001, applying the "delay" behavior to 100% of
+# the traffic with a sleep of .1 (presumably seconds -- confirm in the
+# vaurien documentation).
+vaurien --protocol http --proxy localhost:8002 --backend localhost:8001 \
+    --behavior 100:delay --behavior-delay-sleep .1
--- a/attic/concurrency/wikipedia/fast_tests.sh
+++ b/attic/concurrency/wikipedia/fast_tests.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# run tests skipping @pytest.mark.network;
+# every extra command-line argument is forwarded to py.test unchanged
+# ("$@" keeps arguments with spaces intact and is not limited to three)
+py.test test_daypicts.py -m 'not network' "$@"
--- a/attic/concurrency/wikipedia/fixture/README.rst
+++ b/attic/concurrency/wikipedia/fixture/README.rst
@@ -0,0 +1,105 @@
+====================================
+Configuring a local test environment
+====================================
+
+tl;dr;
+======
+
+This text explains how to configure **nginx** and **vaurien** to build
+a local mirror of the data to run the Wikipedia Picture of the Day
+examples while avoiding network traffic and introducing controlled
+delays and errors for testing, thanks to the **vaurien** proxy.
+
+
+Rationale and overview
+======================
+
+The Wikipedia Picture of the Day examples are designed to demonstrate
+the performance of different approaches to finding and downloading
+images from the Wikipedia. However, we don't want to hit the Wikipedia
+with multiple requests per second while testing, and we want to be
+able to simulate high latency and random network errors.
+
+For this setup I chose **nginx** as the HTTP server because it is very
+fast and easy to configure, and the **vaurien** proxy because it was
+designed by Mozilla to introduce delays and network errors for testing.
+
+The initial fixture data, ``docroot.zip``, contains a directory
+``docroot/Template-POTD/`` with 1096 small text files, each consisting
+of an HTML fragment (just a ``src="..."`` attribute) or an error message
+(for days when no picture was published, like 2013-09-12). These files
+correspond to every day of the years 2012, 2013 and 2014. The year 2012
+was a leap year, that's why there are 1096 files and not 1095.
+
+Once these files are unpacked to the ``docroot/Template-POTD`` directory
+and **nginx** is configured, the ``build_fixture.py`` script can fetch the
+actual images from the Wikipedia for local storage in the directory
+``docroot/wikimedia/``.
+
+When that is done you can configure **nginx** and **vaurien** to experiment
+with the ``daypicts*.py`` examples without hitting the network.
+
+
+Instructions
+============
+
+1. Unpack test data
+-------------------
+
+Unpack the initial data in the ``fixture/`` directory and verify that 1096
+files were created in ``fixture/docroot/Template-POTD/``::
+
+    $ ls  # inside the fixture/ directory
+    README.rst  docroot.zip
+    $ unzip docroot.zip
+    ... many lines omitted...
+    inflating: docroot/Template-POTD/2014-12-29
+    inflating: docroot/Template-POTD/2014-12-30
+    inflating: docroot/Template-POTD/2014-12-31
+    $ ls docroot/Template-POTD/ | wc -w
+    1096
+
+
+2. Install **nginx**
+--------------------
+
+Download and install **nginx**. I used version 1.6.2 -- the latest
+stable version as I write this.
+
+- Download page: http://nginx.org/en/download.html
+
+- Beginner's guide: http://nginx.org/en/docs/beginners_guide.html
+
+
+3. Configure **nginx**
+----------------------
+
+Edit the ``nginx.conf`` file to set the port and document root.
+The file is usually found in ``/usr/local/nginx/conf``, ``/etc/nginx``,
+or ``/usr/local/etc/nginx``.
+
+Most of the content in ``nginx.conf`` is within a block labeled ``http``
+and enclosed in curly braces. Within that block there can be multiple
+blocks labeled ``server``. Add another ``server`` block like this one::
+
+    server {
+        listen       8001;
+
+        location / {
+            root   /full-path-to.../fixture/docroot;
+        }
+    }
+
+After editing ``nginx.conf`` the server must be started (if it's not
+running) or told to reload the configuration file::
+
+    $ nginx  # to start, if necessary
+    $ nginx -s reload  # to reload the configuration
+
+To test the configuration, open the URL below in a browser. Doing so
+will download a small file named ``2014-01-01`` with an HTML fragment::
+
+    http://localhost:8001/Template-POTD/2014-01-01
+
+If the test fails, please double check the procedure just described and
+refer to the **nginx** documentation.
--- a/attic/concurrency/wikipedia/fixture/docroot.zip
+++ b/attic/concurrency/wikipedia/fixture/docroot.zip
--- a/attic/concurrency/wikipedia/orig/README.rst
+++ b/attic/concurrency/wikipedia/orig/README.rst
@@ -0,0 +1,39 @@
+=====================================
+Wikipedia Picture of the Day examples
+=====================================
+
+These examples use various asynchronous programming techniques to download 
+images and metadata from the English Wikipedia `Picture of the Day`_ archive.
+
+.. _Picture of the Day: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/Archive
+
+
+--------
+Timings
+--------
+
+``sync.py``
+===========
+
+::
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m6.272s
+    user    0m0.065s
+    sys 0m0.039s
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m5.447s
+    user    0m0.068s
+    sys 0m0.040s
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m6.314s
+    user    0m0.068s
+    sys 0m0.040s
--- a/attic/concurrency/wikipedia/orig/futureprocs.py
+++ b/attic/concurrency/wikipedia/orig/futureprocs.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Save the pictures of ``year_month`` ('YYYY-MM') using a process pool.
+
+    ``potd.save_one`` is submitted once per day of the month. The outcome
+    of each job is printed as it completes. Returns
+    ``(img_count, total_size)`` for the jobs that succeeded.
+    """
+    year, month = [int(s) for s in year_month.split('-')]
+    dates = potd.list_days_of_month(year, month)
+    img_count = 0
+    total_size = 0
+
+    with futures.ProcessPoolExecutor(max_workers=100) as executor:
+        # map each future back to the date it was submitted for
+        downloads = {executor.submit(potd.save_one, date, verbose): date
+                     for date in dates}
+
+        for future in futures.as_completed(downloads):
+            date = downloads[future]
+            error = future.exception()
+            if error is None:
+                img_size = future.result()
+                total_size += img_size
+                img_count += 1
+                print('%r OK: %r' % (date, img_size))
+            else:
+                print('%r generated an exception: %s' % (date, error))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/attic/concurrency/wikipedia/orig/futurethreads.py
+++ b/attic/concurrency/wikipedia/orig/futurethreads.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Save the pictures of ``year_month`` ('YYYY-MM') using a thread pool.
+
+    ``potd.save_one`` is submitted once per day of the month. The outcome
+    of each job is printed as it completes. Returns
+    ``(img_count, total_size)`` for the jobs that succeeded.
+    """
+    year, month = [int(s) for s in year_month.split('-')]
+    dates = potd.list_days_of_month(year, month)
+    img_count = 0
+    total_size = 0
+
+    with futures.ThreadPoolExecutor(max_workers=100) as executor:
+        # map each future back to the date it was submitted for
+        downloads = {executor.submit(potd.save_one, date, verbose): date
+                     for date in dates}
+
+        for future in futures.as_completed(downloads):
+            date = downloads[future]
+            error = future.exception()
+            if error is None:
+                img_size = future.result()
+                total_size += img_size
+                img_count += 1
+                print('%r OK: %r' % (date, img_size))
+            else:
+                print('%r generated an exception: %s' % (date, error))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/attic/concurrency/wikipedia/orig/potd.py
+++ b/attic/concurrency/wikipedia/orig/potd.py
@@ -0,0 +1,100 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads metadata and
+images in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import calendar
+import datetime
+import re
+import os
+import io
+import time
+
+import requests
+
+import argparse
+
+# directory prefix used by build_save_path() for downloaded pictures
+SAVE_DIR = 'pictures/'
+# the ISO date is appended to this prefix to form the template page URL
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+class NoPictureForDate(Exception):
+    '''No Picture of the Day found for {day}'''
+    # the docstring above is also a runtime format template: save_one()
+    # builds its message with NoPictureForDate.__doc__.format(day=...)
+
+def build_page_url(iso_date):
+    """Return the POTD template page URL for ``iso_date``."""
+    return ''.join((POTD_BASE_URL, iso_date))
+
+def fetch(url):
+    """GET ``url`` with ``requests`` and return the response object."""
+    return requests.get(url)
+
+def extract_image_url(html):
+    """Return the absolute URL of the first ``src="//upload. ..."`` in ``html``.
+
+    Raises ``NoPictureForDate`` when ``html`` has no such attribute, so
+    that ``save_month`` skips the day like any other missing picture.
+    """
+    re_image = r'src="(//upload\..*?)"'
+    image_url = re.search(re_image, html)
+    if image_url is None:
+        # without this check the next line failed with an AttributeError
+        # on ``None``, which no caller handles
+        raise NoPictureForDate('no picture URL found in page')
+    # the captured src is protocol-relative: prepend the scheme
+    return 'http:' + image_url.group(1)
+
+def format_date(year, month, day):
+    """Return 'YEAR-MM-DD' with month and day zero-padded to two digits."""
+    fields = (format(year), format(month, '02d'), format(day, '02d'))
+    return '-'.join(fields)
+
+def list_days_of_month(year, month):
+    """Return the ISO dates of every day in the given month, in order."""
+    _, last_day = calendar.monthrange(year, month)
+    return [format_date(year, month, day) for day in range(1, last_day + 1)]
+
+def build_save_path(iso_date, url):
+    """Return ``SAVE_DIR/<iso_date>_<last component of url>``."""
+    filename = os.path.basename(url)
+    return os.path.join(SAVE_DIR, '_'.join((iso_date, filename)))
+
+def save_one(iso_date, verbose):
+    """Download and save the picture of the day for ``iso_date``.
+
+    Returns the size in bytes of the saved content. Raises
+    ``NoPictureForDate`` when the template page does not answer with
+    status 200. When ``verbose`` is true the target path is printed.
+    """
+    page_url = build_page_url(iso_date)
+    response = fetch(page_url)
+    if response.status_code != 200:
+        msg = NoPictureForDate.__doc__.format(day=iso_date)
+        raise NoPictureForDate(msg)
+    img_url = extract_image_url(response.text)
+    response = fetch(img_url)
+    path = build_save_path(iso_date, img_url)
+    # create the target directory on first use: without it the open()
+    # below fails while SAVE_DIR does not exist yet. exist_ok also makes
+    # this safe when several workers reach this point together.
+    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
+    if verbose:
+        print('saving: '+path)
+    with io.open(path, 'wb') as fp:
+        fp.write(response.content)
+    return len(response.content)
+
+def save_month(year_month, verbose):
+    """Save the pictures of ``year_month`` ('YYYY-MM'), one day at a time.
+
+    Days for which ``save_one`` raises ``NoPictureForDate`` are skipped.
+    Returns ``(img_count, total_size)``.
+    """
+    year, month = [int(s) for s in year_month.split('-')]
+    img_count = 0
+    total_size = 0
+
+    for iso_date in list_days_of_month(year, month):
+        try:
+            size = save_one(iso_date, verbose)
+        except NoPictureForDate:
+            continue
+        total_size += size
+        img_count += 1
+    return img_count, total_size
+
+def main(save_one=save_one, save_month=save_month):
+    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
+    # NOTE: the docstring above is also passed to argparse as the
+    # program description, so its text is user-visible.
+    # ``save_one`` and ``save_month`` can be replaced by the caller, as
+    # the futures-based scripts do with their own ``save_month``.
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
+    cli.add_argument('-q', '--max_qty', type=int,
+                     help='maximum number of files to download')
+    cli.add_argument('-v', '--verbose', action='store_true',
+                     help='display progress information')
+    args = cli.parse_args()
+
+    started = time.time()
+    # a full date has the length of 'YYYY-MM-DD'; anything else is
+    # handed to save_month
+    if len(args.date) != len('YYYY-MM-DD'):
+        img_count, total_size = save_month(args.date, args.verbose)
+    else:
+        total_size = save_one(args.date, args.verbose)
+        img_count = 1
+    elapsed = time.time() - started
+    report = "images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds"
+    print(report % (img_count, total_size/1024.0, elapsed))
+
+
+if __name__ == '__main__':
+    main()
--- a/attic/concurrency/wikipedia/orig/potd_tests.py
+++ b/attic/concurrency/wikipedia/orig/potd_tests.py
@@ -0,0 +1,96 @@
+
+import unittest
+
+import potd
+
+class TestSequenceFunctions(unittest.TestCase):
+    # Tests for the helpers in potd.py. The test_fetch_* methods call
+    # potd.fetch() on real URLs, so they need network access.
+
+    def setUp(self):
+        # thumbnail URL matching the first src attribute of the HTML fixture
+        self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
+            """commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
+            """-Orthographic_projection_SW.jpg""")
+
+    # renamed from test_buid_page_url (typo)
+    def test_build_page_url(self):
+        date = '2014-05-01'
+        result = potd.build_page_url(date)
+        self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')
+
+    def test_fetch_status_code(self):
+        date = '2014-05-02'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 200)
+
+    def test_fetch_status_code_not_found(self):
+        date = '2100-01-01'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 404)
+
+    def test_extract_image_url(self):
+        image_url = potd.extract_image_url(HTML)
+        self.assertEqual(image_url, self.thumb_url)
+
+    def test_fetch_image_jpeg(self):
+        response = potd.fetch(self.thumb_url)
+        self.assertEqual(response.headers['content-type'], 'image/jpeg')
+
+    def test_list_days_of_month(self):
+        year = 2014
+        month = 5
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 31)
+        self.assertEqual('2014-05-01', days[0])
+        self.assertEqual('2014-05-31', days[-1])
+
+    def test_list_days_of_february(self):
+        # 2014 is not a leap year: February has 28 days
+        year = 2014
+        month = 2
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 28)
+        self.assertEqual('2014-02-01', days[0])
+        self.assertEqual('2014-02-28', days[-1])
+
+    def test_format_date(self):
+        year = 2014
+        month = 2
+        day = 1
+        a_date = '2014-02-01'
+        date = potd.format_date(year, month, day)
+        self.assertEqual(a_date, date)
+        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')
+
+    def test_build_save_path(self):
+        date = '2014-06-04'
+        path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
+        self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
+
+
+# HTML fragment passed to potd.extract_image_url() in
+# test_extract_image_url; its first src attribute is the thumbnail
+# built in setUp() as self.thumb_url
+HTML = (
+'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
+title="Orthographic projection"><img alt="Orthographic projection"
+src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
+'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
+width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
+'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
+Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
+commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
+Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
+height="2058"></a></td>
+''')
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/attic/concurrency/wikipedia/orig/sync.py
+++ b/attic/concurrency/wikipedia/orig/sync.py
@@ -0,0 +1,115 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+from __future__ import print_function
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib2
+import contextlib
+import time
+
+# Base URL of the per-day POTD template pages; an ISO date is appended to it
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+# Prefix joined with the path captured by THUMB_SRC_RE to form a thumbnail URL
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+# Captures what follows '/thumb/' inside a src="..." attribute; the captured
+# path must contain a '/<digits>px-' component
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+# Directory where fetch_image writes the downloaded files
+LOCAL_IMG_PATH = 'pictures/'
+
+# Module-wide switch for progress output; main() overwrites it from --verbose
+verbose = True
+
+
+# Raised by fetch_potd_url when THUMB_SRC_RE finds no thumbnail in a page
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def fetch_potd_url(iso_date):
+    """Return the thumbnail URL of the POTD for iso_date ('YYYY-MM-DD').
+
+    Raises ParsingException when the page has no matching src attribute.
+    """
+    potd_url = POTD_BASE_URL + iso_date
+    with contextlib.closing(urllib2.urlopen(potd_url)) as page:
+        match = THUMB_SRC_RE.search(page.read())
+    if match is None:
+        raise ParsingException('cannot find thumbnail source for ' + potd_url)
+    return THUMB_BASE_URL + match.group(1)
+
+
+def gen_month_days(year, month):
+    """Yield a datetime.date for every day of the given year and month"""
+    current = datetime.date(year, month, 1)
+    step = datetime.timedelta(days=1)
+    while True:
+        yield current
+        current = current + step
+        # stepping out of the requested month ends the sequence
+        if current.month != month:
+            return
+
+
+def get_img_names(iso_month):
+    """Yield (iso_date, thumbnail URL) pairs for iso_month ('YYYY-MM' format)
+
+    Iteration ends at the first day whose page request raises an HTTPError.
+    """
+    year, month = map(int, iso_month.split('-'))
+    for day in gen_month_days(year, month):
+        iso_date = day.strftime('%Y-%m-%d')
+        if verbose:
+            print(iso_date)
+        try:
+            img_url = fetch_potd_url(iso_date)
+        except urllib2.HTTPError:
+            # give up on the remaining days of the month
+            return
+        yield (iso_date, img_url)
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url, save it under LOCAL_IMG_PATH, return its byte count
+
+    The file is named '<iso_date>__<last path segment of img_url>'.
+    """
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib2.urlopen(img_url)) as response:
+        payload = response.read()
+    size = len(payload)
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
+    file_name = iso_date + '__' + img_url.split('/')[-1]
+    target = os.path.join(LOCAL_IMG_PATH, file_name)
+    with io.open(target, 'wb') as out:
+        out.write(payload)
+    return size
+
+
+def get_images(iso_month, max_count=0):
+    """Download the pictures of iso_month ('YYYY-MM'), up to max_count files
+
+    A max_count of 0 or None (what argparse supplies when the option is
+    omitted) means no limit. Returns a tuple (images downloaded, total bytes).
+    """
+    # Test the value, not the identity: `max_count is 0` relied on the
+    # interpreter caching small ints and silently skipped None.
+    if not max_count:
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in get_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose
+    # the docstring above doubles as the command-line description
+    arg_parser = argparse.ArgumentParser(description=main.__doc__)
+    arg_parser.add_argument('year_month',
+                            help='year and month in YYYY-MM format')
+    arg_parser.add_argument('-q', '--max_qty', type=int,
+                            help='maximum number of files to download')
+    arg_parser.add_argument('-v', '--verbose', action='store_true',
+                            help='display progress information')
+    options = arg_parser.parse_args()
+    verbose = options.verbose
+    # time only the download phase, not the argument parsing
+    started = time.time()
+    img_count, total_size = get_images(options.year_month, options.max_qty)
+    elapsed = time.time() - started
+    summary = "images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds"
+    print(summary % (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()
--- a/attic/concurrency/wikipedia/orig/sync_py3.py
+++ b/attic/concurrency/wikipedia/orig/sync_py3.py
@@ -0,0 +1,118 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib.request
+import urllib.error
+import contextlib
+import time
+
+# Base URL of the per-day POTD template pages; an ISO date is appended to it
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+# Prefix joined with the path captured by THUMB_SRC_RE to form a thumbnail URL
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+# Captures what follows '/thumb/' inside a src="..." attribute; the captured
+# path must contain a '/<digits>px-' component
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+# Directory where fetch_image writes the downloaded files
+LOCAL_IMG_PATH = 'pictures/'
+
+# Module-wide switch for progress output; main() overwrites it from --verbose
+verbose = True
+
+
+# Raised by fetch_potd_url when THUMB_SRC_RE finds no thumbnail in a page
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def gen_month_dates(year, month):
+    """Yield each date of the given year and month as a 'YYYY-MM-DD' string"""
+    current = datetime.date(year, month, 1)
+    step = datetime.timedelta(days=1)
+    while True:
+        yield current.strftime('%Y-%m-%d')
+        current = current + step
+        # stepping out of the requested month ends the sequence
+        if current.month != month:
+            return
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)
+
+    Returns None when the request raises an HTTPError; raises
+    ParsingException when the page has no matching src attribute.
+    """
+    if verbose:
+        print(iso_date)
+    potd_url = POTD_BASE_URL + iso_date
+    try:
+        with urllib.request.urlopen(potd_url) as response:
+            page = response.read().decode('utf-8')
+    except urllib.error.HTTPError:
+        return None
+    found = THUMB_SRC_RE.search(page)
+    if found is None:
+        raise ParsingException('cannot find thumbnail source for ' + potd_url)
+    return THUMB_BASE_URL + found.group(1)
+
+
+def gen_img_names(iso_month):
+    """Yield (iso_date, thumbnail URL) pairs for iso_month ('YYYY-MM' format)
+
+    Stops at the first date for which fetch_potd_url returns None.
+    """
+    year, month = map(int, iso_month.split('-'))
+    for iso_date in gen_month_dates(year, month):
+        img_url = fetch_potd_url(iso_date)
+        if img_url is not None:
+            yield (iso_date, img_url)
+        else:
+            break
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url, save it under LOCAL_IMG_PATH, return its byte count
+
+    The file is named '<iso_date>__<last path segment of img_url>'.
+    """
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib.request.urlopen(img_url)) as response:
+        payload = response.read()
+    size = len(payload)
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
+    file_name = iso_date + '__' + img_url.split('/')[-1]
+    target = os.path.join(LOCAL_IMG_PATH, file_name)
+    with io.open(target, 'wb') as out:
+        out.write(payload)
+    return size
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images for a given month ('YYYY-MM' format)
+
+    A max_count of 0 or None (what argparse supplies when the option is
+    omitted) means no limit. Returns a tuple (images downloaded, total bytes).
+    """
+    # Test the value, not the identity: `max_count is 0` relied on the
+    # interpreter caching small ints, triggers a SyntaxWarning on
+    # Python 3.8+, and silently skipped None.
+    if not max_count:
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in gen_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose
+    # the docstring above doubles as the command-line description
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument('year_month', help='year and month in YYYY-MM format')
+    cli.add_argument('-q', '--max_qty', type=int,
+                     help='maximum number of files to download')
+    cli.add_argument('-v', '--verbose', action='store_true',
+                     help='display progress information')
+    options = cli.parse_args()
+    verbose = options.verbose
+    # time only the download phase, not the argument parsing
+    started = time.time()
+    img_count, total_size = get_images(options.year_month, options.max_qty)
+    elapsed = time.time() - started
+    summary = "images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds"
+    print(summary % (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()
--- a/attic/concurrency/wikipedia/test_daypicts.py
+++ b/attic/concurrency/wikipedia/test_daypicts.py
@@ -0,0 +1,87 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import pytest
+
+from daypicts import *
+
+
+@pytest.mark.network
+def test_get_picture_url_existing():
+    """The 2012-01-01 picture resolves to the MODIS_Map thumbnail URL"""
+    expected = ('http://upload.wikimedia.org/wikipedia/commons/'
+                'thumb/9/9d/MODIS_Map.jpg/550px-MODIS_Map.jpg')
+    assert get_picture_url('2012-01-01') == expected
+
+
+@pytest.mark.network
+def test_get_picture_url_not_existing():
+    """Looking up 2013-09-12, a day without a picture, must raise"""
+    missing_date = '2013-09-12'
+    with pytest.raises(NoPictureForDate):
+        get_picture_url(missing_date)
+
+
+def test_validate_full_date():
+    """A full date with unpadded month and day comes back zero-padded"""
+    normalized = validate_date('2015-1-2')
+    assert normalized == '2015-01-02'
+
+
+def test_validate_date_too_early():
+    """validate_date rejects 2006-12-31 with NoPictureTemplateBefore"""
+    too_early = '2006-12-31'
+    with pytest.raises(NoPictureTemplateBefore):
+        validate_date(too_early)
+
+
+def test_validate_month():
+    """A year-month input is accepted and its month is zero-padded"""
+    normalized = validate_date('2015-1')
+    assert normalized == '2015-01'
+
+
+def test_validate_year():
+    """A bare year is accepted and returned unchanged"""
+    normalized = validate_date('2015')
+    assert normalized == '2015'
+
+
+def test_gen_month_dates():
+    """February 2015 produces 28 dates, from the 1st to the 28th"""
+    feb_2015 = list(gen_month_dates('2015-02'))
+    assert len(feb_2015) == 28
+    first, last = feb_2015[0], feb_2015[27]
+    assert first == '2015-02-01'
+    assert last == '2015-02-28'
+
+
+def test_gen_month_dates_leap():
+    """February of leap year 2012 produces 29 dates, ending on the 29th"""
+    feb_2012 = list(gen_month_dates('2012-02'))
+    assert len(feb_2012) == 29
+    leap_day = feb_2012[28]
+    assert leap_day == '2012-02-29'
+
+
+def test_gen_year_dates():
+    """2015 produces 365 dates, from January 1st to December 31st"""
+    all_2015 = list(gen_year_dates('2015'))
+    assert len(all_2015) == 365
+    first, last = all_2015[0], all_2015[364]
+    assert first == '2015-01-01'
+    assert last == '2015-12-31'
+
+
+def test_gen_year_dates_leap():
+    """Leap year 2012 produces 366 dates, ending on December 31st"""
+    all_2012 = list(gen_year_dates('2012'))
+    assert len(all_2012) == 366
+    last = all_2012[365]
+    assert last == '2012-12-31'
+
+
+# Sample payloads fed to picture_type() by the tests below.
+# Short byte string with a GIF89a header; expected result is 'gif'
+GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
+           b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
+# Empty <svg> element; expected result is 'svg'
+SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
+# Same SVG preceded by an XML declaration; expected result is still 'svg'
+SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
+# Arbitrary bytes for which picture_type() is expected to return None
+NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
+
+def test_picture_type_imghdr():
+    """The GIF sample is reported as 'gif'"""
+    detected = picture_type(GIF_MIN)
+    assert detected == 'gif'
+
+
+def test_picture_type_svg():
+    """SVG is recognized with and without a leading XML declaration"""
+    for sample in (SVG_MIN, SVG_XML_DECL):
+        assert picture_type(sample) == 'svg'
+
+
+def test_picture_type_unknown():
+    """Unrecognizable bytes yield None instead of a type name"""
+    detected = picture_type(NOISE)
+    assert detected is None
+