wikipedia pictures download example

2015-02-02 02:56:14 -02:00
parent 73d98de6cd
commit ab6ce5b6a4
37 changed files with 2042 additions and 37 deletions
--- a/concurrency/wikipedia/daypicts.py
+++ b/concurrency/wikipedia/daypicts.py
@@ -0,0 +1,184 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Note:
+The earliest Pictures of the Day I've found are in this page:
+
+http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/May_2004
+
+However, I have not found Template:POTD/YYYY-MM-DD pages earlier
+than this:
+
+http://en.wikipedia.org/wiki/Template:POTD/2007-01-01
+
+For simplicity, this script only retrieves pictures starting
+from 2007-01-01.
+
+"""
+import sys
+import argparse
+import re
+import imghdr
+import time
+import datetime
+
+import requests
+
+SAVE_DIR = 'pictures/'
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
+PODT_EARLIEST_TEMPLATE = '2007-01-01'
+
+RE_YEAR = r'([12]\d{3})'
+RE_MONTH = RE_YEAR + r'-([01]\d)'
+RE_DATE = RE_MONTH + r'-([0-3]\d)'
+ISO_DATE_FMT = '%Y-%m-%d'
+
+DATEFORMS = [
+    ('date', re.compile('^' + RE_DATE + '$')),
+    ('month', re.compile('^' + RE_MONTH + '$')),
+    ('year', re.compile('^' + RE_YEAR + '$'))
+]
+
+
+class NoPictureForDate(Exception):
+    '''No Picture of the Day found for {iso_date}'''
+
+
+class NoPictureTemplateBefore(ValueError):
+    '''Template:POTD did not exist before PODT_EARLIEST_TEMPLATE'''
+
+
+def get_picture_url(iso_date):
+    """Return the picture URL found on the POTD template page for iso_date.
+
+    Raises NoPictureForDate when the page text has no match for
+    POTD_IMAGE_RE.
+    """
+    response = requests.get(POTD_BASE_URL + iso_date)
+    match = POTD_IMAGE_RE.search(response.text)
+    if match is not None:
+        # the pattern captures a protocol-relative URL (//upload...)
+        return 'http:' + match.group(1)
+    raise NoPictureForDate(iso_date)
+
+
+def get_picture(iso_date):
+    """Download the picture of the day for iso_date; return its raw bytes."""
+    response = requests.get(get_picture_url(iso_date))
+    return response.content
+
+
+def get_picture_type(octets):
+    """Guess the image type of the bytes in octets.
+
+    Returns whatever imghdr.what reports; when imghdr does not recognize
+    the data, returns 'svg' if it looks like an SVG document, else None.
+    """
+    detected = imghdr.what(None, octets)
+    if detected is not None:
+        return detected
+    # imghdr gave no answer: apply a simple textual check for SVG
+    looks_like_svg = (octets.startswith(b'<') and
+                      b'<svg' in octets[:200] and
+                      octets.rstrip().endswith(b'</svg>'))
+    return 'svg' if looks_like_svg else None
+
+
+def validate_date(text):
+    """Normalize a date written as YYYY, YYYY-MM or YYYY-MM-DD.
+
+    Returns the zero-padded ISO form with the same precision as the input,
+    e.g. '2015-1' -> '2015-01'.
+
+    Raises ValueError if text is malformed or is not a real calendar date,
+    and NoPictureTemplateBefore (a ValueError subclass) if the period is
+    earlier than PODT_EARLIEST_TEMPLATE.
+    """
+    format_msg = 'date must use YYYY, YYYY-MM or YYYY-MM-DD format'
+    try:
+        parts = [int(part) for part in text.split('-')]
+    except ValueError:
+        raise ValueError(format_msg)
+    # extra parts would otherwise be passed to datetime as hour, minute...
+    if len(parts) > 3:
+        raise ValueError(format_msg)
+
+    # pad a missing month/day with 1 so datetime can range-check the rest
+    padded = parts + [1] * (3 - len(parts))
+    date = datetime.datetime(*padded)
+    iso_date = date.strftime(ISO_DATE_FMT)
+    iso_date = iso_date[:1+len(parts)*3]
+    # compare at the precision given: as plain strings '2007' sorts before
+    # '2007-01-01', which wrongly rejected the year 2007 and month 2007-01
+    if iso_date < PODT_EARLIEST_TEMPLATE[:len(iso_date)]:
+        raise NoPictureTemplateBefore(PODT_EARLIEST_TEMPLATE)
+    return iso_date
+
+
+def gen_month_dates(iso_month):
+    """Yield every date of the month iso_month ('YYYY-MM') as 'YYYY-MM-DD'."""
+    day = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT)
+    month = day.month
+    # step one day at a time until the date rolls over into the next month
+    while day.month == month:
+        yield day.strftime(ISO_DATE_FMT)
+        day += datetime.timedelta(days=1)
+
+
+def gen_year_dates(iso_year):
+    """Yield every date of the year iso_year ('YYYY') as 'YYYY-MM-DD'."""
+    for month in range(1, 13):
+        iso_month = iso_year + '-{:02d}'.format(month)
+        yield from gen_month_dates(iso_month)
+
+
+def gen_dates(iso_parts):
+    """Yield the dates covered by iso_parts.
+
+    A 4-character value is expanded as a year, a 7-character value as a
+    month; anything else is yielded unchanged as a single date.
+    """
+    expand = {4: gen_year_dates, 7: gen_month_dates}.get(len(iso_parts))
+    if expand is None:
+        yield iso_parts
+    else:
+        yield from expand(iso_parts)
+
+
+def parse_args(argv):
+    """Parse argv and expand the date argument into a list of ISO dates.
+
+    Returns a (dates, args) tuple: dates is the list of YYYY-MM-DD strings
+    covered by the date argument, cut down to the first --max_qty entries
+    when that option is a positive number; args is the argparse namespace.
+
+    Prints an error plus usage and exits with status 2 if validate_date
+    rejects the date.
+    """
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
+    parser.add_argument('date', help=date_help)
+    parser.add_argument('-q', '--max_qty', type=int,
+                        help='maximum number of items to fetch')
+    parser.add_argument('-u', '--url_only', action='store_true',
+                        help='get picture URLS only')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args(argv)
+
+    try:
+        iso_parts = validate_date(args.date)
+    except ValueError as exc:
+        print('error:', exc.args[0])
+        parser.print_usage()
+        sys.exit(2)
+
+    dates = list(gen_dates(iso_parts))
+    # --max_qty was parsed but never applied; honor it here so that every
+    # get_picture_urls implementation handed to main() gets the limit
+    if args.max_qty is not None and args.max_qty > 0:
+        dates = dates[:args.max_qty]
+    if args.verbose:
+        if len(dates) == 1:
+            print('-> Date: ', dates[0])
+        else:
+            fmt = '-> {} days: {}...{}'
+            print(fmt.format(len(dates), dates[0], dates[-1]))
+
+    return dates, args
+
+
+def get_picture_urls(dates, verbose=False):
+    """Look up the picture URL for each date, one request after another.
+
+    Dates that raise NoPictureForDate are skipped (and reported only when
+    verbose). Each URL found is printed: numbered and reduced to its file
+    name when verbose, in full otherwise. Returns the list of URLs found.
+    """
+    found = []
+    for iso_date in dates:
+        try:
+            pict_url = get_picture_url(iso_date)
+        except NoPictureForDate as exc:
+            if verbose:
+                print('*** {!r} ***'.format(exc))
+        else:
+            found.append(pict_url)
+            if not verbose:
+                print(pict_url)
+            else:
+                # len(found) is the running count of pictures so far
+                print(format(len(found), '3d'), end=' ')
+                print(pict_url.split('/')[-1])
+    return found
+
+
+def main(argv, get_picture_urls):
+    """Get Wikipedia "Picture of The Day" for date, month or year"""
+    # the docstring above doubles as the argparse description in parse_args
+    dates, args = parse_args(argv)
+
+    # time only the URL lookup performed by the injected implementation
+    start = time.time()
+    urls = get_picture_urls(dates, args.verbose)
+    elapsed = time.time() - start
+
+    if not args.verbose:
+        return
+    print('-> found: {} pictures | elapsed time: {:.2f}s'
+          .format(len(urls), elapsed))
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], get_picture_urls)
--- a/concurrency/wikipedia/daypicts_asyncio.py
+++ b/concurrency/wikipedia/daypicts_asyncio.py
@@ -0,0 +1,61 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import sys
+import asyncio
+import aiohttp
+
+from daypicts import main
+from daypicts import NoPictureForDate
+from daypicts import POTD_BASE_URL
+from daypicts import POTD_IMAGE_RE
+
+GLOBAL_TIMEOUT = 300  # seconds
+
+
+@asyncio.coroutine
+def get_picture_url(iso_date):
+    """Coroutine: return the picture URL from the POTD page for iso_date.
+
+    Fetches POTD_BASE_URL+iso_date with aiohttp and searches the page text
+    with POTD_IMAGE_RE. Raises NoPictureForDate if there is no match.
+    """
+    page_url = POTD_BASE_URL+iso_date
+    response = yield from aiohttp.request('GET', page_url)
+    text = yield from response.text()
+    pict_url = POTD_IMAGE_RE.search(text)
+    if pict_url is None:
+        raise NoPictureForDate(iso_date)
+    # the pattern captures a protocol-relative URL (//upload...)
+    return 'http:' + pict_url.group(1)
+
+
+@asyncio.coroutine
+def get_picture_urls(dates, verbose=False):
+    """Coroutine: look up the picture URL for every date concurrently.
+
+    One get_picture_url coroutine is created per date and results are
+    handled as they complete, so the returned list of URLs follows
+    completion order rather than date order. Dates that raise
+    NoPictureForDate or aiohttp.ClientResponseError are skipped.
+    """
+    tasks = [get_picture_url(date) for date in dates]
+    urls = []
+    count = 0
+    # get results as jobs are done
+    # NOTE(review): a timeout raised by as_completed after GLOBAL_TIMEOUT
+    # seconds is not caught in this function
+    for job in asyncio.as_completed(tasks, timeout=GLOBAL_TIMEOUT):
+        try:
+            url = yield from job
+        except NoPictureForDate as exc:
+            if verbose:
+                print('*** {!r} ***'.format(exc))
+            continue
+        except aiohttp.ClientResponseError as exc:
+            # unlike a missing picture, this is reported even when not verbose
+            print('****** {!r} ******'.format(exc))
+            continue
+        count += 1
+        if verbose:
+            # numbered line showing only the file name part of the URL
+            print(format(count, '3d'), end=' ')
+            print(url.split('/')[-1])
+        else:
+            print(url)
+        urls.append(url)
+    return urls
+
+
+def run_loop(dates, verbose=False):
+    """Run get_picture_urls on the event loop and return its result.
+
+    Gives the coroutine the plain-function signature that main() expects
+    of its get_picture_urls argument.
+    """
+    coro = get_picture_urls(dates, verbose)
+    return asyncio.get_event_loop().run_until_complete(coro)
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], run_loop)
--- a/concurrency/wikipedia/daypicts_threads.py
+++ b/concurrency/wikipedia/daypicts_threads.py
@@ -0,0 +1,45 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import sys
+from concurrent import futures
+
+from daypicts import main, get_picture_url, NoPictureForDate
+
+MAX_NUM_THREADS = 400
+GLOBAL_TIMEOUT = 300  # seconds
+
+
+def get_picture_urls(dates, verbose=False):
+    """Look up picture URLs for dates using a pool of worker threads.
+
+    One job per date is submitted and results are handled as jobs finish,
+    so the returned list of URLs follows completion order, not date order.
+    Dates that raise NoPictureForDate are skipped. Any other exception from
+    a job, or the timeout raised by as_completed after GLOBAL_TIMEOUT
+    seconds, propagates to the caller.
+    """
+    urls = []
+    if not dates:
+        # nothing to fetch, and a pool cannot be sized to zero workers
+        return urls
+
+    num_threads = min(len(dates), MAX_NUM_THREADS)
+    # the with block shuts the executor down on every exit path; it was
+    # previously created and never released
+    with futures.ThreadPoolExecutor(num_threads) as pool:
+        pending = {pool.submit(get_picture_url, date): date
+                   for date in dates}
+        count = 0
+
+        # get results as jobs are done
+        for job in futures.as_completed(pending, timeout=GLOBAL_TIMEOUT):
+            try:
+                url = job.result()
+            except NoPictureForDate as exc:
+                if verbose:
+                    print('*** {!r} ***'.format(exc))
+                continue
+            count += 1
+            if verbose:
+                print(format(count, '3d'), end=' ')
+                print(url.split('/')[-1])
+            else:
+                print(url)
+            urls.append(url)
+    return urls
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:], get_picture_urls)
--- a/concurrency/wikipedia/fast_tests.sh
+++ b/concurrency/wikipedia/fast_tests.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+# run tests skipping @pytest.mark.network;
+# all extra arguments are forwarded to py.test with their quoting intact
+py.test test_daypicts.py -m 'not network' "$@"
--- a/concurrency/wikipedia/orig/futureprocs.py
+++ b/concurrency/wikipedia/orig/futureprocs.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Save every picture of the month year_month ('YYYY-MM') in parallel.
+
+    Submits one potd.save_one job per day to a process pool and reports
+    each result as it completes. Returns (img_count, total_size) for the
+    jobs that finished without an exception.
+    """
+    year, month = (int(part) for part in year_month.split('-'))
+    dates = potd.list_days_of_month(year, month)
+    img_count = total_size = 0
+
+    with futures.ProcessPoolExecutor(max_workers=100) as executor:
+        # map each future back to its date for reporting
+        downloads = {executor.submit(potd.save_one, date, verbose): date
+                     for date in dates}
+
+        for future in futures.as_completed(downloads):
+            date = downloads[future]
+            error = future.exception()
+            if error is None:
+                img_size = future.result()
+                total_size += img_size
+                img_count += 1
+                print('%r OK: %r' % (date, img_size))
+            else:
+                print('%r generated an exception: %s' % (date, error))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/concurrency/wikipedia/orig/futurethreads.py
+++ b/concurrency/wikipedia/orig/futurethreads.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Save every picture of the month year_month ('YYYY-MM') in parallel.
+
+    Submits one potd.save_one job per day to a thread pool and reports
+    each result as it completes. Returns (img_count, total_size) for the
+    jobs that finished without an exception.
+    """
+    year, month = (int(part) for part in year_month.split('-'))
+    dates = potd.list_days_of_month(year, month)
+    img_count = total_size = 0
+
+    with futures.ThreadPoolExecutor(max_workers=100) as executor:
+        # map each future back to its date for reporting
+        downloads = {executor.submit(potd.save_one, date, verbose): date
+                     for date in dates}
+
+        for future in futures.as_completed(downloads):
+            date = downloads[future]
+            error = future.exception()
+            if error is None:
+                img_size = future.result()
+                total_size += img_size
+                img_count += 1
+                print('%r OK: %r' % (date, img_size))
+            else:
+                print('%r generated an exception: %s' % (date, error))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/concurrency/wikipedia/orig/potd.py
+++ b/concurrency/wikipedia/orig/potd.py
@@ -0,0 +1,100 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads metadata and
+images in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import calendar
+import datetime
+import re
+import os
+import io
+import time
+
+import requests
+
+import argparse
+
+SAVE_DIR = 'pictures/'
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+class NoPictureForDate(Exception):
+    '''No Picture of the Day found for {day}'''
+
+def build_page_url(iso_date):
+    return POTD_BASE_URL + iso_date
+
+def fetch(url):
+    response = requests.get(url)
+    return response
+
+def extract_image_url(html):
+    """Return the first upload-server image URL found in html.
+
+    Raises NoPictureForDate if html contains no such image, instead of
+    failing with AttributeError on the missing match; save_month already
+    skips days that raise NoPictureForDate.
+    """
+    re_image = r'src="(//upload\..*?)"'
+    image_url = re.search(re_image, html)
+    if image_url is None:
+        raise NoPictureForDate('no picture URL found in page')
+    # the pattern captures a protocol-relative URL (//upload...)
+    return 'http:' + image_url.group(1)
+
+def format_date(year, month, day):
+    """Return the date as 'YYYY-MM-DD', zero-padding month and day."""
+    return '{}-{:02d}-{:02d}'.format(year, month, day)
+
+def list_days_of_month(year, month):
+    """Return the list of 'YYYY-MM-DD' strings for every day of the month."""
+    # monthrange gives (weekday of first day, number of days in month)
+    _, lastday = calendar.monthrange(year, month)
+    return [format_date(year, month, day) for day in range(1, lastday + 1)]
+
+def build_save_path(iso_date, url):
+    """Return the path in SAVE_DIR for the image at url.
+
+    The file name is the last component of url prefixed with iso_date
+    and an underscore.
+    """
+    filename = os.path.basename(url)
+    return os.path.join(SAVE_DIR, iso_date+'_'+filename)
+
+def save_one(iso_date, verbose):
+    """Download the picture of the day for iso_date and save it to disk.
+
+    Returns the size in bytes of the downloaded image content.
+    Raises NoPictureForDate if the POTD page request does not return 200.
+    """
+    page_url = build_page_url(iso_date)
+    response = fetch(page_url)
+    if response.status_code != 200:
+        # the exception docstring is the message template
+        msg = NoPictureForDate.__doc__.format(day=iso_date)
+        raise NoPictureForDate(msg)
+    img_url = extract_image_url(response.text)
+    # NOTE(review): the status of the image request itself is not checked
+    response = fetch(img_url)
+    path = build_save_path(iso_date, img_url)
+    if verbose:
+        print('saving: '+path)
+    # NOTE(review): SAVE_DIR is not created here; presumably it must
+    # already exist -- confirm
+    with io.open(path, 'wb') as fp:
+        fp.write(response.content)
+    return len(response.content)
+
+def save_month(year_month, verbose):
+    """Save the pictures of the month year_month ('YYYY-MM') one by one.
+
+    Days that raise NoPictureForDate are skipped. Returns a tuple
+    (img_count, total_size) covering the images actually saved.
+    """
+    year, month = (int(part) for part in year_month.split('-'))
+    sizes = []
+    for date in list_days_of_month(year, month):
+        try:
+            sizes.append(save_one(date, verbose))
+        except NoPictureForDate:
+            pass
+    return len(sizes), sum(sizes)
+
+# save_one and save_month default to this module's implementations; the
+# concurrent variants pass their own save_month instead
+def main(save_one=save_one, save_month=save_month):
+    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
+    # the docstring above is also used as the argparse description
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
+    # NOTE(review): max_qty is parsed but not used anywhere in this function
+    parser.add_argument('-q', '--max_qty', type=int,
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()
+
+    t0 = time.time()
+    # an argument as long as 'YYYY-MM-DD' selects a single day; anything
+    # else is handed to save_month
+    if len(args.date) == len('YYYY-MM-DD'):
+        img_count = 1
+        total_size = save_one(args.date, args.verbose)
+    else:
+        img_count, total_size = save_month(args.date, args.verbose)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+
+if __name__ == '__main__':
+    main()
--- a/concurrency/wikipedia/orig/potd_tests.py
+++ b/concurrency/wikipedia/orig/potd_tests.py
@@ -0,0 +1,96 @@
+
+import unittest
+
+import potd
+
+class TestSequenceFunctions(unittest.TestCase):
+
+    def setUp(self):
+        self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
+            """commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
+            """-Orthographic_projection_SW.jpg""")
+
+    def test_buid_page_url(self):
+        date = '2014-05-01'
+        result = potd.build_page_url(date)
+        self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')
+
+    def test_fetch_status_code(self):
+        date = '2014-05-02'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 200)
+
+    def test_fetch_status_code_not_found(self):
+        date = '2100-01-01'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 404)
+
+    def test_extract_image_url(self):
+        image_url = potd.extract_image_url(HTML)
+        self.assertEqual(image_url, self.thumb_url)
+
+    def test_fetch_image_jpeg(self):
+        response = potd.fetch(self.thumb_url)
+        self.assertEqual(response.headers['content-type'], 'image/jpeg')
+
+    def test_list_days_of_month(self):
+        year = 2014
+        month = 5
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 31)
+        self.assertEqual('2014-05-01', days[0])
+        self.assertEqual('2014-05-31', days[-1])
+
+    def test_list_days_of_february(self):
+        year = 2014
+        month = 2
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 28)
+        self.assertEqual('2014-02-01', days[0])
+        self.assertEqual('2014-02-28', days[-1])
+
+    def test_format_date(self):
+        year = 2014
+        month = 2
+        day = 1
+        a_date = '2014-02-01'
+        date = potd.format_date(year, month, day)
+        self.assertEqual(a_date, date)
+        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')
+
+    def test_build_save_path(self):
+        date = '2014-06-04'
+        path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
+        self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
+
+
+HTML = (
+'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
+title="Orthographic projection"><img alt="Orthographic projection"
+src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
+'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
+width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
+'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
+Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
+commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
+Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
+height="2058"></a></td>
+''')
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/concurrency/wikipedia/sync.py
+++ b/concurrency/wikipedia/sync.py
@@ -0,0 +1,115 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+from __future__ import print_function
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib2
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'
+
+verbose = True
+
+
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def fetch_potd_url(iso_date):
+    """Return the thumbnail URL for iso_date ('YYYY-MM-DD' format).
+
+    Raises ParsingException if the POTD page has no match for
+    THUMB_SRC_RE.
+    """
+    potd_url = POTD_BASE_URL + iso_date
+    with contextlib.closing(urllib2.urlopen(potd_url)) as fp:
+        html = fp.read()
+    match = THUMB_SRC_RE.search(html)
+    if match is None:
+        raise ParsingException('cannot find thumbnail source for ' + potd_url)
+    return THUMB_BASE_URL+match.group(1)
+
+
+def gen_month_days(year, month):
+    """Yield a datetime.date for each day of the given month, in order."""
+    day = datetime.date(year, month, 1)
+    # stop as soon as adding a day rolls over into the next month
+    while day.month == month:
+        yield day
+        day = day + datetime.timedelta(days=1)
+
+
+def get_img_names(iso_month):
+    """Yield (iso_date, img_url) for each day of iso_month ('YYYY-MM').
+
+    Stops at the first day whose lookup raises urllib2.HTTPError.
+    """
+    year, month = [int(part) for part in iso_month.split('-')]
+    for day in gen_month_days(year, month):
+        iso_date = '{:%Y-%m-%d}'.format(day)
+        if verbose:
+            print(iso_date)
+        try:
+            img_url = fetch_potd_url(iso_date)
+        except urllib2.HTTPError:
+            return
+        yield iso_date, img_url
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url into LOCAL_IMG_PATH; return its size in bytes.
+
+    The file is named after iso_date and the last component of img_url.
+    """
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib2.urlopen(img_url)) as response:
+        data = response.read()
+    target = os.path.join(LOCAL_IMG_PATH,
+                          iso_date + '__' + img_url.split('/')[-1])
+    size = len(data)
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
+    with io.open(target, 'wb') as out:
+        out.write(data)
+    return size
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images for the month iso_month ('YYYY-MM').
+
+    A max_count of 0 or None means no limit. Returns a tuple
+    (img_count, total_size): the number of images saved and their
+    combined size in bytes.
+    """
+    # test the value, not the identity: `max_count is 0` only worked
+    # because CPython happens to cache small integers
+    if not max_count:
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in get_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    # the docstring above is also used as the argparse description
+    # verbose is the module-level flag read by the download helpers
+    global verbose
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('year_month', help='year and month in YYYY-MM format')
+    parser.add_argument('-q', '--max_qty', type=int,
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()
+    verbose = args.verbose
+    t0 = time.time()
+    # args.max_qty is None when -q is not given
+    img_count, total_size = get_images(args.year_month, args.max_qty)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()
--- a/concurrency/wikipedia/sync_py3.py
+++ b/concurrency/wikipedia/sync_py3.py
@@ -0,0 +1,118 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib.request
+import urllib.error
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'
+
+verbose = True
+
+
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def gen_month_dates(year, month):
+    """Yield each date of the given year and month as 'YYYY-MM-DD'."""
+    day = datetime.date(year, month, 1)
+    step = datetime.timedelta(1)
+    while True:
+        if day.month != month:
+            # rolled over into the next month: done
+            return
+        yield '{:%Y-%m-%d}'.format(day)
+        day += step
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format).
+
+    Returns None if fetching the page raises urllib.error.HTTPError.
+    Raises ParsingException if the page has no match for THUMB_SRC_RE.
+    """
+    if verbose:
+        print(iso_date)
+    potd_url = POTD_BASE_URL + iso_date
+    try:
+        with urllib.request.urlopen(potd_url) as fp:
+            html = fp.read().decode('utf-8')
+    except urllib.error.HTTPError:
+        return None
+    match = THUMB_SRC_RE.search(html)
+    if match is None:
+        raise ParsingException('cannot find thumbnail source for ' + potd_url)
+    return THUMB_BASE_URL+match.group(1)
+
+
+def gen_img_names(iso_month):
+    """Yield (iso_date, img_url) for each day of iso_month ('YYYY-MM').
+
+    Stops at the first day for which fetch_potd_url returns None.
+    """
+    year, month = map(int, iso_month.split('-'))
+    for iso_date in gen_month_dates(year, month):
+        img_url = fetch_potd_url(iso_date)
+        if img_url is None:
+            return
+        yield iso_date, img_url
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url into LOCAL_IMG_PATH; return its size in bytes.
+
+    The file is named after iso_date and the last component of img_url.
+    """
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib.request.urlopen(img_url)) as response:
+        data = response.read()
+    target = os.path.join(LOCAL_IMG_PATH,
+                          iso_date + '__' + img_url.split('/')[-1])
+    size = len(data)
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
+    with io.open(target, 'wb') as out:
+        out.write(data)
+    return size
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images for a given month.
+
+    iso_month is in 'YYYY-MM' format. A max_count of 0 or None means no
+    limit. Returns a tuple (img_count, total_size): the number of images
+    saved and their combined size in bytes.
+    """
+    # test the value, not the identity: `max_count is 0` only worked
+    # because CPython happens to cache small integers, and newer
+    # interpreters warn about it
+    if not max_count:
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in gen_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    # the docstring above is also used as the argparse description
+    # verbose is the module-level flag read by the download helpers
+    global verbose
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('year_month', help='year and month in YYYY-MM format')
+    parser.add_argument('-q', '--max_qty', type=int,
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()
+    verbose = args.verbose
+    t0 = time.time()
+    # args.max_qty is None when -q is not given
+    img_count, total_size = get_images(args.year_month, args.max_qty)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()
--- a/concurrency/wikipedia/test_daypicts.py
+++ b/concurrency/wikipedia/test_daypicts.py
@@ -0,0 +1,87 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+"""
+
+import pytest
+
+from daypicts import *
+
+
+GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
+           b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
+SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
+SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
+NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
+
+
+@pytest.mark.network
+def test_get_picture_url_existing():
+    url = get_picture_url('2012-01-01')
+    expected = ('http://upload.wikimedia.org/wikipedia/commons/'
+                'thumb/9/9d/MODIS_Map.jpg/550px-MODIS_Map.jpg')
+    assert url == expected
+
+
+@pytest.mark.network
+def test_get_picture_url_not_existing():
+    with pytest.raises(NoPictureForDate):
+        get_picture_url('2013-09-12')
+
+
+def test_get_picture_type_imghdr():
+    assert get_picture_type(GIF_MIN) == 'gif'
+
+
+def test_get_picture_type_svg():
+    assert get_picture_type(SVG_MIN) == 'svg'
+    assert get_picture_type(SVG_XML_DECL) == 'svg'
+
+
+def test_get_picture_type_unknown():
+    assert get_picture_type(NOISE) is None
+
+
+def test_validate_full_date():
+    parts = validate_date('2015-1-2')
+    assert parts == '2015-01-02'
+
+
+def test_validate_date_too_early():
+    with pytest.raises(NoPictureTemplateBefore):
+        validate_date('2006-12-31')
+
+
+def test_validate_month():
+    parts = validate_date('2015-1')
+    assert parts == '2015-01'
+
+
+def test_validate_year():
+    parts = validate_date('2015')
+    assert parts == '2015'
+
+
+def test_gen_month_dates():
+    dates = list(gen_month_dates('2015-02'))
+    assert len(dates) == 28
+    assert dates[0] == '2015-02-01'
+    assert dates[27] == '2015-02-28'
+
+
+def test_gen_month_dates_leap():
+    dates = list(gen_month_dates('2012-02'))
+    assert len(dates) == 29
+    assert dates[28] == '2012-02-29'
+
+
+def test_gen_year_dates():
+    dates = list(gen_year_dates('2015'))
+    assert len(dates) == 365
+    assert dates[0] == '2015-01-01'
+    assert dates[364] == '2015-12-31'
+
+
+def test_gen_year_dates_leap():
+    dates = list(gen_year_dates('2012'))
+    assert len(dates) == 366
+    assert dates[365] == '2012-12-31'