example-code-2e/attic/concurrency/wikipedia/orig/sync.py

"""
Wikipedia Picture of the Day (POTD) download example

Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way, i.e. one after the other.
"""

from __future__ import print_function

import sys
import os
import io
import re
import argparse
import datetime
import urllib2
import contextlib
import time

# Prefix of the per-day template page; fetch_potd_url() appends a
# 'YYYY-MM-DD' date to it to build the page address.
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'

# Prefix that fetch_potd_url() puts in front of the path captured by
# THUMB_SRC_RE to build the final thumbnail URL.
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
# Matches a src="..." attribute containing '/thumb/' and captures what
# follows it: any path ending in a '<digits>px-<name>' segment, up to (not
# including) the closing double quote.
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')

# Directory that fetch_image() joins with the file name when saving.
LOCAL_IMG_PATH = 'pictures/'

# Gates the progress print() calls in get_img_names() and fetch_image();
# main() overwrites it with the value of the -v/--verbose flag.
verbose = True


class ParsingException(ValueError):
    """Signals that a POTD page could not be parsed.

    Raised by fetch_potd_url() when the downloaded page contains no
    thumbnail reference. Being a ValueError subclass, it is also caught
    by handlers written for ValueError.
    """


def fetch_potd_url(iso_date):
    """Return the thumbnail URL of the picture of the day for iso_date.

    iso_date is a 'YYYY-MM-DD' string; it is appended to POTD_BASE_URL to
    form the address of the page to download. The page body is searched
    with THUMB_SRC_RE and the captured path is appended to THUMB_BASE_URL.

    Raises ParsingException when the page has no matching thumbnail
    reference. Errors raised by urllib2.urlopen propagate unchanged.
    """
    page_url = POTD_BASE_URL + iso_date
    with contextlib.closing(urllib2.urlopen(page_url)) as response:
        page = response.read()
    match = THUMB_SRC_RE.search(page)
    if match is None:
        raise ParsingException('cannot find thumbnail source for ' + page_url)
    return THUMB_BASE_URL + match.group(1)


def gen_month_days(year, month):
    """Yield every datetime.date of the given month, first day to last.

    Starts at day 1 and advances one day at a time, stopping as soon as
    the date rolls over into a different month.
    """
    current = datetime.date(year, month, 1)
    step = datetime.timedelta(days=1)
    while True:
        if current.month != month:
            return
        yield current
        current = current + step


def get_img_names(iso_month):
    """Yield (iso_date, thumbnail_url) pairs for the days of iso_month.

    iso_month is a 'YYYY-MM' string. Days are visited in order; for each
    one the date is printed when `verbose` is set and the thumbnail URL
    is looked up with fetch_potd_url(). The first urllib2.HTTPError ends
    the iteration silently, so later days of the month are not tried.
    Other exceptions (e.g. ParsingException) propagate to the caller.
    """
    year, month = map(int, iso_month.split('-'))
    for a_day in gen_month_days(year, month):
        stamp = a_day.strftime('%Y-%m-%d')
        if verbose:
            print(stamp)
        try:
            thumb_url = fetch_potd_url(stamp)
        except urllib2.HTTPError:
            # Nothing follows the loop, so returning is the same as breaking.
            return
        yield stamp, thumb_url


def fetch_image(iso_date, img_url):
    """Download img_url and save it inside LOCAL_IMG_PATH.

    The file is named '<iso_date>__<last path segment of img_url>'.
    LOCAL_IMG_PATH is created if it does not exist yet, so the first run
    in a fresh directory no longer fails when the file is opened.
    Progress is printed when `verbose` is set.

    Returns the size of the downloaded image in bytes.
    Errors from urllib2.urlopen, os.makedirs and the file write propagate.
    """
    if verbose:
        print('\t' + img_url)
    # Ensure the target directory exists before spending time on the
    # download; io.open() below would otherwise raise on a missing folder.
    if not os.path.isdir(LOCAL_IMG_PATH):
        os.makedirs(LOCAL_IMG_PATH)
    with contextlib.closing(urllib2.urlopen(img_url)) as fp:
        img = fp.read()
    img_filename = iso_date + '__' + img_url.split('/')[-1]
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
    img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
    with io.open(img_path, 'wb') as fp:
        fp.write(img)
    return len(img)


def get_images(iso_month, max_count=0):
    """Download the pictures of iso_month and report what was fetched.

    iso_month is a 'YYYY-MM' string. max_count limits how many images are
    downloaded; 0 or None means "no limit". A negative value never equals
    the running count, so it also behaves as "no limit".

    Returns a tuple (img_count, total_size) with the number of images
    saved and their combined size in bytes.
    """
    # Truthiness instead of `is 0`: identity of small ints is an
    # implementation detail, and this also covers the None that argparse
    # supplies when the option is omitted.
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in get_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        if img_count == max_count:
            break

    return (img_count, total_size)


def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # The docstring above doubles as the argparse description shown by
    # --help, so it is kept as a single user-facing sentence.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('year_month', help='year and month in YYYY-MM format')
    # default=0 matches get_images()' own "no limit" default, so an omitted
    # -q no longer forwards None where an integer count is expected.
    parser.add_argument('-q', '--max_qty', type=int, default=0,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()
    # Replace the module-level default with the command-line choice.
    verbose = args.verbose
    # Time the whole download run for the summary line.
    t0 = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - t0
    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()