concurrency examples

2015-02-02 14:07:35 -02:00
parent ab6ce5b6a4
commit 70163d2deb
13 changed files with 131 additions and 56 deletions
--- a/concurrency/wikipedia/orig/sync.py
+++ b/concurrency/wikipedia/orig/sync.py
@@ -0,0 +1,115 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+from __future__ import print_function
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib2
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+# Joined with the path captured by THUMB_SRC_RE to build a thumbnail URL.
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'  # directory fetch_image() writes into
+
+verbose = True  # progress output; main() overwrites it from the -v flag
+
+
+class ParsingException(ValueError):
+    """Raised when no thumbnail reference is found in a fetched POTD page"""
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)
+
+    Raises ParsingException if the page holds no thumbnail reference."""
+    page_url = POTD_BASE_URL + iso_date
+    with contextlib.closing(urllib2.urlopen(page_url)) as response:
+        found = THUMB_SRC_RE.search(response.read())
+    if found is None:
+        raise ParsingException(
+            'cannot find thumbnail source for ' + page_url)
+    return THUMB_BASE_URL + found.group(1)
+
+
+def gen_month_days(year, month):
+    """Yield each datetime.date of the given month, in calendar order"""
+    day = datetime.date(year, month, 1)
+    while day.month == month:  # false once the date rolls into next month
+        yield day
+        day += datetime.timedelta(days=1)
+
+
+def get_img_names(iso_month):
+    """Yield an (iso_date, thumbnail URL) pair per day of iso_month"""
+    year, month = map(int, iso_month.split('-'))  # expects 'YYYY-MM'
+    for day in gen_month_days(year, month):
+        iso_date = day.strftime('%Y-%m-%d')
+        if verbose:
+            print(iso_date)
+        try:
+            thumb_url = fetch_potd_url(iso_date)
+        except urllib2.HTTPError:
+            return  # an HTTP error ends the whole month, not just a day
+        yield (iso_date, thumb_url)
+
+
+def fetch_image(iso_date, img_url):
+    """Save the image at img_url under LOCAL_IMG_PATH; return its byte size"""
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib2.urlopen(img_url)) as response:
+        data = response.read()
+    name = iso_date + '__' + img_url.split('/')[-1]
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (len(data)/1024.0))
+    with io.open(os.path.join(LOCAL_IMG_PATH, name), 'wb') as out:
+        out.write(data)
+    return len(data)
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images (0 or None: no limit) for iso_month;
+    return the tuple (number of images saved, total bytes saved)"""
+    if not max_count:  # treat 0 and None alike: no limit
+        max_count = sys.maxsize
+    img_count = total_size = 0
+    for iso_date, img_url in get_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose  # module-level flag read by the download helpers
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('year_month', help='year and month in YYYY-MM format')
+    parser.add_argument('-q', '--max_qty', type=int, default=0,  # 0: no limit
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()
+    verbose = args.verbose
+    t0 = time.time()
+    img_count, total_size = get_images(args.year_month, args.max_qty)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':  # run main() only when executed as a script
+    main()
--- a/concurrency/wikipedia/orig/sync_py3.py
+++ b/concurrency/wikipedia/orig/sync_py3.py
@@ -0,0 +1,118 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib.request
+import urllib.error
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+# Joined with the path captured by THUMB_SRC_RE to build a thumbnail URL.
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'  # directory fetch_image() writes into
+
+verbose = True  # progress output; main() overwrites it from the -v flag
+
+
+class ParsingException(ValueError):
+    """Raised when no thumbnail reference is found in a fetched POTD page"""
+
+
+def gen_month_dates(year, month):
+    """Yield each day of the given month as a 'YYYY-MM-DD' string"""
+    day = datetime.date(year, month, 1)
+    step = datetime.timedelta(days=1)
+    while day.month == month:
+        yield day.strftime('%Y-%m-%d')
+        day += step
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)
+
+    Gives None on HTTPError; raises ParsingException if no thumbnail found."""
+    if verbose:
+        print(iso_date)
+    page_url = POTD_BASE_URL + iso_date
+    try:
+        with urllib.request.urlopen(page_url) as response:
+            page = response.read().decode('utf-8')
+    except urllib.error.HTTPError:
+        return None
+    match = THUMB_SRC_RE.search(page)
+    if match is None:
+        raise ParsingException('cannot find thumbnail source for ' + page_url)
+    return THUMB_BASE_URL + match.group(1)
+
+
+def gen_img_names(iso_month):
+    """Yield (iso_date, thumbnail URL) pairs for iso_month ('YYYY-MM')"""
+    year, month = map(int, iso_month.split('-'))
+    for iso_date in gen_month_dates(year, month):
+        thumb_url = fetch_potd_url(iso_date)
+        if thumb_url is None:  # HTTP error upstream: drop the rest of month
+            return
+        yield iso_date, thumb_url
+
+
+def fetch_image(iso_date, img_url):
+    """Fetch and save image data for date and url; return size in bytes"""
+    if verbose:
+        print('\t' + img_url)
+    with urllib.request.urlopen(img_url) as fp:
+        img = fp.read()
+    img_filename = iso_date + '__' + img_url.split('/')[-1]
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
+    os.makedirs(LOCAL_IMG_PATH, exist_ok=True)  # may not exist on first run
+    with io.open(os.path.join(LOCAL_IMG_PATH, img_filename), 'wb') as fp:
+        fp.write(img)
+    return len(img)
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images for a given month
+
+    0 or None means no limit; returns (image count, total bytes)."""
+    if not max_count:  # treat 0 and None alike: no limit
+        max_count = sys.maxsize
+    img_count = total_size = 0
+    for iso_date, img_url in gen_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose  # module-level flag read by the download helpers
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('year_month', help='year and month in YYYY-MM format')
+    parser.add_argument('-q', '--max_qty', type=int, default=0,  # 0: no limit
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()
+    verbose = args.verbose
+    t0 = time.time()
+    img_count, total_size = get_images(args.year_month, args.max_qty)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':  # run main() only when executed as a script
+    main()