update from Atlas with major reorg

2015-04-17 21:29:30 -03:00
parent 57902d31b5
commit a786180239
134 changed files with 369 additions and 520 deletions
--- a/attic/concurrency/wikipedia/orig/README.rst
+++ b/attic/concurrency/wikipedia/orig/README.rst
@@ -0,0 +1,39 @@
+=====================================
+Wikipedia Picture of the Day examples
+=====================================
+
+These examples use various asynchronous programming techniques to download 
+images and metadata from the English Wikipedia `Picture of the Day`_ archive.
+
+.. _Picture of the Day: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/Archive
+
+
+--------
+Timings
+--------
+
+``sync.py``
+===========
+
+::
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m6.272s
+    user    0m0.065s
+    sys     0m0.039s
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m5.447s
+    user    0m0.068s
+    sys     0m0.040s
+
+    $ time python sync.py 2014-06 -q 5
+    5 images downloaded (167.8 Kbytes total)
+
+    real    0m6.314s
+    user    0m0.068s
+    sys     0m0.040s
--- a/attic/concurrency/wikipedia/orig/futureprocs.py
+++ b/attic/concurrency/wikipedia/orig/futureprocs.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Download every POTD of 'YYYY-MM' in processes; return (count, bytes)."""
+    year, month = [int(s) for s in year_month.split('-')]
+    img_count = total_size = 0
+    dates = potd.list_days_of_month(year, month)
+    # at most one busy worker per date; a fixed 100 exceeds Windows' limit (61)
+    with futures.ProcessPoolExecutor(max_workers=len(dates)) as executor:
+        downloads = {executor.submit(potd.save_one, date, verbose): date
+                     for date in dates}
+
+        for future in futures.as_completed(downloads):
+            date = downloads[future]
+            error = future.exception()
+            if error is not None:
+                print('%r generated an exception: %s' % (date, error))
+            else:
+                img_size = future.result()
+                total_size += img_size
+                img_count += 1
+                print('%r OK: %r' % (date, img_size))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/attic/concurrency/wikipedia/orig/futurethreads.py
+++ b/attic/concurrency/wikipedia/orig/futurethreads.py
@@ -0,0 +1,36 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Inspired by example at:
+https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
+"""
+
+from concurrent import futures
+
+import potd
+
+def save_month(year_month, verbose):
+    """Download every POTD of 'YYYY-MM' in threads; return (count, bytes)."""
+    year, month = map(int, year_month.split('-'))
+    img_count = total_size = 0
+    dates = potd.list_days_of_month(year, month)
+
+    with futures.ThreadPoolExecutor(max_workers=100) as executor:
+        pending = {executor.submit(potd.save_one, day, verbose): day
+                   for day in dates}
+
+        for done in futures.as_completed(pending):
+            day = pending[done]
+            error = done.exception()
+            if error is None:
+                size = done.result()
+                total_size += size
+                img_count += 1
+                print('%r OK: %r' % (day, size))
+            else:
+                print('%r generated an exception: %s' % (day, error))
+
+    return img_count, total_size
+
+if __name__ == '__main__':
+    potd.main(save_month=save_month)
--- a/attic/concurrency/wikipedia/orig/potd.py
+++ b/attic/concurrency/wikipedia/orig/potd.py
@@ -0,0 +1,100 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads metadata and
+images in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import calendar
+import datetime
+import re
+import os
+import io
+import time
+
+import requests
+
+import argparse
+
+SAVE_DIR = 'pictures/'
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+class NoPictureForDate(Exception):  # docstring doubles as the message template
+    '''No Picture of the Day found for {day}'''
+
+def build_page_url(iso_date):  # iso_date: presumably 'YYYY-MM-DD' text
+    return POTD_BASE_URL + iso_date
+
+def fetch(url):
+    """Issue an HTTP GET for url and return the requests response."""
+    return requests.get(url)
+
+def extract_image_url(html):
+    """Return the absolute URL of the first //upload.* image src in html."""
+    match = re.search(r'src="(//upload\..*?)"', html)
+    return 'http:' + match.group(1)
+
+def format_date(year, month, day):
+    return '{year}-{month:02d}-{day:02d}'.format(year=year, month=month, day=day)
+
+def list_days_of_month(year, month):
+    """Return the 'YYYY-MM-DD' string of every day in the given month."""
+    _, day_count = calendar.monthrange(year, month)
+    return [format_date(year, month, day) for day in range(1, day_count + 1)]
+
+def build_save_path(iso_date, url):
+    """Return SAVE_DIR joined with '<iso_date>_<last path segment of url>'."""
+    return os.path.join(SAVE_DIR, iso_date + '_' + os.path.basename(url))
+
+def save_one(iso_date, verbose):
+    """Download the POTD image for iso_date; return its size in bytes."""
+    page = fetch(build_page_url(iso_date))
+    if page.status_code != 200:
+        # the exception docstring doubles as the message template
+        raise NoPictureForDate(NoPictureForDate.__doc__.format(day=iso_date))
+    img_url = extract_image_url(page.text)
+    image = fetch(img_url)
+    path = build_save_path(iso_date, img_url)
+    if verbose:
+        print('saving: '+path)
+    with io.open(path, 'wb') as fp:
+        fp.write(image.content)
+    return len(image.content)
+
+def save_month(year_month, verbose):
+    """Download each POTD of 'YYYY-MM' in turn; return (count, bytes)."""
+    year, month = map(int, year_month.split('-'))
+    img_count = total_size = 0
+    for date in list_days_of_month(year, month):
+        try:
+            size = save_one(date, verbose)
+        except NoPictureForDate:
+            continue  # dates without a page are skipped, not counted
+        total_size += size
+        img_count += 1
+
+    return img_count, total_size
+
+def main(save_one=save_one, save_month=save_month):  # callers may inject their own implementations
+    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
+    parser = argparse.ArgumentParser(description=main.__doc__)
+    parser.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
+    parser.add_argument('-q', '--max_qty', type=int,
+                        help='maximum number of files to download')
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='display progress information')
+    args = parser.parse_args()  # NOTE(review): args.max_qty is never used below
+
+    t0 = time.time()
+    if len(args.date) == len('YYYY-MM-DD'):  # full date: fetch a single picture
+        img_count = 1
+        total_size = save_one(args.date, args.verbose)
+    else:  # any other length is handed to save_month, which splits it on '-'
+        img_count, total_size = save_month(args.date, args.verbose)
+    elapsed = time.time() - t0
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+
+if __name__ == '__main__':
+    main()
--- a/attic/concurrency/wikipedia/orig/potd_tests.py
+++ b/attic/concurrency/wikipedia/orig/potd_tests.py
@@ -0,0 +1,96 @@
+
+import unittest
+
+import potd
+
+class TestSequenceFunctions(unittest.TestCase):  # exercises the helpers in potd.py
+
+    def setUp(self):
+        self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
+            """commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
+            """-Orthographic_projection_SW.jpg""")
+
+    def test_build_page_url(self):
+        date = '2014-05-01'
+        result = potd.build_page_url(date)
+        self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')
+
+    def test_fetch_status_code(self):  # performs a real HTTP request
+        date = '2014-05-02'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 200)
+
+    def test_fetch_status_code_not_found(self):  # performs a real HTTP request
+        date = '2100-01-01'
+        url = potd.build_page_url(date)
+        response = potd.fetch(url)
+        self.assertEqual(response.status_code, 404)
+
+    def test_extract_image_url(self):
+        image_url = potd.extract_image_url(HTML)
+        self.assertEqual(image_url, self.thumb_url)
+
+    def test_fetch_image_jpeg(self):  # performs a real HTTP request
+        response = potd.fetch(self.thumb_url)
+        self.assertEqual(response.headers['content-type'], 'image/jpeg')
+
+    def test_list_days_of_month(self):
+        year = 2014
+        month = 5
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 31)
+        self.assertEqual('2014-05-01', days[0])
+        self.assertEqual('2014-05-31', days[-1])
+
+    def test_list_days_of_february(self):
+        year = 2014
+        month = 2
+        days = potd.list_days_of_month(year, month)
+        self.assertEqual(len(days), 28)
+        self.assertEqual('2014-02-01', days[0])
+        self.assertEqual('2014-02-28', days[-1])
+
+    def test_format_date(self):
+        year = 2014
+        month = 2
+        day = 1
+        a_date = '2014-02-01'
+        date = potd.format_date(year, month, day)
+        self.assertEqual(a_date, date)
+        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')
+
+    def test_build_save_path(self):
+        date = '2014-06-04'
+        path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
+        self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
+
+
+HTML = (
+'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
+title="Orthographic projection"><img alt="Orthographic projection"
+src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
+'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
+width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
+'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
+Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
+commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
+Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
+height="2058"></a></td>
+''')
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/attic/concurrency/wikipedia/orig/sync.py
+++ b/attic/concurrency/wikipedia/orig/sync.py
@@ -0,0 +1,115 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+from __future__ import print_function
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib2
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'
+
+verbose = True
+
+
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)"""
+    potd_url = POTD_BASE_URL + iso_date
+    with contextlib.closing(urllib2.urlopen(potd_url)) as fp:
+        html = fp.read()
+    match = THUMB_SRC_RE.search(html)
+    if match is None:
+        # the page was served but holds no recognizable thumbnail
+        msg = 'cannot find thumbnail source for ' + potd_url
+        raise ParsingException(msg)
+    return THUMB_BASE_URL + match.group(1)
+
+
+def gen_month_days(year, month):
+    """Yield a datetime.date for each day of the given month, in order"""
+    current = datetime.date(year, month, 1)
+    while current.month == month:
+        yield current
+        current += datetime.timedelta(days=1)
+
+
+def get_img_names(iso_month):
+    """Yield (iso_date, thumbnail URL) pairs for iso_month ('YYYY-MM')"""
+    year, month = [int(part) for part in iso_month.split('-')]
+    for day in gen_month_days(year, month):
+        iso_date = '{:%Y-%m-%d}'.format(day)
+        if verbose:
+            print(iso_date)
+        try:
+            img_url = fetch_potd_url(iso_date)
+        except urllib2.HTTPError:
+            return  # stop at the first date whose page cannot be fetched
+        yield (iso_date, img_url)
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url into LOCAL_IMG_PATH and return its size in bytes"""
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib2.urlopen(img_url)) as response:
+        data = response.read()
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (len(data)/1024.0))
+    file_name = iso_date + '__' + img_url.split('/')[-1]
+    with io.open(os.path.join(LOCAL_IMG_PATH, file_name), 'wb') as out:
+        out.write(data)
+    return len(data)
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images of iso_month; return (count, bytes)"""
+    if not max_count:  # 0 or None (argparse default for -q) means no limit
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in get_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose  # module-level flag consulted by the download helpers
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument('year_month', help='year and month in YYYY-MM format')
+    cli.add_argument('-q', '--max_qty', type=int,
+                     help='maximum number of files to download')
+    cli.add_argument('-v', '--verbose', action='store_true',
+                     help='display progress information')
+    options = cli.parse_args()
+    verbose = options.verbose
+    started = time.time()
+    img_count, total_size = get_images(options.year_month, options.max_qty)
+    elapsed = time.time() - started
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()
--- a/attic/concurrency/wikipedia/orig/sync_py3.py
+++ b/attic/concurrency/wikipedia/orig/sync_py3.py
@@ -0,0 +1,118 @@
+"""
+Wikipedia Picture of the Day (POTD) download example
+
+Baseline synchronous example for comparison: downloads images and metadata
+in the simple but slow synchronous way i.e. one after the other.
+"""
+
+import sys
+import os
+import io
+import re
+import argparse
+import datetime
+import urllib.request
+import urllib.error
+import contextlib
+import time
+
+POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
+
+THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
+THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
+
+LOCAL_IMG_PATH = 'pictures/'
+
+verbose = True
+
+
+class ParsingException(ValueError):
+    """Raised if unable to parse POTD MediaWiki source"""
+
+
+def gen_month_dates(year, month):
+    """Yield each date of the given month as a 'YYYY-MM-DD' string"""
+    current = datetime.date(year, month, 1)
+    step = datetime.timedelta(days=1)
+    while current.month == month:
+        yield '{:%Y-%m-%d}'.format(current)
+        current += step
+
+
+def fetch_potd_url(iso_date):
+    """Return the POTD thumbnail URL for iso_date, or None on an HTTP error"""
+    if verbose:
+        print(iso_date)
+    potd_url = POTD_BASE_URL + iso_date
+    try:
+        with urllib.request.urlopen(potd_url) as fp:
+            html = fp.read().decode('utf-8')
+    except urllib.error.HTTPError:
+        return None
+    match = THUMB_SRC_RE.search(html)
+    if match is None:
+        # the page was served but holds no recognizable thumbnail
+        msg = 'cannot find thumbnail source for ' + potd_url
+        raise ParsingException(msg)
+    return THUMB_BASE_URL + match.group(1)
+
+
+def gen_img_names(iso_month):
+    """Yield (iso_date, image URL) pairs until fetch_potd_url returns None"""
+    year, month = map(int, iso_month.split('-'))
+    for iso_date in gen_month_dates(year, month):
+        img_url = fetch_potd_url(iso_date)
+        if img_url is None:
+            return
+        yield iso_date, img_url
+
+
+def fetch_image(iso_date, img_url):
+    """Download img_url into LOCAL_IMG_PATH and return its size in bytes"""
+    if verbose:
+        print('\t' + img_url)
+    with contextlib.closing(urllib.request.urlopen(img_url)) as response:
+        data = response.read()
+    size = len(data)
+    if verbose:
+        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
+    file_name = iso_date + '__' + img_url.split('/')[-1]
+    with io.open(os.path.join(LOCAL_IMG_PATH, file_name), 'wb') as out:
+        out.write(data)
+    return size
+
+
+def get_images(iso_month, max_count=0):
+    """Download up to max_count images of iso_month; return (count, bytes)"""
+    if not max_count:  # 0 or None (argparse default for -q) means no limit
+        max_count = sys.maxsize
+    img_count = 0
+    total_size = 0
+    for iso_date, img_url in gen_img_names(iso_month):
+        total_size += fetch_image(iso_date, img_url)
+        img_count += 1
+        if img_count == max_count:
+            break
+
+    return (img_count, total_size)
+
+
+def main():
+    """Get "Pictures of The Day" from English Wikipedia for a given month"""
+    global verbose  # module-level flag consulted by the download helpers
+    cli = argparse.ArgumentParser(description=main.__doc__)
+    cli.add_argument('year_month', help='year and month in YYYY-MM format')
+    cli.add_argument('-q', '--max_qty', type=int,
+                     help='maximum number of files to download')
+    cli.add_argument('-v', '--verbose', action='store_true',
+                     help='display progress information')
+    options = cli.parse_args()
+    verbose = options.verbose
+    started = time.time()
+    img_count, total_size = get_images(options.year_month, options.max_qty)
+    elapsed = time.time() - started
+    print("images: %3d |  total size: %6.1f Kbytes  |  elapsed time: %3ds" %
+          (img_count, total_size/1024.0, elapsed))
+
+if __name__ == '__main__':
+    main()