update from Atlas with major reorg

This commit is contained in:
Luciano Ramalho
2015-04-17 21:29:30 -03:00
parent 57902d31b5
commit a786180239
134 changed files with 369 additions and 520 deletions

View File

@@ -0,0 +1,39 @@
=====================================
Wikipedia Picture of the Day examples
=====================================
These examples use various asynchronous programming techniques to download
images and metadata from the English Wikipedia `Picture of the Day`_ archive.
.. _Picture of the Day: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/Archive
--------
Timings
--------
``sync.py``
===========
::
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m6.272s
user 0m0.065s
sys 0m0.039s
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m5.447s
user 0m0.068s
sys 0m0.040s
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m6.314s
user 0m0.068s
sys 0m0.040s

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = potd.list_days_of_month(year, month)
with futures.ProcessPoolExecutor(max_workers=100) as executor:
downloads = dict((executor.submit(potd.save_one, date, verbose), date)
for date in dates)
for future in futures.as_completed(downloads):
date = downloads[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (date,
future.exception()))
else:
img_size = future.result()
total_size += img_size
img_count += 1
print('%r OK: %r' % (date, img_size))
return img_count, total_size
if __name__ == '__main__':
potd.main(save_month=save_month)

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = potd.list_days_of_month(year, month)
with futures.ThreadPoolExecutor(max_workers=100) as executor:
downloads = dict((executor.submit(potd.save_one, date, verbose), date)
for date in dates)
for future in futures.as_completed(downloads):
date = downloads[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (date,
future.exception()))
else:
img_size = future.result()
total_size += img_size
img_count += 1
print('%r OK: %r' % (date, img_size))
return img_count, total_size
if __name__ == '__main__':
potd.main(save_month=save_month)

View File

@@ -0,0 +1,100 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads metadata and
images in the simple but slow synchronous way i.e. one after the other.
"""
import calendar
import datetime
import re
import os
import io
import time
import requests
import argparse
SAVE_DIR = 'pictures/'
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
class NoPictureForDate(Exception):
'''No Picture of the Day found for {day}'''
def build_page_url(iso_date):
return POTD_BASE_URL + iso_date
def fetch(url):
response = requests.get(url)
return response
def extract_image_url(html):
re_image = r'src="(//upload\..*?)"'
image_url = re.search(re_image, html)
return 'http:' + image_url.group(1)
def format_date(year, month, day):
return '{year}-{month:02d}-{day:02d}'.format(**locals())
def list_days_of_month(year, month):
lastday = calendar.monthrange(year, month)[1]
days = [format_date(year, month, day) for day in range(1, lastday + 1)]
return days
def build_save_path(iso_date, url):
head, filename = os.path.split(url)
return os.path.join(SAVE_DIR, iso_date+'_'+filename)
def save_one(iso_date, verbose):
page_url = build_page_url(iso_date)
response = fetch(page_url)
if response.status_code != 200:
msg = NoPictureForDate.__doc__.format(day=iso_date)
raise NoPictureForDate(msg)
img_url = extract_image_url(response.text)
response = fetch(img_url)
path = build_save_path(iso_date, img_url)
if verbose:
print('saving: '+path)
with io.open(path, 'wb') as fp:
fp.write(response.content)
return len(response.content)
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = list_days_of_month(year, month)
for date in dates:
try:
total_size += save_one(date, verbose)
img_count += 1
except NoPictureForDate:
continue
return img_count, total_size
def main(save_one=save_one, save_month=save_month):
"""Get "Picture of The Day" from English Wikipedia for a given date or month"""
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
t0 = time.time()
if len(args.date) == len('YYYY-MM-DD'):
img_count = 1
total_size = save_one(args.date, args.verbose)
else:
img_count, total_size = save_month(args.date, args.verbose)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,96 @@
import unittest
import potd
class TestSequenceFunctions(unittest.TestCase):
def setUp(self):
self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
"""commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
"""-Orthographic_projection_SW.jpg""")
def test_buid_page_url(self):
date = '2014-05-01'
result = potd.build_page_url(date)
self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')
def test_fetch_status_code(self):
date = '2014-05-02'
url = potd.build_page_url(date)
response = potd.fetch(url)
self.assertEqual(response.status_code, 200)
def test_fetch_status_code_not_found(self):
date = '2100-01-01'
url = potd.build_page_url(date)
response = potd.fetch(url)
self.assertEqual(response.status_code, 404)
def test_extract_image_url(self):
image_url = potd.extract_image_url(HTML)
self.assertEqual(image_url, self.thumb_url)
def test_fetch_image_jpeg(self):
response = potd.fetch(self.thumb_url)
self.assertEqual(response.headers['content-type'], 'image/jpeg')
def test_list_days_of_month(self):
year = 2014
month = 5
days = potd.list_days_of_month(year, month)
self.assertEqual(len(days), 31)
self.assertEqual('2014-05-01', days[0])
self.assertEqual('2014-05-31', days[-1])
def test_list_days_of_february(self):
year = 2014
month = 2
days = potd.list_days_of_month(year, month)
self.assertEqual(len(days), 28)
self.assertEqual('2014-02-01', days[0])
self.assertEqual('2014-02-28', days[-1])
def test_format_date(self):
year = 2014
month = 2
day = 1
a_date = '2014-02-01'
date = potd.format_date(year, month, day)
self.assertEqual(a_date, date)
self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')
def test_build_save_path(self):
date = '2014-06-04'
path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
HTML = (
'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
title="Orthographic projection"><img alt="Orthographic projection"
src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
height="2058"></a></td>
''')
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
from __future__ import print_function
import sys
import os
import io
import re
import argparse
import datetime
import urllib2
import contextlib
import time
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
LOCAL_IMG_PATH = 'pictures/'
verbose = True
class ParsingException(ValueError):
"""Raised if unable to parse POTD MediaWiki source"""
def fetch_potd_url(iso_date):
"""Fetch picture name from iso_date ('YYYY-MM-DD' format)"""
potd_url = POTD_BASE_URL + iso_date
with contextlib.closing(urllib2.urlopen(potd_url)) as fp:
html = fp.read()
thumb_src = THUMB_SRC_RE.search(html)
if not thumb_src:
msg = 'cannot find thumbnail source for ' + potd_url
raise ParsingException(msg)
thumb_url = THUMB_BASE_URL+thumb_src.group(1)
return thumb_url
def gen_month_days(year, month):
a_date = datetime.date(year, month, 1)
one_day = datetime.timedelta(1)
while a_date.month == month:
yield a_date
a_date += one_day
def get_img_names(iso_month):
"""Fetch picture names from iso_month ('YYYY-MM' format)"""
year, month = (int(part) for part in iso_month.split('-'))
for day in gen_month_days(year, month):
iso_date = '{:%Y-%m-%d}'.format(day)
if verbose:
print(iso_date)
try:
img_url = fetch_potd_url(iso_date)
except urllib2.HTTPError:
break
yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
if verbose:
print('\t' + img_url)
with contextlib.closing(urllib2.urlopen(img_url)) as fp:
img = fp.read()
img_filename = iso_date + '__' + img_url.split('/')[-1]
if verbose:
print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
with io.open(img_path, 'wb') as fp:
fp.write(img)
return len(img)
def get_images(iso_month, max_count=0):
if max_count is 0:
max_count = sys.maxsize
img_count = 0
total_size = 0
for iso_date, img_url in get_img_names(iso_month):
total_size += fetch_image(iso_date, img_url)
img_count += 1
if img_count == max_count:
break
return (img_count, total_size)
def main():
"""Get "Pictures of The Day" from English Wikipedia for a given month"""
global verbose
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('year_month', help='year and month in YYYY-MM format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
verbose = args.verbose
t0 = time.time()
img_count, total_size = get_images(args.year_month, args.max_qty)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,118 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
import sys
import os
import io
import re
import argparse
import datetime
import urllib.request
import urllib.error
import contextlib
import time
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
LOCAL_IMG_PATH = 'pictures/'
verbose = True
class ParsingException(ValueError):
"""Raised if unable to parse POTD MediaWiki source"""
def gen_month_dates(year, month):
"""Produce all dates in a given year, month"""
a_date = datetime.date(year, month, 1)
one_day = datetime.timedelta(1)
while a_date.month == month:
yield '{:%Y-%m-%d}'.format(a_date)
a_date += one_day
def fetch_potd_url(iso_date):
"""Fetch POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)"""
if verbose:
print(iso_date)
potd_url = POTD_BASE_URL + iso_date
try:
with urllib.request.urlopen(potd_url) as fp:
html = fp.read().decode('utf-8')
thumb_src = THUMB_SRC_RE.search(html)
if not thumb_src:
msg = 'cannot find thumbnail source for ' + potd_url
raise ParsingException(msg)
thumb_url = THUMB_BASE_URL+thumb_src.group(1)
except urllib.error.HTTPError:
return None
return thumb_url
def gen_img_names(iso_month):
"""Produce picture names by fetching POTD metadata"""
year, month = (int(part) for part in iso_month.split('-'))
for iso_date in gen_month_dates(year, month):
img_url = fetch_potd_url(iso_date)
if img_url is None:
break
yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
"""Fetch and save image data for date and url"""
if verbose:
print('\t' + img_url)
with contextlib.closing(urllib.request.urlopen(img_url)) as fp:
img = fp.read()
img_filename = iso_date + '__' + img_url.split('/')[-1]
if verbose:
print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
with io.open(img_path, 'wb') as fp:
fp.write(img)
return len(img)
def get_images(iso_month, max_count=0):
"""Download up to max_count images for a given month"""
if max_count is 0:
max_count = sys.maxsize
img_count = 0
total_size = 0
for iso_date, img_url in gen_img_names(iso_month):
total_size += fetch_image(iso_date, img_url)
img_count += 1
if img_count == max_count:
break
return (img_count, total_size)
def main():
"""Get "Pictures of The Day" from English Wikipedia for a given month"""
global verbose
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('year_month', help='year and month in YYYY-MM format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
verbose = args.verbose
t0 = time.time()
img_count, total_size = get_images(args.year_month, args.max_qty)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()