concurrency examples

This commit is contained in:
Luciano Ramalho
2015-02-02 14:07:35 -02:00
parent ab6ce5b6a4
commit 70163d2deb
13 changed files with 131 additions and 56 deletions

View File

@@ -0,0 +1,115 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
from __future__ import print_function
import sys
import os
import io
import re
import argparse
import datetime
import urllib2
import contextlib
import time
# Base URL of the per-day POTD template page; an ISO date is appended to it.
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
# Prefix joined with the path captured by THUMB_SRC_RE to build a thumbnail URL.
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
# Captures the portion of a src="..." attribute that follows '/thumb/'; the
# capture must contain a '/<digits>px-' segment and stops at the closing quote.
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
# Directory that downloaded images are written into.
LOCAL_IMG_PATH = 'pictures/'
# Module-wide switch for progress output; main() overwrites it from the
# command-line arguments.
verbose = True
class ParsingException(ValueError):
    """Error signalling that a POTD page could not be parsed.

    Raised when no thumbnail source can be found in the fetched page.
    """
def fetch_potd_url(iso_date):
    """Return the thumbnail URL of the picture of the day for iso_date.

    iso_date is a 'YYYY-MM-DD' string; it is appended to POTD_BASE_URL to
    locate the page to download. ParsingException is raised when the page
    contains no match for THUMB_SRC_RE.
    """
    page_url = POTD_BASE_URL + iso_date
    with contextlib.closing(urllib2.urlopen(page_url)) as response:
        page = response.read()
    match = THUMB_SRC_RE.search(page)
    if match is None:
        raise ParsingException('cannot find thumbnail source for ' + page_url)
    return THUMB_BASE_URL + match.group(1)
def gen_month_days(year, month):
    """Yield a datetime.date for each day of the given month, in order."""
    current = datetime.date(year, month, 1)
    step = datetime.timedelta(days=1)
    while True:
        yield current
        current = current + step
        # Leaving the requested month means the previous day was its last.
        if current.month != month:
            return
def get_img_names(iso_month):
    """Yield (iso_date, thumbnail URL) pairs for the days of iso_month.

    iso_month is a 'YYYY-MM' string. Days are visited in calendar order and
    iteration ends at the first day whose POTD page request raises
    urllib2.HTTPError.
    """
    year, month = map(int, iso_month.split('-'))
    for day in gen_month_days(year, month):
        iso_date = day.strftime('%Y-%m-%d')
        if verbose:
            print(iso_date)
        try:
            img_url = fetch_potd_url(iso_date)
        except urllib2.HTTPError:
            # Stop the whole month at the first page that cannot be fetched.
            return
        yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
    """Download img_url, save it under LOCAL_IMG_PATH and return its size.

    The file is named '<iso_date>__<last path segment of img_url>'. The
    return value is the number of bytes downloaded.
    """
    if verbose:
        print('\t' + img_url)
    with contextlib.closing(urllib2.urlopen(img_url)) as response:
        data = response.read()
    size = len(data)
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
    basename = img_url.rsplit('/', 1)[-1]
    target = os.path.join(LOCAL_IMG_PATH, iso_date + '__' + basename)
    with io.open(target, 'wb') as out:
        out.write(data)
    return size
def get_images(iso_month, max_count=0):
    """Download the pictures of the day for iso_month, up to max_count.

    Args:
        iso_month: month to download, as a 'YYYY-MM' string.
        max_count: maximum number of images to fetch; 0 or None means
            no limit.

    Returns:
        A (img_count, total_size) tuple: the number of images saved and the
        sum of their sizes in bytes.
    """
    # Compare by value, not identity: `max_count is 0` only worked through
    # CPython's small-int caching, and None slipped through unnormalized.
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in get_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        # Check right after saving so that no further POTD page is requested
        # once the limit has been reached.
        if img_count == max_count:
            break
    return (img_count, total_size)
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # NOTE: the docstring above is also the argparse description, so it is
    # user-facing text.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('year_month', help='year and month in YYYY-MM format')
    # default=0 matches get_images' own max_count default, so "no limit" is
    # requested explicitly instead of passing None when -q is omitted.
    parser.add_argument('-q', '--max_qty', type=int, default=0,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()
    # Publish the flag to the module-level switch read by the fetch helpers.
    verbose = args.verbose

    # Time the whole download run for the summary line.
    t0 = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - t0
    print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,118 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
import sys
import os
import io
import re
import argparse
import datetime
import urllib.request
import urllib.error
import contextlib
import time
# Base URL of the per-day POTD template page; an ISO date is appended to it.
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
# Prefix joined with the path captured by THUMB_SRC_RE to build a thumbnail URL.
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
# Captures the portion of a src="..." attribute that follows '/thumb/'; the
# capture must contain a '/<digits>px-' segment and stops at the closing quote.
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
# Directory that downloaded images are written into.
LOCAL_IMG_PATH = 'pictures/'
# Module-wide switch for progress output; main() overwrites it from the
# command-line arguments.
verbose = True
class ParsingException(ValueError):
    """Error signalling that a POTD page could not be parsed.

    Raised when no thumbnail source can be found in the fetched page.
    """
def gen_month_dates(year, month):
    """Yield each day of the given year and month as a 'YYYY-MM-DD' string."""
    current = datetime.date(year, month, 1)
    step = datetime.timedelta(days=1)
    while True:
        yield current.strftime('%Y-%m-%d')
        current = current + step
        # Leaving the requested month means the previous day was its last.
        if current.month != month:
            return
def fetch_potd_url(iso_date):
    """Return the POTD thumbnail URL for iso_date, or None on an HTTP error.

    iso_date is a 'YYYY-MM-DD' string appended to POTD_BASE_URL to locate
    the page. The page is decoded as UTF-8 and searched with THUMB_SRC_RE;
    ParsingException is raised when there is no match. A
    urllib.error.HTTPError while fetching the page yields None instead.
    """
    if verbose:
        print(iso_date)
    page_url = POTD_BASE_URL + iso_date
    try:
        with urllib.request.urlopen(page_url) as response:
            page = response.read().decode('utf-8')
    except urllib.error.HTTPError:
        return None
    match = THUMB_SRC_RE.search(page)
    if match is None:
        raise ParsingException('cannot find thumbnail source for ' + page_url)
    return THUMB_BASE_URL + match.group(1)
def gen_img_names(iso_month):
    """Yield (iso_date, thumbnail URL) pairs for the days of iso_month.

    iso_month is a 'YYYY-MM' string. Dates are visited in calendar order and
    iteration ends at the first date for which fetch_potd_url returns None.
    """
    year, month = map(int, iso_month.split('-'))
    for iso_date in gen_month_dates(year, month):
        img_url = fetch_potd_url(iso_date)
        if img_url is None:
            # No URL for this date: stop instead of trying later dates.
            return
        yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
    """Download img_url, save it under LOCAL_IMG_PATH and return its size.

    The file is named '<iso_date>__<last path segment of img_url>'. The
    return value is the number of bytes downloaded.
    """
    if verbose:
        print('\t' + img_url)
    with contextlib.closing(urllib.request.urlopen(img_url)) as response:
        data = response.read()
    size = len(data)
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
    basename = img_url.rsplit('/', 1)[-1]
    target = os.path.join(LOCAL_IMG_PATH, iso_date + '__' + basename)
    with io.open(target, 'wb') as out:
        out.write(data)
    return size
def get_images(iso_month, max_count=0):
    """Download up to max_count images for a given month.

    Args:
        iso_month: month to download, as a 'YYYY-MM' string.
        max_count: maximum number of images to fetch; 0 or None means
            no limit.

    Returns:
        A (img_count, total_size) tuple: the number of images saved and the
        sum of their sizes in bytes.
    """
    # Compare by value, not identity: `max_count is 0` relies on CPython's
    # small-int caching and triggers a SyntaxWarning on Python 3.8+. The
    # truthiness test also normalizes None to "no limit".
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in gen_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        # Check right after saving so that no further POTD page is requested
        # once the limit has been reached.
        if img_count == max_count:
            break
    return (img_count, total_size)
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # NOTE: the docstring above is also the argparse description, so it is
    # user-facing text.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('year_month', help='year and month in YYYY-MM format')
    # default=0 matches get_images' own max_count default, so "no limit" is
    # requested explicitly instead of passing None when -q is omitted.
    parser.add_argument('-q', '--max_qty', type=int, default=0,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()
    # Publish the flag to the module-level switch read by the fetch helpers.
    verbose = args.verbose

    # Time the whole download run for the summary line.
    t0 = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - t0
    print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))
# Script entry point: run the downloader only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()