update from Atlas with major reorg
This commit is contained in:
39
attic/concurrency/wikipedia/orig/README.rst
Normal file
39
attic/concurrency/wikipedia/orig/README.rst
Normal file
@@ -0,0 +1,39 @@
|
||||
=====================================
|
||||
Wikipedia Picture of the Day examples
|
||||
=====================================
|
||||
|
||||
These examples use various asynchronous programming techniques to download
|
||||
images and metadata from the English Wikipedia `Picture of the Day`_ archive.
|
||||
|
||||
.. _Picture of the Day: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/Archive
|
||||
|
||||
|
||||
--------
|
||||
Timings
|
||||
--------
|
||||
|
||||
``sync.py``
|
||||
===========
|
||||
|
||||
::
|
||||
|
||||
$ time python sync.py 2014-06 -q 5
|
||||
5 images downloaded (167.8 Kbytes total)
|
||||
|
||||
real 0m6.272s
|
||||
user 0m0.065s
|
||||
sys 0m0.039s
|
||||
|
||||
$ time python sync.py 2014-06 -q 5
|
||||
5 images downloaded (167.8 Kbytes total)
|
||||
|
||||
real 0m5.447s
|
||||
user 0m0.068s
|
||||
sys 0m0.040s
|
||||
|
||||
$ time python sync.py 2014-06 -q 5
|
||||
5 images downloaded (167.8 Kbytes total)
|
||||
|
||||
real 0m6.314s
|
||||
user 0m0.068s
|
||||
sys 0m0.040s
|
||||
36
attic/concurrency/wikipedia/orig/futureprocs.py
Normal file
36
attic/concurrency/wikipedia/orig/futureprocs.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Inspired by example at:
|
||||
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
|
||||
"""
|
||||
|
||||
from concurrent import futures
|
||||
|
||||
import potd
|
||||
|
||||
def save_month(year_month, verbose):
    """Download every Picture of the Day of a month using a process pool.

    year_month is a 'YYYY-MM' string; verbose is forwarded to
    potd.save_one. One task per day of the month is submitted and the
    outcome of each is printed as it completes (regardless of verbose).

    Returns a tuple (img_count, total_size) covering only the downloads
    that finished without raising.
    """
    year, month = [int(s) for s in year_month.split('-')]
    total_size = 0
    img_count = 0
    dates = potd.list_days_of_month(year, month)

    with futures.ProcessPoolExecutor(max_workers=100) as executor:
        # Map each future back to its date so results can be reported
        # in completion order.
        downloads = {executor.submit(potd.save_one, date, verbose): date
                     for date in dates}

        for future in futures.as_completed(downloads):
            date = downloads[future]
            error = future.exception()
            if error is not None:
                print('%r generated an exception: %s' % (date, error))
            else:
                img_size = future.result()
                total_size += img_size
                img_count += 1
                print('%r OK: %r' % (date, img_size))

    return img_count, total_size
|
||||
|
||||
if __name__ == '__main__':
|
||||
potd.main(save_month=save_month)
|
||||
36
attic/concurrency/wikipedia/orig/futurethreads.py
Normal file
36
attic/concurrency/wikipedia/orig/futurethreads.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Inspired by example at:
|
||||
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
|
||||
"""
|
||||
|
||||
from concurrent import futures
|
||||
|
||||
import potd
|
||||
|
||||
def save_month(year_month, verbose):
    """Download every Picture of the Day of a month using a thread pool.

    year_month is a 'YYYY-MM' string; verbose is forwarded to
    potd.save_one. One task per day of the month is submitted and the
    outcome of each is printed as it completes (regardless of verbose).

    Returns a tuple (img_count, total_size) covering only the downloads
    that finished without raising.
    """
    year, month = [int(s) for s in year_month.split('-')]
    total_size = 0
    img_count = 0
    dates = potd.list_days_of_month(year, month)

    with futures.ThreadPoolExecutor(max_workers=100) as executor:
        # Map each future back to its date so results can be reported
        # in completion order.
        downloads = {executor.submit(potd.save_one, date, verbose): date
                     for date in dates}

        for future in futures.as_completed(downloads):
            date = downloads[future]
            error = future.exception()
            if error is not None:
                print('%r generated an exception: %s' % (date, error))
            else:
                img_size = future.result()
                total_size += img_size
                img_count += 1
                print('%r OK: %r' % (date, img_size))

    return img_count, total_size
|
||||
|
||||
if __name__ == '__main__':
|
||||
potd.main(save_month=save_month)
|
||||
100
attic/concurrency/wikipedia/orig/potd.py
Normal file
100
attic/concurrency/wikipedia/orig/potd.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads metadata and
|
||||
images in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
import calendar
|
||||
import datetime
|
||||
import re
|
||||
import os
|
||||
import io
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
import argparse
|
||||
|
||||
SAVE_DIR = 'pictures/'
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
class NoPictureForDate(Exception):
    '''No Picture of the Day found for {day}'''
    # The docstring above is read at runtime through __doc__ and used as
    # a str.format template (placeholder: day) to build the exception
    # message, so its text must not be edited casually.
|
||||
|
||||
def build_page_url(iso_date):
    """Return the POTD template page URL for iso_date.

    The URL is POTD_BASE_URL with iso_date appended unchanged.
    """
    return POTD_BASE_URL + iso_date
|
||||
|
||||
def fetch(url, timeout=30):
    """Perform an HTTP GET on url and return the requests response.

    timeout (seconds) is passed to requests.get so that an unresponsive
    server cannot block the caller forever; without it requests waits
    indefinitely. The status code is not checked here: callers inspect
    response.status_code themselves.
    """
    return requests.get(url, timeout=timeout)
|
||||
|
||||
def extract_image_url(html):
    """Return the absolute URL of the first upload-hosted image in html.

    Looks for the first src="//upload...." attribute (a protocol-relative
    URL) and prefixes it with 'http:'.

    Raises ValueError if html contains no such attribute, instead of
    failing with an AttributeError on a None match.
    """
    match = re.search(r'src="(//upload\..*?)"', html)
    if match is None:
        raise ValueError('no upload image URL found in HTML')
    return 'http:' + match.group(1)
|
||||
|
||||
def format_date(year, month, day):
    """Return the date as 'YYYY-MM-DD', zero-padding month and day.

    The year is inserted as-is (no padding).
    """
    # Explicit keyword arguments instead of **locals(): the template no
    # longer depends on whatever local names happen to exist.
    return '{year}-{month:02d}-{day:02d}'.format(year=year, month=month,
                                                 day=day)
|
||||
|
||||
def list_days_of_month(year, month):
    """Return every date of the given month as 'YYYY-MM-DD' strings.

    The list is ordered from the first to the last day; the month length
    comes from calendar.monthrange, so leap years are handled.
    """
    lastday = calendar.monthrange(year, month)[1]
    return [format_date(year, month, day) for day in range(1, lastday + 1)]
|
||||
|
||||
def build_save_path(iso_date, url):
    """Return the local path where the image at url is saved.

    The file name is '<iso_date>_<last path component of url>' and it is
    placed under SAVE_DIR.
    """
    filename = os.path.basename(url)
    return os.path.join(SAVE_DIR, iso_date + '_' + filename)
|
||||
|
||||
def save_one(iso_date, verbose):
    """Download the Picture of the Day for iso_date and save it locally.

    Fetches the POTD page, extracts the image URL from it, downloads the
    image and writes it to the path given by build_save_path. When
    verbose is true the destination path is printed first.

    Returns the size of the saved image in bytes.
    Raises NoPictureForDate if either the page or the image request does
    not answer with HTTP status 200.
    """
    response = fetch(build_page_url(iso_date))
    if response.status_code != 200:
        raise NoPictureForDate(NoPictureForDate.__doc__.format(day=iso_date))

    img_url = extract_image_url(response.text)
    response = fetch(img_url)
    if response.status_code != 200:
        # Without this check the body of an HTTP error response would be
        # written to disk under an image file name.
        raise NoPictureForDate(NoPictureForDate.__doc__.format(day=iso_date))

    path = build_save_path(iso_date, img_url)
    save_dir = os.path.dirname(path)
    if save_dir and not os.path.isdir(save_dir):
        try:
            os.makedirs(save_dir)
        except OSError:
            # A concurrent worker may have created it in the meantime.
            if not os.path.isdir(save_dir):
                raise

    if verbose:
        print('saving: '+path)
    with io.open(path, 'wb') as fp:
        fp.write(response.content)
    return len(response.content)
|
||||
|
||||
def save_month(year_month, verbose):
    """Download the pictures of a whole month, one after the other.

    year_month is a 'YYYY-MM' string; verbose is forwarded to save_one.
    Dates for which save_one raises NoPictureForDate are skipped; any
    other exception propagates.

    Returns a tuple (img_count, total_size) for the saved images.
    """
    year, month = [int(s) for s in year_month.split('-')]
    total_size = 0
    img_count = 0

    for date in list_days_of_month(year, month):
        try:
            total_size += save_one(date, verbose)
        except NoPictureForDate:
            continue
        img_count += 1
    return img_count, total_size
|
||||
|
||||
def main(save_one=save_one, save_month=save_month):
    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
    # The docstring above doubles as the argparse description, so it is
    # kept on one line and unchanged.
    #
    # save_one / save_month are injectable so that other scripts can reuse
    # this command-line front end with their own download strategy.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
    # NOTE(review): max_qty is parsed but never used below.
    parser.add_argument('-q', '--max_qty', type=int,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()

    t0 = time.time()
    # A full 'YYYY-MM-DD' argument selects a single day; anything else is
    # handed to save_month.
    if len(args.date) == len('YYYY-MM-DD'):
        img_count = 1
        total_size = save_one(args.date, args.verbose)
    else:
        img_count, total_size = save_month(args.date, args.verbose)
    elapsed = time.time() - t0
    print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
96
attic/concurrency/wikipedia/orig/potd_tests.py
Normal file
96
attic/concurrency/wikipedia/orig/potd_tests.py
Normal file
@@ -0,0 +1,96 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import potd
|
||||
|
||||
class TestSequenceFunctions(unittest.TestCase):
    """Tests for the potd helpers.

    The fetch tests call potd.fetch on real URLs and check the response,
    so they depend on those URLs being reachable.
    """

    def setUp(self):
        # Thumbnail URL expected from the HTML fixture defined in this module.
        self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
                          """commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
                          """-Orthographic_projection_SW.jpg""")

    def test_build_page_url(self):
        date = '2014-05-01'
        result = potd.build_page_url(date)
        self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')

    def test_fetch_status_code(self):
        date = '2014-05-02'
        url = potd.build_page_url(date)
        response = potd.fetch(url)
        self.assertEqual(response.status_code, 200)

    def test_fetch_status_code_not_found(self):
        date = '2100-01-01'
        url = potd.build_page_url(date)
        response = potd.fetch(url)
        self.assertEqual(response.status_code, 404)

    def test_extract_image_url(self):
        image_url = potd.extract_image_url(HTML)
        self.assertEqual(image_url, self.thumb_url)

    def test_fetch_image_jpeg(self):
        response = potd.fetch(self.thumb_url)
        self.assertEqual(response.headers['content-type'], 'image/jpeg')

    def test_list_days_of_month(self):
        year = 2014
        month = 5
        days = potd.list_days_of_month(year, month)
        self.assertEqual(len(days), 31)
        self.assertEqual('2014-05-01', days[0])
        self.assertEqual('2014-05-31', days[-1])

    def test_list_days_of_february(self):
        year = 2014
        month = 2
        days = potd.list_days_of_month(year, month)
        self.assertEqual(len(days), 28)
        self.assertEqual('2014-02-01', days[0])
        self.assertEqual('2014-02-28', days[-1])

    def test_format_date(self):
        year = 2014
        month = 2
        day = 1
        a_date = '2014-02-01'
        date = potd.format_date(year, month, day)
        self.assertEqual(a_date, date)
        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')

    def test_build_save_path(self):
        date = '2014-06-04'
        path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
        self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
|
||||
|
||||
|
||||
HTML = (
|
||||
'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
|
||||
title="Orthographic projection"><img alt="Orthographic projection"
|
||||
src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
|
||||
'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
|
||||
width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
|
||||
'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
|
||||
Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
|
||||
commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
|
||||
Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
|
||||
height="2058"></a></td>
|
||||
''')
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
115
attic/concurrency/wikipedia/orig/sync.py
Normal file
115
attic/concurrency/wikipedia/orig/sync.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads images and metadata
|
||||
in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import argparse
|
||||
import datetime
|
||||
import urllib2
|
||||
import contextlib
|
||||
import time
|
||||
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
|
||||
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
|
||||
|
||||
LOCAL_IMG_PATH = 'pictures/'
|
||||
|
||||
verbose = True
|
||||
|
||||
|
||||
class ParsingException(ValueError):
    """Raised if unable to parse POTD MediaWiki source"""
|
||||
|
||||
|
||||
def fetch_potd_url(iso_date):
    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format).

    Downloads the POTD page for the date and searches it with
    THUMB_SRC_RE; the captured path is appended to THUMB_BASE_URL.

    Raises ParsingException if the page contains no thumbnail source.
    Errors raised by urllib2.urlopen are not handled here.
    """
    potd_url = POTD_BASE_URL + iso_date
    with contextlib.closing(urllib2.urlopen(potd_url)) as fp:
        html = fp.read()
    thumb_src = THUMB_SRC_RE.search(html)
    if not thumb_src:
        raise ParsingException('cannot find thumbnail source for ' + potd_url)
    return THUMB_BASE_URL + thumb_src.group(1)
|
||||
|
||||
|
||||
def gen_month_days(year, month):
    """Yield every datetime.date of the given month, in ascending order."""
    a_date = datetime.date(year, month, 1)
    one_day = datetime.timedelta(days=1)
    # Step day by day until the date rolls over into the next month.
    while a_date.month == month:
        yield a_date
        a_date += one_day
|
||||
|
||||
|
||||
def get_img_names(iso_month):
    """Yield (iso_date, img_url) pairs for iso_month ('YYYY-MM' format).

    Each day's thumbnail URL is obtained with fetch_potd_url. Iteration
    stops at the first day whose request raises urllib2.HTTPError; later
    days of the month are not tried. When the module-level verbose flag
    is set, each date is printed before it is fetched.
    """
    year, month = (int(part) for part in iso_month.split('-'))
    for day in gen_month_days(year, month):
        iso_date = '{:%Y-%m-%d}'.format(day)
        if verbose:
            print(iso_date)
        try:
            img_url = fetch_potd_url(iso_date)
        except urllib2.HTTPError:
            break
        yield (iso_date, img_url)
|
||||
|
||||
|
||||
def fetch_image(iso_date, img_url):
    """Download img_url and save it under LOCAL_IMG_PATH.

    The file is named '<iso_date>__<last segment of img_url>'. Progress
    is printed when the module-level verbose flag is set.

    Returns the number of bytes downloaded.
    """
    if verbose:
        print('\t' + img_url)
    with contextlib.closing(urllib2.urlopen(img_url)) as fp:
        img = fp.read()
    img_filename = iso_date + '__' + img_url.split('/')[-1]
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
    img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
    with io.open(img_path, 'wb') as fp:
        fp.write(img)
    return len(img)
|
||||
|
||||
|
||||
def get_images(iso_month, max_count=0):
    """Download up to max_count images for iso_month ('YYYY-MM' format).

    A max_count of 0 or None means "no limit". None is what argparse
    supplies when -q is omitted; the former `max_count is 0` identity
    test only handled it by accident.

    Returns a tuple (img_count, total_size).
    """
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in get_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        if img_count == max_count:
            break

    return (img_count, total_size)
|
||||
|
||||
|
||||
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # The docstring above doubles as the argparse description, so it is
    # kept on one line and unchanged.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('year_month', help='year and month in YYYY-MM format')
    parser.add_argument('-q', '--max_qty', type=int,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()
    # The helper functions read this module-level flag.
    verbose = args.verbose
    t0 = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - t0
    print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
118
attic/concurrency/wikipedia/orig/sync_py3.py
Normal file
118
attic/concurrency/wikipedia/orig/sync_py3.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads images and metadata
|
||||
in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import argparse
|
||||
import datetime
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import contextlib
|
||||
import time
|
||||
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
|
||||
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
|
||||
|
||||
LOCAL_IMG_PATH = 'pictures/'
|
||||
|
||||
verbose = True
|
||||
|
||||
|
||||
class ParsingException(ValueError):
    """Raised if unable to parse POTD MediaWiki source"""
|
||||
|
||||
|
||||
def gen_month_dates(year, month):
    """Produce all dates in a given year, month

    Dates are yielded in ascending order as 'YYYY-MM-DD' strings.
    """
    a_date = datetime.date(year, month, 1)
    one_day = datetime.timedelta(days=1)
    # Step day by day until the date rolls over into the next month.
    while a_date.month == month:
        yield '{:%Y-%m-%d}'.format(a_date)
        a_date += one_day
|
||||
|
||||
|
||||
def fetch_potd_url(iso_date):
    """Fetch POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)

    Returns the thumbnail URL, or None if requesting the POTD page raises
    urllib.error.HTTPError. Raises ParsingException if the page is
    retrieved but contains no thumbnail source. The date is printed first
    when the module-level verbose flag is set.
    """
    if verbose:
        print(iso_date)
    potd_url = POTD_BASE_URL + iso_date
    # Only the network access is guarded; parsing happens outside the try
    # so the handler covers just the lines that can raise HTTPError.
    try:
        with urllib.request.urlopen(potd_url) as fp:
            html = fp.read().decode('utf-8')
    except urllib.error.HTTPError:
        return None
    thumb_src = THUMB_SRC_RE.search(html)
    if not thumb_src:
        raise ParsingException('cannot find thumbnail source for ' + potd_url)
    return THUMB_BASE_URL + thumb_src.group(1)
|
||||
|
||||
|
||||
def gen_img_names(iso_month):
    """Produce (iso_date, img_url) pairs by fetching POTD metadata

    iso_month is a 'YYYY-MM' string. Iteration stops at the first date
    for which fetch_potd_url returns None; later days are not tried.
    """
    year, month = (int(part) for part in iso_month.split('-'))
    for iso_date in gen_month_dates(year, month):
        img_url = fetch_potd_url(iso_date)
        if img_url is None:
            break
        yield (iso_date, img_url)
|
||||
|
||||
|
||||
def fetch_image(iso_date, img_url):
    """Fetch and save image data for date and url

    The file is written under LOCAL_IMG_PATH and named
    '<iso_date>__<last segment of img_url>'. Progress is printed when the
    module-level verbose flag is set.

    Returns the number of bytes downloaded.
    """
    if verbose:
        print('\t' + img_url)
    # Plain `with`, as in fetch_potd_url: the response object is already a
    # context manager, so contextlib.closing is not needed.
    with urllib.request.urlopen(img_url) as fp:
        img = fp.read()
    img_filename = iso_date + '__' + img_url.split('/')[-1]
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
    img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
    with io.open(img_path, 'wb') as fp:
        fp.write(img)
    return len(img)
|
||||
|
||||
|
||||
def get_images(iso_month, max_count=0):
    """Download up to max_count images for a given month

    A max_count of 0 or None means "no limit". None is what argparse
    supplies when -q is omitted; the former `max_count is 0` identity
    test only handled it by accident and triggers a SyntaxWarning on
    recent Python versions.

    Returns a tuple (img_count, total_size).
    """
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in gen_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        if img_count == max_count:
            break

    return (img_count, total_size)
|
||||
|
||||
|
||||
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # The docstring above doubles as the argparse description, so it is
    # kept on one line and unchanged.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('year_month', help='year and month in YYYY-MM format')
    parser.add_argument('-q', '--max_qty', type=int,
                        help='maximum number of files to download')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='display progress information')
    args = parser.parse_args()
    # The helper functions read this module-level flag.
    verbose = args.verbose
    t0 = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - t0
    print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
          (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user