wikipedia pictures download example
This commit is contained in:
184
concurrency/wikipedia/daypicts.py
Normal file
184
concurrency/wikipedia/daypicts.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Note:
|
||||
The earliest Pictures of the Day I've found are in this page:
|
||||
|
||||
http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/May_2004
|
||||
|
||||
However, I have not found Template:POTD/YYYY-MM-DD pages earlier
|
||||
than this:
|
||||
|
||||
http://en.wikipedia.org/wiki/Template:POTD/2007-01-01
|
||||
|
||||
For simplicity, this script only retrieves pictures starting
|
||||
from 2007-01-01.
|
||||
|
||||
"""
|
||||
import sys
|
||||
import argparse
|
||||
import re
|
||||
import imghdr
|
||||
import time
|
||||
import datetime
|
||||
|
||||
import requests
|
||||
|
||||
SAVE_DIR = 'pictures/'
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
|
||||
PODT_EARLIEST_TEMPLATE = '2007-01-01'
|
||||
|
||||
RE_YEAR = r'([12]\d{3})'
|
||||
RE_MONTH = RE_YEAR + r'-([01]\d)'
|
||||
RE_DATE = RE_MONTH + r'-([0-3]\d)'
|
||||
ISO_DATE_FMT = '%Y-%m-%d'
|
||||
|
||||
DATEFORMS = [
|
||||
('date', re.compile('^' + RE_DATE + '$')),
|
||||
('month', re.compile('^' + RE_MONTH + '$')),
|
||||
('year', re.compile('^' + RE_YEAR + '$'))
|
||||
]
|
||||
|
||||
|
||||
class NoPictureForDate(Exception):
    """No Picture of the Day found for {iso_date}"""
    # Raised by get_picture_url() when the POTD template page for a date
    # contains no link to an uploaded image; the ISO date is passed as the
    # only exception argument.
|
||||
|
||||
|
||||
class NoPictureTemplateBefore(ValueError):
    """Template:POTD did not exist before PODT_EARLIEST_TEMPLATE"""
    # Raised by validate_date() for dates that precede the earliest known
    # Template:POTD page.  Being a ValueError, it is also caught by code
    # that handles generic date-validation errors.
|
||||
|
||||
|
||||
def get_picture_url(iso_date):
    """Return the URL of the Picture of the Day for ``iso_date``.

    The Template:POTD page for the date is downloaded and searched for the
    first ``src`` attribute that points to an ``upload.*`` host.

    Raises:
        NoPictureForDate: if the page contains no such image reference.
    """
    response = requests.get(POTD_BASE_URL + iso_date)
    match = POTD_IMAGE_RE.search(response.text)
    if match is not None:
        # the captured URL is protocol-relative ("//upload..."), so a
        # scheme is prepended to make it absolute
        return 'http:' + match.group(1)
    raise NoPictureForDate(iso_date)
|
||||
|
||||
|
||||
def get_picture(iso_date):
    """Download the Picture of the Day for ``iso_date``.

    Returns the raw bytes of the HTTP response body.  NoPictureForDate
    propagates from get_picture_url() when the date has no picture.
    """
    image_response = requests.get(get_picture_url(iso_date))
    return image_response.content
|
||||
|
||||
|
||||
def get_picture_type(octets):
    """Guess the image format of the bytes in ``octets``.

    Returns the type name reported by ``imghdr`` (e.g. ``'gif'``), ``'svg'``
    when the data looks like an SVG document, or None when unrecognised.
    """
    detected = imghdr.what(None, octets)
    if detected is not None:
        return detected
    # imghdr does not know SVG, so apply a simple heuristic: markup at the
    # very start, an <svg tag near the beginning and a closing tag at the end
    looks_like_svg = (octets.startswith(b'<')
                      and b'<svg' in octets[:200]
                      and octets.rstrip().endswith(b'</svg>'))
    return 'svg' if looks_like_svg else None
|
||||
|
||||
|
||||
def validate_date(text):
    """Validate a date given as YYYY, YYYY-MM or YYYY-MM-DD.

    Month and day may be written without a leading zero ('2015-1-2').

    Returns:
        The normalised, zero-padded ISO string with the same precision as
        the input: 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD'.

    Raises:
        ValueError: if ``text`` is not made of one to three integer parts
            or does not denote a real calendar date.
        NoPictureTemplateBefore: if the first day of the requested period
            is earlier than PODT_EARLIEST_TEMPLATE.
    """
    format_error = 'date must use YYYY, YYYY-MM or YYYY-MM-DD format'
    try:
        parts = [int(part) for part in text.split('-')]
    except ValueError:
        raise ValueError(format_error) from None
    if len(parts) > 3:
        # extra fields would otherwise be read as hour, minute, ...
        raise ValueError(format_error)

    # missing month/day default to 1, i.e. the first day of the period
    padded = parts + [1] * (3 - len(parts))
    first_day = datetime.date(*padded)  # ValueError for impossible dates
    # isoformat() always zero-pads the year to four digits, unlike
    # strftime('%Y') on some platforms, so string comparison is reliable
    full_iso = first_day.isoformat()
    # compare the FULL date: a truncated string such as '2007' sorts before
    # '2007-01-01' and would be rejected by mistake
    if full_iso < PODT_EARLIEST_TEMPLATE:
        raise NoPictureTemplateBefore(PODT_EARLIEST_TEMPLATE)
    # keep only as many fields as the caller supplied (4, 7 or 10 chars)
    return full_iso[:1 + len(parts) * 3]
|
||||
|
||||
|
||||
def gen_month_dates(iso_month):
    """Yield every day of ``iso_month`` ('YYYY-MM') as an ISO date string.

    Dates are produced in ascending order, from day 01 to the last day of
    the month.
    """
    start = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT)
    step = datetime.timedelta(days=1)
    current = start
    while True:
        if current.month != start.month:
            # stepped into the following month: done
            return
        yield current.strftime(ISO_DATE_FMT)
        current = current + step
|
||||
|
||||
|
||||
def gen_year_dates(iso_year):
    """Yield every day of ``iso_year`` ('YYYY') as an ISO date string.

    The work is delegated to gen_month_dates(), one month at a time.
    """
    for month in range(1, 13):
        iso_month = iso_year + '-' + format(month, '02d')
        for iso_date in gen_month_dates(iso_month):
            yield iso_date
|
||||
|
||||
|
||||
def gen_dates(iso_parts):
    """Yield the ISO dates covered by ``iso_parts``.

    The precision is inferred from the string length: 4 characters mean a
    whole year, 7 a whole month; anything else is yielded unchanged as a
    single date.
    """
    size = len(iso_parts)
    if size not in (4, 7):
        yield iso_parts
        return
    expand = gen_year_dates if size == 4 else gen_month_dates
    yield from expand(iso_parts)
|
||||
|
||||
|
||||
def parse_args(argv):
    """Parse the command line and expand the date argument.

    Args:
        argv: list of command-line arguments, without the program name.

    Returns:
        A ``(dates, args)`` tuple: the list of ISO dates to process and the
        argparse namespace.

    Exits with status 2, after printing the error and the usage line, when
    validate_date() rejects the date argument.
    """
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        'date', help='YYYY-MM-DD or YYYY-MM or YYYY: year, month and day')
    parser.add_argument(
        '-q', '--max_qty', type=int,
        help='maximum number of items to fetch')
    parser.add_argument(
        '-u', '--url_only', action='store_true',
        help='get picture URLS only')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display progress information')
    args = parser.parse_args(argv)

    try:
        iso_parts = validate_date(args.date)
    except ValueError as exc:
        print('error:', exc.args[0])
        parser.print_usage()
        sys.exit(2)

    dates = list(gen_dates(iso_parts))
    if args.verbose and len(dates) == 1:
        print('-> Date: ', dates[0])
    elif args.verbose:
        summary = '-> {} days: {}...{}'.format(len(dates), dates[0], dates[-1])
        print(summary)

    return dates, args
|
||||
|
||||
|
||||
def get_picture_urls(dates, verbose=False):
    """Fetch the picture URL for each date, one after the other.

    Each URL found is printed as it is obtained: the bare URL by default
    or, when ``verbose`` is true, a running count followed by the file
    name.  Dates without a picture are skipped (and reported only when
    ``verbose`` is true).

    Returns:
        The list of URLs found, in the order of ``dates``.
    """
    found = []
    for iso_date in dates:
        try:
            url = get_picture_url(iso_date)
        except NoPictureForDate as exc:
            if verbose:
                print('*** {!r} ***'.format(exc))
        else:
            if verbose:
                # the running count includes the URL just obtained
                print(format(len(found) + 1, '3d'), end=' ')
                print(url.split('/')[-1])
            else:
                print(url)
            found.append(url)
    return found
|
||||
|
||||
|
||||
def main(argv, get_picture_urls):
    """Get Wikipedia "Picture of The Day" for date, month or year"""
    # NOTE: the docstring above is shown verbatim as the argparse
    # description (see parse_args), so it is kept short.
    #
    # argv: command-line arguments without the program name.
    # get_picture_urls: callable taking (dates, verbose) and returning the
    #     list of URLs found; lets each script plug in its own strategy.
    dates, args = parse_args(argv)

    started = time.time()
    urls = get_picture_urls(dates, args.verbose)
    elapsed = time.time() - started

    if not args.verbose:
        return
    summary = '-> found: {} pictures | elapsed time: {:.2f}s'
    print(summary.format(len(urls), elapsed))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:], get_picture_urls)
|
||||
61
concurrency/wikipedia/daypicts_asyncio.py
Normal file
61
concurrency/wikipedia/daypicts_asyncio.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
"""
|
||||
|
||||
import sys
|
||||
import asyncio
|
||||
import aiohttp
|
||||
|
||||
from daypicts import main
|
||||
from daypicts import NoPictureForDate
|
||||
from daypicts import POTD_BASE_URL
|
||||
from daypicts import POTD_IMAGE_RE
|
||||
|
||||
GLOBAL_TIMEOUT = 300 # seconds
|
||||
|
||||
|
||||
# NOTE(review): generator-based coroutines (@asyncio.coroutine) were removed
# in Python 3.11; this code targets the older asyncio/aiohttp APIs.
@asyncio.coroutine
def get_picture_url(iso_date):
    """Coroutine: return the URL of the Picture of the Day for ``iso_date``.

    Downloads the Template:POTD page for the date and extracts the first
    image URL matched by POTD_IMAGE_RE.

    Raises:
        NoPictureForDate: if the page contains no matching image reference.
    """
    response = yield from aiohttp.request('GET', POTD_BASE_URL + iso_date)
    html = yield from response.text()
    match = POTD_IMAGE_RE.search(html)
    if match is not None:
        # the captured URL is protocol-relative, so prepend a scheme
        return 'http:' + match.group(1)
    raise NoPictureForDate(iso_date)
|
||||
|
||||
|
||||
@asyncio.coroutine
def get_picture_urls(dates, verbose=False):
    """Coroutine: fetch the picture URLs for all ``dates`` concurrently.

    One get_picture_url() coroutine is created per date and results are
    handled in completion order, under an overall GLOBAL_TIMEOUT.  Each URL
    is printed as it arrives (count and file name when ``verbose``).

    Returns:
        The list of URLs found, in completion order.
    """
    jobs = [get_picture_url(iso_date) for iso_date in dates]
    found = []
    # handle each result as soon as its job is done
    for next_done in asyncio.as_completed(jobs, timeout=GLOBAL_TIMEOUT):
        try:
            url = yield from next_done
        except NoPictureForDate as exc:
            if verbose:
                print('*** {!r} ***'.format(exc))
            continue
        except aiohttp.ClientResponseError as exc:
            # reported regardless of the verbose flag
            print('****** {!r} ******'.format(exc))
            continue
        if verbose:
            print(format(len(found) + 1, '3d'), end=' ')
            print(url.split('/')[-1])
        else:
            print(url)
        found.append(url)
    return found
|
||||
|
||||
|
||||
def run_loop(dates, verbose=False):
    """Run the get_picture_urls() coroutine to completion.

    Synchronous adapter with the ``(dates, verbose)`` signature that
    daypicts.main() calls; returns the list of URLs found.
    """
    event_loop = asyncio.get_event_loop()
    pending = get_picture_urls(dates, verbose)
    return event_loop.run_until_complete(pending)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:], run_loop)
|
||||
45
concurrency/wikipedia/daypicts_threads.py
Normal file
45
concurrency/wikipedia/daypicts_threads.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
"""
|
||||
|
||||
import sys
|
||||
from concurrent import futures
|
||||
|
||||
from daypicts import main, get_picture_url, NoPictureForDate
|
||||
|
||||
MAX_NUM_THREADS = 400
|
||||
GLOBAL_TIMEOUT = 300 # seconds
|
||||
|
||||
|
||||
def get_picture_urls(dates, verbose=False):
    """Fetch the picture URL for each date using a pool of threads.

    One get_picture_url() job is submitted per date, with at most
    MAX_NUM_THREADS worker threads, and results are handled in completion
    order.  Each URL is printed as it arrives: the bare URL by default or,
    when ``verbose`` is true, a running count followed by the file name.
    Dates without a picture are skipped.

    Args:
        dates: sized collection of ISO date strings.
        verbose: if true, print progress details.

    Returns:
        The list of URLs found, in completion order (empty for no dates).

    Raises:
        concurrent.futures.TimeoutError: if the jobs are not all done
            within GLOBAL_TIMEOUT seconds.
    """
    if not dates:
        # ThreadPoolExecutor raises ValueError for max_workers=0
        return []

    num_threads = min(len(dates), MAX_NUM_THREADS)
    urls = []
    count = 0

    # The with-block guarantees the pool is shut down and its threads are
    # released; note that leaving it waits for in-flight jobs to finish.
    with futures.ThreadPoolExecutor(num_threads) as pool:
        pending = {pool.submit(get_picture_url, date): date
                   for date in dates}

        # get results as jobs are done
        for job in futures.as_completed(pending, timeout=GLOBAL_TIMEOUT):
            try:
                url = job.result()
            except NoPictureForDate as exc:
                if verbose:
                    print('*** {!r} ***'.format(exc))
                continue
            count += 1
            if verbose:
                print(format(count, '3d'), end=' ')
                print(url.split('/')[-1])
            else:
                print(url)
            urls.append(url)
    return urls
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1:], get_picture_urls)
|
||||
4
concurrency/wikipedia/fast_tests.sh
Executable file
4
concurrency/wikipedia/fast_tests.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash

# Run the fast tests only, skipping those marked with @pytest.mark.network.
# Every extra command-line argument is forwarded to py.test unchanged:
# "$@" keeps arguments containing spaces intact and is not limited to three.
py.test test_daypicts.py -m 'not network' "$@"
|
||||
36
concurrency/wikipedia/orig/futureprocs.py
Normal file
36
concurrency/wikipedia/orig/futureprocs.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Inspired by example at:
|
||||
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
|
||||
"""
|
||||
|
||||
from concurrent import futures
|
||||
|
||||
import potd
|
||||
|
||||
def save_month(year_month, verbose):
    """Download all Pictures of the Day of a month using worker processes.

    Args:
        year_month: month to download, in 'YYYY-MM' format.
        verbose: passed on to potd.save_one() for each date.

    Returns:
        An ``(img_count, total_size)`` tuple: the number of images saved
        and the sum of the sizes returned by potd.save_one().

    A failed download is reported on standard output and does not stop
    the others.
    """
    year, month = [int(s) for s in year_month.split('-')]
    total_size = 0
    img_count = 0
    dates = potd.list_days_of_month(year, month)

    # One worker per date is all that can be used.  Capping the pool at the
    # number of dates also avoids asking for more than 61 workers, which
    # ProcessPoolExecutor rejects with ValueError on Windows.
    num_workers = max(1, min(len(dates), 100))
    with futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        downloads = {executor.submit(potd.save_one, date, verbose): date
                     for date in dates}

        for future in futures.as_completed(downloads):
            date = downloads[future]
            error = future.exception()
            if error is not None:
                print('%r generated an exception: %s' % (date, error))
            else:
                img_size = future.result()
                total_size += img_size
                img_count += 1
                print('%r OK: %r' % (date, img_size))

    return img_count, total_size
|
||||
|
||||
if __name__ == '__main__':
|
||||
potd.main(save_month=save_month)
|
||||
36
concurrency/wikipedia/orig/futurethreads.py
Normal file
36
concurrency/wikipedia/orig/futurethreads.py
Normal file
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Inspired by example at:
|
||||
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
|
||||
"""
|
||||
|
||||
from concurrent import futures
|
||||
|
||||
import potd
|
||||
|
||||
def save_month(year_month, verbose):
    """Download all Pictures of the Day of a month using a thread pool.

    ``year_month`` is in 'YYYY-MM' format; ``verbose`` is passed on to
    potd.save_one().  Returns an ``(img_count, total_size)`` tuple.  A
    failed download is reported on standard output and does not stop the
    others.
    """
    year, month = [int(s) for s in year_month.split('-')]
    dates = potd.list_days_of_month(year, month)
    img_count = 0
    total_size = 0

    with futures.ThreadPoolExecutor(max_workers=100) as executor:
        # map each future back to its date for reporting
        downloads = {}
        for date in dates:
            job = executor.submit(potd.save_one, date, verbose)
            downloads[job] = date

        for future in futures.as_completed(downloads):
            date = downloads[future]
            error = future.exception()
            if error is None:
                img_size = future.result()
                total_size += img_size
                img_count += 1
                print('%r OK: %r' % (date, img_size))
            else:
                print('%r generated an exception: %s' % (date, error))

    return img_count, total_size
|
||||
|
||||
if __name__ == '__main__':
|
||||
potd.main(save_month=save_month)
|
||||
100
concurrency/wikipedia/orig/potd.py
Normal file
100
concurrency/wikipedia/orig/potd.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads metadata and
|
||||
images in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
import calendar
|
||||
import datetime
|
||||
import re
|
||||
import os
|
||||
import io
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
import argparse
|
||||
|
||||
SAVE_DIR = 'pictures/'
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
class NoPictureForDate(Exception):
    """No Picture of the Day found for {day}"""
    # NOTE: the docstring doubles as the message template: save_one()
    # formats it with ``day=<iso date>``, so its text must not be changed
    # casually.
|
||||
|
||||
def build_page_url(iso_date):
    """Return the URL of the Template:POTD page for ``iso_date``."""
    page_url = POTD_BASE_URL + iso_date
    return page_url
|
||||
|
||||
def fetch(url):
    """Perform an HTTP GET on ``url`` and return the ``requests`` response.

    The status code is not checked here; callers inspect the response.
    """
    return requests.get(url)
|
||||
|
||||
def extract_image_url(html):
    """Return the absolute URL of the first uploaded image found in ``html``.

    The first ``src`` attribute pointing to an ``upload.*`` host is used.

    Raises:
        NoPictureForDate: if ``html`` contains no such image reference
            (instead of failing with an AttributeError on ``None``), so
            callers that skip days without a picture handle this case too.
    """
    re_image = r'src="(//upload\..*?)"'
    match = re.search(re_image, html)
    if match is None:
        raise NoPictureForDate('no picture URL found in page HTML')
    # the captured URL is protocol-relative, so prepend a scheme
    return 'http:' + match.group(1)
|
||||
|
||||
def format_date(year, month, day):
    """Return the date as a string with zero-padded month and day."""
    return '{0}-{1:02d}-{2:02d}'.format(year, month, day)
|
||||
|
||||
def list_days_of_month(year, month):
    """Return the formatted dates of all days of the month, in order."""
    _, last_day = calendar.monthrange(year, month)
    days = []
    for day in range(1, last_day + 1):
        days.append(format_date(year, month, day))
    return days
|
||||
|
||||
def build_save_path(iso_date, url):
    """Return the local path where the image at ``url`` is saved.

    The file name is the last component of ``url`` prefixed with the date
    and an underscore, inside SAVE_DIR.
    """
    filename = os.path.basename(url)
    return os.path.join(SAVE_DIR, iso_date+'_'+filename)
|
||||
|
||||
def save_one(iso_date, verbose):
    """Download and save the Picture of the Day for ``iso_date``.

    Returns the size in bytes of the image written.  When ``verbose`` is
    true the destination path is printed before writing.

    Raises:
        NoPictureForDate: if the POTD page request does not return
            status 200.
    """
    page = fetch(build_page_url(iso_date))
    if page.status_code != 200:
        raise NoPictureForDate(NoPictureForDate.__doc__.format(day=iso_date))

    img_url = extract_image_url(page.text)
    image = fetch(img_url)
    path = build_save_path(iso_date, img_url)
    if verbose:
        print('saving: '+path)
    with io.open(path, 'wb') as fp:
        fp.write(image.content)
    return len(image.content)
|
||||
|
||||
def save_month(year_month, verbose):
    """Download, one by one, all Pictures of the Day of ``year_month``.

    ``year_month`` is in 'YYYY-MM' format.  Days for which save_one()
    raises NoPictureForDate are skipped.

    Returns an ``(img_count, total_size)`` tuple.
    """
    year, month = (int(s) for s in year_month.split('-'))
    sizes = []
    for date in list_days_of_month(year, month):
        try:
            sizes.append(save_one(date, verbose))
        except NoPictureForDate:
            continue
    return len(sizes), sum(sizes)
|
||||
|
||||
def main(save_one=save_one, save_month=save_month):
    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
    # NOTE: the docstring above is shown verbatim as the argparse
    # description.  ``save_one`` and ``save_month`` are injectable so other
    # scripts can plug in their own download strategy.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        'date', help='year, month and (optional) day in YYYY-MM-DD format')
    parser.add_argument(
        '-q', '--max_qty', type=int,
        help='maximum number of files to download')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display progress information')
    args = parser.parse_args()

    started = time.time()
    # a full date has the same length as 'YYYY-MM-DD'; anything else is
    # handled as a month
    is_single_day = len(args.date) == len('YYYY-MM-DD')
    if is_single_day:
        img_count, total_size = 1, save_one(args.date, args.verbose)
    else:
        img_count, total_size = save_month(args.date, args.verbose)
    elapsed = time.time() - started

    report = "images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds"
    print(report % (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
96
concurrency/wikipedia/orig/potd_tests.py
Normal file
96
concurrency/wikipedia/orig/potd_tests.py
Normal file
@@ -0,0 +1,96 @@
|
||||
|
||||
import unittest
|
||||
|
||||
import potd
|
||||
|
||||
class TestSequenceFunctions(unittest.TestCase):
    """Tests for the helper functions of the ``potd`` module.

    The ``test_fetch_*`` cases call ``potd.fetch`` on real URLs, so they
    presumably require network access.
    """

    def setUp(self):
        # thumbnail URL that the sample HTML fragment refers to
        self.thumb_url = (
            'http://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/'
            'Orthographic_projection_SW.jpg/'
            '350px-Orthographic_projection_SW.jpg')

    def test_buid_page_url(self):
        expected = 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01'
        self.assertEqual(potd.build_page_url('2014-05-01'), expected)

    def test_fetch_status_code(self):
        page_url = potd.build_page_url('2014-05-02')
        self.assertEqual(potd.fetch(page_url).status_code, 200)

    def test_fetch_status_code_not_found(self):
        page_url = potd.build_page_url('2100-01-01')
        self.assertEqual(potd.fetch(page_url).status_code, 404)

    def test_extract_image_url(self):
        self.assertEqual(potd.extract_image_url(HTML), self.thumb_url)

    def test_fetch_image_jpeg(self):
        headers = potd.fetch(self.thumb_url).headers
        self.assertEqual(headers['content-type'], 'image/jpeg')

    def test_list_days_of_month(self):
        days = potd.list_days_of_month(2014, 5)
        self.assertEqual(len(days), 31)
        self.assertEqual('2014-05-01', days[0])
        self.assertEqual('2014-05-31', days[-1])

    def test_list_days_of_february(self):
        days = potd.list_days_of_month(2014, 2)
        self.assertEqual(len(days), 28)
        self.assertEqual('2014-02-01', days[0])
        self.assertEqual('2014-02-28', days[-1])

    def test_format_date(self):
        self.assertEqual('2014-02-01', potd.format_date(2014, 2, 1))
        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')

    def test_build_save_path(self):
        date = '2014-06-04'
        expected = (potd.SAVE_DIR + date
                    + '_350px-Orthographic_projection_SW.jpg')
        self.assertEqual(expected, potd.build_save_path(date, self.thumb_url))
|
||||
|
||||
|
||||
HTML = (
|
||||
'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
|
||||
title="Orthographic projection"><img alt="Orthographic projection"
|
||||
src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
|
||||
'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
|
||||
width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
|
||||
'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
|
||||
Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
|
||||
commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
|
||||
Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
|
||||
height="2058"></a></td>
|
||||
''')
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
115
concurrency/wikipedia/sync.py
Normal file
115
concurrency/wikipedia/sync.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads images and metadata
|
||||
in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import argparse
|
||||
import datetime
|
||||
import urllib2
|
||||
import contextlib
|
||||
import time
|
||||
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
|
||||
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
|
||||
|
||||
LOCAL_IMG_PATH = 'pictures/'
|
||||
|
||||
verbose = True
|
||||
|
||||
|
||||
class ParsingException(ValueError):
    """Raised if unable to parse POTD MediaWiki source"""
    # fetch_potd_url() raises it when no thumbnail reference is found in
    # the downloaded page.
|
||||
|
||||
|
||||
def fetch_potd_url(iso_date):
    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)

    Raises ParsingException if the downloaded page has no thumbnail
    reference; HTTP errors from urllib2 propagate to the caller.
    """
    potd_url = POTD_BASE_URL + iso_date
    with contextlib.closing(urllib2.urlopen(potd_url)) as response:
        html = response.read()
    match = THUMB_SRC_RE.search(html)
    if match:
        return THUMB_BASE_URL+match.group(1)
    raise ParsingException('cannot find thumbnail source for ' + potd_url)
|
||||
|
||||
|
||||
def gen_month_days(year, month):
    """Yield a ``datetime.date`` for each day of the given month, in order."""
    current = datetime.date(year, month, 1)
    step = datetime.timedelta(days=1)
    while True:
        if current.month != month:
            # stepped into the following month: done
            break
        yield current
        current = current + step
|
||||
|
||||
|
||||
def get_img_names(iso_month):
    """Yield (iso_date, img_url) pairs for iso_month ('YYYY-MM' format)

    Days are processed in order; the generator stops at the first day whose
    page request fails with an HTTP error.
    """
    year, month = map(int, iso_month.split('-'))
    for day in gen_month_days(year, month):
        iso_date = day.strftime('%Y-%m-%d')
        if verbose:
            print(iso_date)
        try:
            img_url = fetch_potd_url(iso_date)
        except urllib2.HTTPError:
            return
        yield (iso_date, img_url)
|
||||
|
||||
|
||||
def fetch_image(iso_date, img_url):
    """Download img_url and save it under LOCAL_IMG_PATH.

    The file is named '<iso_date>__<last URL component>'.  Progress is
    printed when the module-level ``verbose`` flag is set.

    Returns the number of bytes written.
    """
    if verbose:
        print('\t' + img_url)
    with contextlib.closing(urllib2.urlopen(img_url)) as remote:
        data = remote.read()
    size = len(data)
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
    filename = iso_date + '__' + img_url.rsplit('/', 1)[-1]
    with io.open(os.path.join(LOCAL_IMG_PATH, filename), 'wb') as local:
        local.write(data)
    return size
|
||||
|
||||
|
||||
def get_images(iso_month, max_count=0):
    """Download up to max_count images for iso_month ('YYYY-MM' format)

    A ``max_count`` of 0 or None (the value argparse supplies when the
    option is omitted) means "no limit".

    Returns an ``(img_count, total_size)`` tuple with the number of images
    saved and their total size in bytes.
    """
    # Truthiness test instead of ``is 0``: identity comparison with an int
    # literal relies on an implementation detail and missed None.
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in get_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        if img_count == max_count:
            break

    return (img_count, total_size)
|
||||
|
||||
|
||||
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # NOTE: the docstring above is shown verbatim as the argparse
    # description.  The parsed --verbose flag is stored in the module-level
    # ``verbose`` variable read by the download functions.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        'year_month', help='year and month in YYYY-MM format')
    parser.add_argument(
        '-q', '--max_qty', type=int,
        help='maximum number of files to download')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display progress information')
    args = parser.parse_args()
    verbose = args.verbose

    started = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - started

    report = "images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds"
    print(report % (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
118
concurrency/wikipedia/sync_py3.py
Normal file
118
concurrency/wikipedia/sync_py3.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
|
||||
Baseline synchronous example for comparison: downloads images and metadata
|
||||
in the simple but slow synchronous way i.e. one after the other.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import argparse
|
||||
import datetime
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import contextlib
|
||||
import time
|
||||
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
|
||||
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
|
||||
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
|
||||
|
||||
LOCAL_IMG_PATH = 'pictures/'
|
||||
|
||||
verbose = True
|
||||
|
||||
|
||||
class ParsingException(ValueError):
    """Raised if unable to parse POTD MediaWiki source"""
    # fetch_potd_url() raises it when no thumbnail reference is found in
    # the downloaded page.
|
||||
|
||||
|
||||
def gen_month_dates(year, month):
    """Yield every date of the given year and month as 'YYYY-MM-DD' strings"""
    current = datetime.date(year, month, 1)
    step = datetime.timedelta(days=1)
    while True:
        if current.month != month:
            # stepped into the following month: done
            break
        yield current.strftime('%Y-%m-%d')
        current = current + step
|
||||
|
||||
|
||||
def fetch_potd_url(iso_date):
    """Return the POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)

    Returns None when the page request fails with an HTTP error.  Raises
    ParsingException if the downloaded page has no thumbnail reference.
    The date is printed first when the module-level ``verbose`` flag is set.
    """
    if verbose:
        print(iso_date)
    potd_url = POTD_BASE_URL + iso_date
    try:
        with urllib.request.urlopen(potd_url) as response:
            html = response.read().decode('utf-8')
    except urllib.error.HTTPError:
        return None
    match = THUMB_SRC_RE.search(html)
    if not match:
        raise ParsingException('cannot find thumbnail source for ' + potd_url)
    return THUMB_BASE_URL+match.group(1)
|
||||
|
||||
|
||||
def gen_img_names(iso_month):
    """Yield (iso_date, img_url) pairs for iso_month ('YYYY-MM' format)

    Days are processed in order; the generator stops at the first day for
    which fetch_potd_url() returns None.
    """
    year, month = map(int, iso_month.split('-'))
    for iso_date in gen_month_dates(year, month):
        img_url = fetch_potd_url(iso_date)
        if img_url is None:
            return
        yield iso_date, img_url
|
||||
|
||||
|
||||
def fetch_image(iso_date, img_url):
    """Download img_url and save it under LOCAL_IMG_PATH.

    The file is named '<iso_date>__<last URL component>'.  Progress is
    printed when the module-level ``verbose`` flag is set.

    Returns the number of bytes written.
    """
    if verbose:
        print('\t' + img_url)
    with contextlib.closing(urllib.request.urlopen(img_url)) as remote:
        data = remote.read()
    size = len(data)
    if verbose:
        print('\t\twriting %0.1f Kbytes' % (size/1024.0))
    filename = iso_date + '__' + img_url.rsplit('/', 1)[-1]
    with io.open(os.path.join(LOCAL_IMG_PATH, filename), 'wb') as local:
        local.write(data)
    return size
|
||||
|
||||
|
||||
def get_images(iso_month, max_count=0):
    """Download up to max_count images for a given month

    ``iso_month`` is in 'YYYY-MM' format.  A ``max_count`` of 0 or None
    (the value argparse supplies when the option is omitted) means
    "no limit".

    Returns an ``(img_count, total_size)`` tuple with the number of images
    saved and their total size in bytes.
    """
    # Truthiness test instead of ``is 0``: identity comparison with an int
    # literal relies on an implementation detail (and triggers a
    # SyntaxWarning on recent Pythons), and it missed None.
    if not max_count:
        max_count = sys.maxsize
    img_count = 0
    total_size = 0
    for iso_date, img_url in gen_img_names(iso_month):
        total_size += fetch_image(iso_date, img_url)
        img_count += 1
        if img_count == max_count:
            break

    return (img_count, total_size)
|
||||
|
||||
|
||||
def main():
    """Get "Pictures of The Day" from English Wikipedia for a given month"""
    # NOTE: the docstring above is shown verbatim as the argparse
    # description.  The parsed --verbose flag is stored in the module-level
    # ``verbose`` variable read by the download functions.
    global verbose
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        'year_month', help='year and month in YYYY-MM format')
    parser.add_argument(
        '-q', '--max_qty', type=int,
        help='maximum number of files to download')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display progress information')
    args = parser.parse_args()
    verbose = args.verbose

    started = time.time()
    img_count, total_size = get_images(args.year_month, args.max_qty)
    elapsed = time.time() - started

    report = "images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds"
    print(report % (img_count, total_size/1024.0, elapsed))
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
87
concurrency/wikipedia/test_daypicts.py
Normal file
87
concurrency/wikipedia/test_daypicts.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Wikipedia Picture of the Day (POTD) download example
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from daypicts import *
|
||||
|
||||
|
||||
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
|
||||
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
|
||||
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
|
||||
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
|
||||
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
|
||||
|
||||
|
||||
@pytest.mark.network
def test_get_picture_url_existing():
    """The picture URL for a date with a known POTD is returned."""
    expected = ('http://upload.wikimedia.org/wikipedia/commons/'
                'thumb/9/9d/MODIS_Map.jpg/550px-MODIS_Map.jpg')
    assert get_picture_url('2012-01-01') == expected
|
||||
|
||||
|
||||
@pytest.mark.network
def test_get_picture_url_not_existing():
    """NoPictureForDate is raised for a date expected to have no picture."""
    missing_date = '2013-09-12'
    with pytest.raises(NoPictureForDate):
        get_picture_url(missing_date)
|
||||
|
||||
|
||||
def test_get_picture_type_imghdr():
    """A minimal GIF is identified as 'gif'."""
    detected = get_picture_type(GIF_MIN)
    assert detected == 'gif'
|
||||
|
||||
|
||||
def test_get_picture_type_svg():
    """SVG data is recognised with and without an XML declaration."""
    bare = get_picture_type(SVG_MIN)
    assert bare == 'svg'
    with_declaration = get_picture_type(SVG_XML_DECL)
    assert with_declaration == 'svg'
|
||||
|
||||
|
||||
def test_get_picture_type_unknown():
    """Unrecognised bytes yield None."""
    detected = get_picture_type(NOISE)
    assert detected is None
|
||||
|
||||
|
||||
def test_validate_full_date():
    """A full date is returned zero-padded."""
    assert validate_date('2015-1-2') == '2015-01-02'
|
||||
|
||||
|
||||
def test_validate_date_too_early():
    """A date before the earliest template is rejected."""
    too_early = '2006-12-31'
    with pytest.raises(NoPictureTemplateBefore):
        validate_date(too_early)
|
||||
|
||||
|
||||
def test_validate_month():
    """A year-month is returned zero-padded, without a day."""
    assert validate_date('2015-1') == '2015-01'
|
||||
|
||||
|
||||
def test_validate_year():
    """A bare year is returned unchanged."""
    assert validate_date('2015') == '2015'
|
||||
|
||||
|
||||
def test_gen_month_dates():
    """February of a common year has 28 days."""
    month_days = list(gen_month_dates('2015-02'))
    assert len(month_days) == 28
    first, last = month_days[0], month_days[27]
    assert first == '2015-02-01'
    assert last == '2015-02-28'
|
||||
|
||||
|
||||
def test_gen_month_dates_leap():
    """February of a leap year has 29 days."""
    month_days = list(gen_month_dates('2012-02'))
    assert len(month_days) == 29
    assert month_days[28] == '2012-02-29'
|
||||
|
||||
|
||||
def test_gen_year_dates():
    """A common year yields 365 dates, from Jan 1 to Dec 31."""
    year_days = list(gen_year_dates('2015'))
    assert len(year_days) == 365
    first, last = year_days[0], year_days[364]
    assert first == '2015-01-01'
    assert last == '2015-12-31'
|
||||
|
||||
|
||||
def test_gen_year_dates_leap():
    """A leap year yields 366 dates, ending on Dec 31."""
    year_days = list(gen_year_dates('2012'))
    assert len(year_days) == 366
    assert year_days[365] == '2012-12-31'
|
||||
Reference in New Issue
Block a user