update from Atlas with major reorg

This commit is contained in:
Luciano Ramalho
2015-04-17 21:29:30 -03:00
parent 57902d31b5
commit a786180239
134 changed files with 369 additions and 520 deletions

View File

@@ -0,0 +1,138 @@
====================================
Configuring a local test environment
====================================
tl;dr;
======
This text explains how to configure **nginx** and **vaurien** to build
a local mirror of the data to run the Wikipedia Picture of the Day
examples while avoiding network traffic and introducing controlled
delays and errors for testing, thanks to the **vaurien** proxy.
Rationale and overview
======================
The Wikipedia Picture of the Day examples are designed to demonstrate
the performance of different approaches to finding and downloading
images from the Wikipedia. However, we don't want to hit the Wikipedia
with multiple requests per second while testing, and we want to be
able to simulate high latency and random network errors.
For this setup I chose **nginx** as the HTTP server because it is very
fast and easy to configure, and the **vaurien** proxy because it was
designed by Mozilla to introduce delays and network errors for testing.
The initial fixture data, ``docroot.zip``, contains a directory
``docroot/Template-POTD/`` with 1096 small text files, each consisting
of an HTML fragment (just a ``src="..."`` attribute) or an error message
(for days when no picture was published, like 2013-09-12). These files
correspond to every day of the years 2012, 2013 and 2014. The year 2012
was a leap year, that's why there are 1096 files and not 1095.
Once these files are unpacked to the ``docroot/Template-POTD`` directory
and **nginx** is configured, the ``build_fixture.py`` script can fetch the
actual images from the Wikipedia for local storage in the directory
``docroot/wikimedia/``.
When that is done you can configure **nginx** and **vaurien** to experiment
with the ``daypicts*.py``examples without hitting the network.
Instructions
============
1. Unpack test data
-------------------
Unpack the initial data in the ``fixture/`` directory and verify that 1096
files were created in ``fixture/docroot/Template-POTD/``::
$ ls # inside the fixture/ directory
README.rst docroot.zip
$ unzip docroot.zip
... many lines omitted...
inflating: docroot/Template-POTD/2014-12-29
inflating: docroot/Template-POTD/2014-12-30
inflating: docroot/Template-POTD/2014-12-31
$ ls docroot/Template-POTD/ | wc -w
1096
2. Install **nginx**
--------------------
Download and install **nginx**. I used version 1.6.2 -- the latest
stable version as I write this.
- Download page: http://nginx.org/en/download.html
- Beginner's guide: http://nginx.org/en/docs/beginners_guide.html
3. Configure **nginx**
----------------------
Edit the the ``nginx.conf`` file to set the port and document root.
The file is usually found in ``/usr/local/nginx/conf``, ``/etc/nginx``,
or ``/usr/local/etc/nginx``.
Most of the content in ``nginx.conf`` is within a block labeled ``http``
and enclosed in curly braces. Within that block there can be multiple
blocks labeled ``server``. Add another ``server`` block like this one::
server {
listen 8001;
location / {
root /full-path-to.../fixture/docroot;
}
}
After editing ``nginx.conf`` the server must be started (if it's not
running) or told to reload the configuration file::
$ nginx # to start, if necessary
$ nginx -s reload # to reload the configuration
To test the configuration, open the URL below in a browser. Doing so
will download a small file named ``2014-01-01`` with an HTML fragment::
http://localhost:8001/Template-POTD/2014-01-01
If the test fails, please double check the procedure just described and
refer to the **nginx** documentation.
Platform-specific instructions
==============================
Nginx setup on Mac OS X
-----------------------
Homebrew (copy & paste code at the bottom of http://brew.sh/)::
$ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
$ brew doctor
$ brew install nginx
Download and unpack::
Docroot is: /usr/local/var/www
/usr/local/etc/nginx/nginx.conf
To have launchd start nginx at login:
ln -sfv /usr/local/opt/nginx/*.plist ~/Library/LaunchAgents
Then to load nginx now:
launchctl load ~/Library/LaunchAgents/homebrew.mxcl.nginx.plist
Or, if you don't want/need launchctl, you can just run:
nginx
Nginx setup on Lubuntu 14.04.1 LTS
----------------------------------
Docroot is: /usr/share/nginx/html

View File

@@ -0,0 +1,97 @@
import sys
import argparse
import os
import urllib
import requests
from daypicts import get_picture_url, get_picture_urls
from daypicts import validate_date, gen_dates, picture_type
from daypicts import NoPictureForDate
from daypicts import REMOTE_PICT_BASE_URL, PICT_EXCEPTIONS
FIXTURE_DOC_DIR = 'fixture/docroot/'
FIXTURE_TEMPLATE_POTD_DIR = FIXTURE_DOC_DIR + 'Template-POTD/'
def parse_args(argv):
parser = argparse.ArgumentParser(description=main.__doc__)
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
parser.add_argument('date', help=date_help)
parser.add_argument('-u', '--url_only', action='store_true',
help='get picture URLS only')
args = parser.parse_args(argv)
try:
iso_parts = validate_date(args.date)
except ValueError as exc:
print('error:', exc.args[0])
parser.print_usage()
sys.exit(2)
dates = list(gen_dates(iso_parts))
if len(dates) == 1:
print('-> Date: ', dates[0])
else:
fmt = '-> {} days: {}...{}'
print(fmt.format(len(dates), dates[0], dates[-1]))
return dates, args
def save_picture_urls(dates, save_path):
for date in dates:
try:
url = get_picture_url(date)
except NoPictureForDate as exc:
snippet = repr(exc)
else:
snippet = url.replace('http://', 'src="//') + '"'
print(date, end=' ')
print(snippet)
with open(os.path.join(save_path, date), 'w') as fp:
fp.write(snippet)
def save_pictures(dates, save_path, verbose=False):
urls_ok = []
for date, url in get_picture_urls(dates, verbose):
response = requests.get(url)
file_path = os.path.join(save_path,
url.replace(REMOTE_PICT_BASE_URL, ''))
file_path = urllib.parse.unquote(file_path)
octets = response.content
# http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
if date not in PICT_EXCEPTIONS:
assert picture_type(octets) is not None, url
try:
os.makedirs(os.path.dirname(file_path))
except FileExistsError:
pass
with open(file_path, 'wb') as fp:
fp.write(octets)
print(file_path)
return urls_ok
def main(argv):
"""Build test fixture from Wikipedia "POTD" data"""
try:
os.makedirs(FIXTURE_TEMPLATE_POTD_DIR)
except FileExistsError:
pass
dates, args = parse_args(argv)
if args.url_only:
save_picture_urls(dates, FIXTURE_TEMPLATE_POTD_DIR)
else:
save_pictures(dates, FIXTURE_DOC_DIR)
if __name__ == '__main__':
main(sys.argv[1:])

View File

@@ -0,0 +1,227 @@
"""
Wikipedia Picture of the Day (POTD) download example
Note:
The earliest Pictures of the Day I've found are in this page:
http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/May_2004
However, I have not found Template:POTD/YYYY-MM-DD pages earlier
than this:
http://en.wikipedia.org/wiki/Template:POTD/2007-01-01
For simplicity, this script only retrieves pictures starting
from 2007-01-01.
"""
import sys
import argparse
import re
import time
import datetime
import os
import imghdr
import warnings
import requests
SAVE_DIR = 'downloaded/'
HTTP_PORT = 8002
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
#POTD_BASE_URL = 'http://127.0.0.1:{}/Template-POTD/'.format(HTTP_PORT)
REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
#LOCAL_PICT_BASE_URL = 'http://127.0.0.1:{}/'.format(HTTP_PORT)
LOCAL_PICT_BASE_URL = REMOTE_PICT_BASE_URL
PICT_BASE_URL = REMOTE_PICT_BASE_URL
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
PODT_EARLIEST_TEMPLATE = '2007-01-01'
RE_YEAR = r'([12]\d{3})'
RE_MONTH = RE_YEAR + r'-([01]\d)'
RE_DATE = RE_MONTH + r'-([0-3]\d)'
ISO_DATE_FMT = '%Y-%m-%d'
PICT_EXCEPTIONS = {
'2013-06-15', # .webm movie [1]
}
#[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
class NoPictureForDate(Exception):
'''No Picture of the Day found for {iso_date}'''
class NoPictureTemplateBefore(ValueError):
'''Template:POTD did not exist before PODT_EARLIEST_TEMPLATE'''
def get_picture_url(iso_date):
page_url = POTD_BASE_URL + iso_date
print(page_url)
response = requests.get(page_url)
pict_url = POTD_IMAGE_RE.search(response.text)
if pict_url is None:
raise NoPictureForDate(iso_date)
return 'http:' + pict_url.group(1)
def validate_date(text):
try:
parts = [int(part) for part in text.split('-')]
except ValueError:
raise ValueError('date must use YYYY, YYYY-MM or YYYY-MM-DD format')
test_parts = parts[:]
while len(test_parts) < 3:
test_parts.append(1)
date = datetime.date(*(int(part) for part in test_parts))
iso_date = date.strftime(ISO_DATE_FMT)
iso_date = iso_date[:1+len(parts)*3]
if iso_date < PODT_EARLIEST_TEMPLATE:
raise NoPictureTemplateBefore(PODT_EARLIEST_TEMPLATE)
return iso_date
def gen_month_dates(iso_month):
first = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT)
one_day = datetime.timedelta(days=1)
date = first.date()
while date.month == first.month:
yield date.strftime(ISO_DATE_FMT)
date += one_day
def gen_year_dates(iso_year):
for i in range(1, 13):
yield from gen_month_dates(iso_year + '-{:02d}'.format(i))
def gen_dates(iso_parts):
if len(iso_parts) == 4:
yield from gen_year_dates(iso_parts)
elif len(iso_parts) == 7:
yield from gen_month_dates(iso_parts)
else:
yield iso_parts
def get_picture_urls(dates, verbose=False):
date_urls = []
count = 0
for date in dates:
try:
url = get_picture_url(date)
except NoPictureForDate as exc:
if verbose:
print('*** {!r} ***'.format(exc))
continue
count += 1
if verbose:
print(format(count, '3d'), end=' ')
print(url.split('/')[-1])
else:
print(url)
date_urls.append((date, url))
return date_urls
def picture_type(octets):
pict_type = imghdr.what(None, octets)
if pict_type is None:
if (octets.startswith(b'<') and
b'<svg' in octets[:200] and
octets.rstrip().endswith(b'</svg>')):
pict_type = 'svg'
return pict_type
def get_pictures(dates, verbose=False):
urls_ok = []
try:
os.makedirs(SAVE_DIR)
except FileExistsError:
pass
for date, url in get_picture_urls(dates, verbose):
if PICT_BASE_URL == LOCAL_PICT_BASE_URL:
url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
response = requests.get(url)
if response.status_code != 200:
warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
continue
octets = response.content
if date not in PICT_EXCEPTIONS:
assert picture_type(octets) is not None, url
file_path = url.replace(PICT_BASE_URL, '')
file_name = os.path.basename(file_path)
path = os.path.join(SAVE_DIR, date.split('-')[0])
file_path = os.path.join(path, file_name)
#import pdb; pdb.set_trace()
try:
os.makedirs(path)
except FileExistsError:
pass
with open(file_path, 'wb') as fp:
fp.write(octets)
urls_ok.append(url)
print(file_path)
return urls_ok
def parse_args(argv):
parser = argparse.ArgumentParser(description=main.__doc__)
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
parser.add_argument('date', help=date_help)
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of items to fetch')
parser.add_argument('-u', '--url_only', action='store_true',
help='get picture URLS only')
parser.add_argument('-f', '--fixture_save', action='store_true',
help='save data for local test fixture')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args(argv)
try:
iso_parts = validate_date(args.date)
except ValueError as exc:
print('error:', exc.args[0])
parser.print_usage()
sys.exit(2)
dates = list(gen_dates(iso_parts))
if args.verbose:
if len(dates) == 1:
print('-> Date: ', dates[0])
else:
fmt = '-> {} days: {}...{}'
print(fmt.format(len(dates), dates[0], dates[-1]))
return dates, args
def main(argv, get_picture_urls):
"""Get Wikipedia "Picture of The Day" for date, month or year"""
dates, args = parse_args(argv)
t0 = time.time()
if args.url_only:
urls = get_picture_urls(dates, args.verbose)
else:
urls = get_pictures(dates, args.verbose)
elapsed = time.time() - t0
if args.verbose:
print('-> found: {} pictures | elapsed time: {:.2f}s'
.format(len(urls), elapsed))
if __name__ == '__main__':
main(sys.argv[1:], get_picture_urls)

View File

@@ -0,0 +1,62 @@
"""
Wikipedia Picture of the Day (POTD) download example
"""
import sys
import asyncio
import aiohttp
from daypicts import main, NoPictureForDate
from daypicts import POTD_BASE_URL, POTD_IMAGE_RE
GLOBAL_TIMEOUT = 300 # seconds
MAX_CONCURRENT_REQUESTS = 30
@asyncio.coroutine
def get_picture_url(iso_date, semaphore):
page_url = POTD_BASE_URL+iso_date
with (yield from semaphore):
response = yield from aiohttp.request('GET', page_url)
text = yield from response.text()
pict_url = POTD_IMAGE_RE.search(text)
if pict_url is None:
raise NoPictureForDate(iso_date)
return 'http:' + pict_url.group(1)
@asyncio.coroutine
def get_picture_urls(dates, verbose=False):
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
tasks = [get_picture_url(date, semaphore) for date in dates]
urls = []
count = 0
# get results as jobs are done
for job in asyncio.as_completed(tasks, timeout=GLOBAL_TIMEOUT):
try:
url = yield from job
except NoPictureForDate as exc:
if verbose:
print('*** {!r} ***'.format(exc))
continue
except aiohttp.ClientResponseError as exc:
print('****** {!r} ******'.format(exc))
continue
count += 1
if verbose:
print(format(count, '3d'), end=' ')
print(url.split('/')[-1])
else:
print(url)
urls.append(url)
return urls
def run_loop(dates, verbose=False):
loop = asyncio.get_event_loop()
return loop.run_until_complete(get_picture_urls(dates, verbose))
if __name__ == '__main__':
main(sys.argv[1:], run_loop)

View File

@@ -0,0 +1,44 @@
"""
Wikipedia Picture of the Day (POTD) download example
"""
import sys
from concurrent import futures
from daypicts import main, get_picture_url, NoPictureForDate
GLOBAL_TIMEOUT = 300 # seconds
MAX_CONCURRENT_REQUESTS = 30
def get_picture_urls(dates, verbose=False):
pool = futures.ThreadPoolExecutor(MAX_CONCURRENT_REQUESTS)
pending = {}
for date in dates:
job = pool.submit(get_picture_url, date)
pending[job] = date
urls = []
count = 0
# get results as jobs are done
for job in futures.as_completed(pending, timeout=GLOBAL_TIMEOUT):
try:
url = job.result()
except NoPictureForDate as exc:
if verbose:
print('*** {!r} ***'.format(exc))
continue
count += 1
if verbose:
print(format(count, '3d'), end=' ')
print(url.split('/')[-1])
else:
print(url)
urls.append(url)
return urls
if __name__ == '__main__':
main(sys.argv[1:], get_picture_urls)

View File

@@ -0,0 +1,3 @@
#!/bin/bash
vaurien --protocol http --proxy localhost:8002 --backend localhost:8001 \
--behavior 100:delay --behavior-delay-sleep .1

View File

@@ -0,0 +1,4 @@
#!/bin/bash
# run tests skipping @pytest.mark.network
py.test test_daypicts.py -m 'not network' $1 $2 $3

View File

@@ -0,0 +1,105 @@
====================================
Configuring a local test environment
====================================
tl;dr;
======
This text explains how to configure **nginx** and **vaurien** to build
a local mirror of the data to run the Wikipedia Picture of the Day
examples while avoiding network traffic and introducing controlled
delays and errors for testing, thanks to the **vaurien** proxy.
Rationale and overview
======================
The Wikipedia Picture of the Day examples are designed to demonstrate
the performance of different approaches to finding and downloading
images from the Wikipedia. However, we don't want to hit the Wikipedia
with multiple requests per second while testing, and we want to be
able to simulate high latency and random network errors.
For this setup I chose **nginx** as the HTTP server because it is very
fast and easy to configure, and the **vaurien** proxy because it was
designed by Mozilla to introduce delays and network errors for testing.
The initial fixture data, ``docroot.zip``, contains a directory
``docroot/Template-POTD/`` with 1096 small text files, each consisting
of an HTML fragment (just a ``src="..."`` attribute) or an error message
(for days when no picture was published, like 2013-09-12). These files
correspond to every day of the years 2012, 2013 and 2014. The year 2012
was a leap year, that's why there are 1096 files and not 1095.
Once these files are unpacked to the ``docroot/Template-POTD`` directory
and **nginx** is configured, the ``build_fixture.py`` script can fetch the
actual images from the Wikipedia for local storage in the directory
``docroot/wikimedia/``.
When that is done you can configure **nginx** and **vaurien** to experiment
with the ``daypicts*.py``examples without hitting the network.
Instructions
============
1. Unpack test data
-------------------
Unpack the initial data in the ``fixture/`` directory and verify that 1096
files were created in ``fixture/docroot/Template-POTD/``::
$ ls # inside the fixture/ directory
README.rst docroot.zip
$ unzip docroot.zip
... many lines omitted...
inflating: docroot/Template-POTD/2014-12-29
inflating: docroot/Template-POTD/2014-12-30
inflating: docroot/Template-POTD/2014-12-31
$ ls docroot/Template-POTD/ | wc -w
1096
2. Install **nginx**
--------------------
Download and install **nginx**. I used version 1.6.2 -- the latest
stable version as I write this.
- Download page: http://nginx.org/en/download.html
- Beginner's guide: http://nginx.org/en/docs/beginners_guide.html
3. Configure **nginx**
----------------------
Edit the the ``nginx.conf`` file to set the port and document root.
The file is usually found in ``/usr/local/nginx/conf``, ``/etc/nginx``,
or ``/usr/local/etc/nginx``.
Most of the content in ``nginx.conf`` is within a block labeled ``http``
and enclosed in curly braces. Within that block there can be multiple
blocks labeled ``server``. Add another ``server`` block like this one::
server {
listen 8001;
location / {
root /full-path-to.../fixture/docroot;
}
}
After editing ``nginx.conf`` the server must be started (if it's not
running) or told to reload the configuration file::
$ nginx # to start, if necessary
$ nginx -s reload # to reload the configuration
To test the configuration, open the URL below in a browser. Doing so
will download a small file named ``2014-01-01`` with an HTML fragment::
http://localhost:8001/Template-POTD/2014-01-01
If the test fails, please double check the procedure just described and
refer to the **nginx** documentation.

Binary file not shown.

View File

@@ -0,0 +1,39 @@
=====================================
Wikipedia Picture of the Day examples
=====================================
These examples use various asynchronous programming techniques to download
images and metadata from the English Wikipedia `Picture of the Day`_ archive.
.. _Picture of the Day: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/Archive
--------
Timings
--------
``sync.py``
===========
::
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m6.272s
user 0m0.065s
sys 0m0.039s
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m5.447s
user 0m0.068s
sys 0m0.040s
$ time python sync.py 2014-06 -q 5
5 images downloaded (167.8 Kbytes total)
real 0m6.314s
user 0m0.068s
sys 0m0.040s

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = potd.list_days_of_month(year, month)
with futures.ProcessPoolExecutor(max_workers=100) as executor:
downloads = dict((executor.submit(potd.save_one, date, verbose), date)
for date in dates)
for future in futures.as_completed(downloads):
date = downloads[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (date,
future.exception()))
else:
img_size = future.result()
total_size += img_size
img_count += 1
print('%r OK: %r' % (date, img_size))
return img_count, total_size
if __name__ == '__main__':
potd.main(save_month=save_month)

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = potd.list_days_of_month(year, month)
with futures.ThreadPoolExecutor(max_workers=100) as executor:
downloads = dict((executor.submit(potd.save_one, date, verbose), date)
for date in dates)
for future in futures.as_completed(downloads):
date = downloads[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (date,
future.exception()))
else:
img_size = future.result()
total_size += img_size
img_count += 1
print('%r OK: %r' % (date, img_size))
return img_count, total_size
if __name__ == '__main__':
potd.main(save_month=save_month)

View File

@@ -0,0 +1,100 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads metadata and
images in the simple but slow synchronous way i.e. one after the other.
"""
import calendar
import datetime
import re
import os
import io
import time
import requests
import argparse
SAVE_DIR = 'pictures/'
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
class NoPictureForDate(Exception):
'''No Picture of the Day found for {day}'''
def build_page_url(iso_date):
return POTD_BASE_URL + iso_date
def fetch(url):
response = requests.get(url)
return response
def extract_image_url(html):
re_image = r'src="(//upload\..*?)"'
image_url = re.search(re_image, html)
return 'http:' + image_url.group(1)
def format_date(year, month, day):
return '{year}-{month:02d}-{day:02d}'.format(**locals())
def list_days_of_month(year, month):
lastday = calendar.monthrange(year, month)[1]
days = [format_date(year, month, day) for day in range(1, lastday + 1)]
return days
def build_save_path(iso_date, url):
head, filename = os.path.split(url)
return os.path.join(SAVE_DIR, iso_date+'_'+filename)
def save_one(iso_date, verbose):
page_url = build_page_url(iso_date)
response = fetch(page_url)
if response.status_code != 200:
msg = NoPictureForDate.__doc__.format(day=iso_date)
raise NoPictureForDate(msg)
img_url = extract_image_url(response.text)
response = fetch(img_url)
path = build_save_path(iso_date, img_url)
if verbose:
print('saving: '+path)
with io.open(path, 'wb') as fp:
fp.write(response.content)
return len(response.content)
def save_month(year_month, verbose):
year, month = [int(s) for s in year_month.split('-')]
total_size = 0
img_count = 0
dates = list_days_of_month(year, month)
for date in dates:
try:
total_size += save_one(date, verbose)
img_count += 1
except NoPictureForDate:
continue
return img_count, total_size
def main(save_one=save_one, save_month=save_month):
"""Get "Picture of The Day" from English Wikipedia for a given date or month"""
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('date', help='year, month and (optional) day in YYYY-MM-DD format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
t0 = time.time()
if len(args.date) == len('YYYY-MM-DD'):
img_count = 1
total_size = save_one(args.date, args.verbose)
else:
img_count, total_size = save_month(args.date, args.verbose)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,96 @@
import unittest
import potd
class TestSequenceFunctions(unittest.TestCase):
def setUp(self):
self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
"""commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
"""-Orthographic_projection_SW.jpg""")
def test_buid_page_url(self):
date = '2014-05-01'
result = potd.build_page_url(date)
self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')
def test_fetch_status_code(self):
date = '2014-05-02'
url = potd.build_page_url(date)
response = potd.fetch(url)
self.assertEqual(response.status_code, 200)
def test_fetch_status_code_not_found(self):
date = '2100-01-01'
url = potd.build_page_url(date)
response = potd.fetch(url)
self.assertEqual(response.status_code, 404)
def test_extract_image_url(self):
image_url = potd.extract_image_url(HTML)
self.assertEqual(image_url, self.thumb_url)
def test_fetch_image_jpeg(self):
response = potd.fetch(self.thumb_url)
self.assertEqual(response.headers['content-type'], 'image/jpeg')
def test_list_days_of_month(self):
year = 2014
month = 5
days = potd.list_days_of_month(year, month)
self.assertEqual(len(days), 31)
self.assertEqual('2014-05-01', days[0])
self.assertEqual('2014-05-31', days[-1])
def test_list_days_of_february(self):
year = 2014
month = 2
days = potd.list_days_of_month(year, month)
self.assertEqual(len(days), 28)
self.assertEqual('2014-02-01', days[0])
self.assertEqual('2014-02-28', days[-1])
def test_format_date(self):
year = 2014
month = 2
day = 1
a_date = '2014-02-01'
date = potd.format_date(year, month, day)
self.assertEqual(a_date, date)
self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')
def test_build_save_path(self):
date = '2014-06-04'
path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
HTML = (
'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
title="Orthographic projection"><img alt="Orthographic projection"
src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
height="2058"></a></td>
''')
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,115 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
from __future__ import print_function
import sys
import os
import io
import re
import argparse
import datetime
import urllib2
import contextlib
import time
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
LOCAL_IMG_PATH = 'pictures/'
verbose = True
class ParsingException(ValueError):
"""Raised if unable to parse POTD MediaWiki source"""
def fetch_potd_url(iso_date):
"""Fetch picture name from iso_date ('YYYY-MM-DD' format)"""
potd_url = POTD_BASE_URL + iso_date
with contextlib.closing(urllib2.urlopen(potd_url)) as fp:
html = fp.read()
thumb_src = THUMB_SRC_RE.search(html)
if not thumb_src:
msg = 'cannot find thumbnail source for ' + potd_url
raise ParsingException(msg)
thumb_url = THUMB_BASE_URL+thumb_src.group(1)
return thumb_url
def gen_month_days(year, month):
a_date = datetime.date(year, month, 1)
one_day = datetime.timedelta(1)
while a_date.month == month:
yield a_date
a_date += one_day
def get_img_names(iso_month):
"""Fetch picture names from iso_month ('YYYY-MM' format)"""
year, month = (int(part) for part in iso_month.split('-'))
for day in gen_month_days(year, month):
iso_date = '{:%Y-%m-%d}'.format(day)
if verbose:
print(iso_date)
try:
img_url = fetch_potd_url(iso_date)
except urllib2.HTTPError:
break
yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
if verbose:
print('\t' + img_url)
with contextlib.closing(urllib2.urlopen(img_url)) as fp:
img = fp.read()
img_filename = iso_date + '__' + img_url.split('/')[-1]
if verbose:
print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
with io.open(img_path, 'wb') as fp:
fp.write(img)
return len(img)
def get_images(iso_month, max_count=0):
if max_count is 0:
max_count = sys.maxsize
img_count = 0
total_size = 0
for iso_date, img_url in get_img_names(iso_month):
total_size += fetch_image(iso_date, img_url)
img_count += 1
if img_count == max_count:
break
return (img_count, total_size)
def main():
"""Get "Pictures of The Day" from English Wikipedia for a given month"""
global verbose
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('year_month', help='year and month in YYYY-MM format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
verbose = args.verbose
t0 = time.time()
img_count, total_size = get_images(args.year_month, args.max_qty)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,118 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads images and metadata
in the simple but slow synchronous way i.e. one after the other.
"""
import sys
import os
import io
import re
import argparse
import datetime
import urllib.request
import urllib.error
import contextlib
import time
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
THUMB_BASE_URL = 'http://upload.wikimedia.org/wikipedia/commons/thumb/'
THUMB_SRC_RE = re.compile(r'src=".*?/thumb/(.*?/\d+px-[^"]+)')
LOCAL_IMG_PATH = 'pictures/'
verbose = True
class ParsingException(ValueError):
"""Raised if unable to parse POTD MediaWiki source"""
def gen_month_dates(year, month):
"""Produce all dates in a given year, month"""
a_date = datetime.date(year, month, 1)
one_day = datetime.timedelta(1)
while a_date.month == month:
yield '{:%Y-%m-%d}'.format(a_date)
a_date += one_day
def fetch_potd_url(iso_date):
"""Fetch POTD thumbnail URL for iso_date ('YYYY-MM-DD' format)"""
if verbose:
print(iso_date)
potd_url = POTD_BASE_URL + iso_date
try:
with urllib.request.urlopen(potd_url) as fp:
html = fp.read().decode('utf-8')
thumb_src = THUMB_SRC_RE.search(html)
if not thumb_src:
msg = 'cannot find thumbnail source for ' + potd_url
raise ParsingException(msg)
thumb_url = THUMB_BASE_URL+thumb_src.group(1)
except urllib.error.HTTPError:
return None
return thumb_url
def gen_img_names(iso_month):
"""Produce picture names by fetching POTD metadata"""
year, month = (int(part) for part in iso_month.split('-'))
for iso_date in gen_month_dates(year, month):
img_url = fetch_potd_url(iso_date)
if img_url is None:
break
yield (iso_date, img_url)
def fetch_image(iso_date, img_url):
"""Fetch and save image data for date and url"""
if verbose:
print('\t' + img_url)
with contextlib.closing(urllib.request.urlopen(img_url)) as fp:
img = fp.read()
img_filename = iso_date + '__' + img_url.split('/')[-1]
if verbose:
print('\t\twriting %0.1f Kbytes' % (len(img)/1024.0))
img_path = os.path.join(LOCAL_IMG_PATH, img_filename)
with io.open(img_path, 'wb') as fp:
fp.write(img)
return len(img)
def get_images(iso_month, max_count=0):
"""Download up to max_count images for a given month"""
if max_count is 0:
max_count = sys.maxsize
img_count = 0
total_size = 0
for iso_date, img_url in gen_img_names(iso_month):
total_size += fetch_image(iso_date, img_url)
img_count += 1
if img_count == max_count:
break
return (img_count, total_size)
def main():
"""Get "Pictures of The Day" from English Wikipedia for a given month"""
global verbose
parser = argparse.ArgumentParser(description=main.__doc__)
parser.add_argument('year_month', help='year and month in YYYY-MM format')
parser.add_argument('-q', '--max_qty', type=int,
help='maximum number of files to download')
parser.add_argument('-v', '--verbose', action='store_true',
help='display progress information')
args = parser.parse_args()
verbose = args.verbose
t0 = time.time()
img_count, total_size = get_images(args.year_month, args.max_qty)
elapsed = time.time() - t0
print("images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds" %
(img_count, total_size/1024.0, elapsed))
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,87 @@
"""
Wikipedia Picture of the Day (POTD) download example
"""
import pytest
from daypicts import *
@pytest.mark.network
def test_get_picture_url_existing():
url = get_picture_url('2012-01-01')
expected = ('http://upload.wikimedia.org/wikipedia/commons/'
'thumb/9/9d/MODIS_Map.jpg/550px-MODIS_Map.jpg')
assert url == expected
@pytest.mark.network
def test_get_picture_url_not_existing():
with pytest.raises(NoPictureForDate):
get_picture_url('2013-09-12')
def test_validate_full_date():
parts = validate_date('2015-1-2')
assert parts == '2015-01-02'
def test_validate_date_too_early():
with pytest.raises(NoPictureTemplateBefore):
validate_date('2006-12-31')
def test_validate_month():
parts = validate_date('2015-1')
assert parts == '2015-01'
def test_validate_year():
parts = validate_date('2015')
assert parts == '2015'
def test_gen_month_dates():
dates = list(gen_month_dates('2015-02'))
assert len(dates) == 28
assert dates[0] == '2015-02-01'
assert dates[27] == '2015-02-28'
def test_gen_month_dates_leap():
dates = list(gen_month_dates('2012-02'))
assert len(dates) == 29
assert dates[28] == '2012-02-29'
def test_gen_year_dates():
dates = list(gen_year_dates('2015'))
assert len(dates) == 365
assert dates[0] == '2015-01-01'
assert dates[364] == '2015-12-31'
def test_gen_year_dates_leap():
dates = list(gen_year_dates('2012'))
assert len(dates) == 366
assert dates[365] == '2012-12-31'
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
def test_picture_type_imghdr():
assert picture_type(GIF_MIN) == 'gif'
def test_picture_type_svg():
assert picture_type(SVG_MIN) == 'svg'
assert picture_type(SVG_XML_DECL) == 'svg'
def test_picture_type_unknown():
assert picture_type(NOISE) is None