wikipedia pictures download example

This commit is contained in:
Luciano Ramalho
2015-02-02 02:56:14 -02:00
parent 73d98de6cd
commit ab6ce5b6a4
37 changed files with 2042 additions and 37 deletions

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
    """Download every Picture of the Day of one month using worker processes.

    ``year_month`` is a ``'YYYY-MM'`` string; ``verbose`` is handed through
    unchanged to ``potd.save_one``.

    Returns a ``(img_count, total_size)`` tuple: the number of downloads
    that finished without an exception and the sum of the values their
    ``potd.save_one`` calls returned.  A download that raised is reported
    on stdout and left out of both totals.
    """
    year, month = [int(s) for s in year_month.split('-')]
    total_size = 0
    img_count = 0
    dates = potd.list_days_of_month(year, month)
    # One process per date is the most that can ever be busy.  A fixed
    # max_workers=100 asks for far more processes than there are tasks, and
    # ProcessPoolExecutor raises ValueError for max_workers > 61 on Windows.
    workers = max(1, min(len(dates), 100))
    with futures.ProcessPoolExecutor(max_workers=workers) as executor:
        # Map each future back to the date it is downloading.
        downloads = {executor.submit(potd.save_one, date, verbose): date
                     for date in dates}
        for future in futures.as_completed(downloads):
            date = downloads[future]
            error = future.exception()
            if error is not None:
                print('%r generated an exception: %s' % (date, error))
            else:
                img_size = future.result()
                total_size += img_size
                img_count += 1
                print('%r OK: %r' % (date, img_size))
    return img_count, total_size
# When run as a script, hand control to potd.main, replacing its default
# save_month with the process-pool implementation defined in this module.
if __name__ == '__main__':
    potd.main(save_month=save_month)

View File

@@ -0,0 +1,36 @@
"""
Wikipedia Picture of the Day (POTD) download example
Inspired by example at:
https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
"""
from concurrent import futures
import potd
def save_month(year_month, verbose):
    """Fetch a whole month of Pictures of the Day with a thread pool.

    ``year_month`` is ``'YYYY-MM'`` text and ``verbose`` is forwarded to
    ``potd.save_one``.  Every date of the month is submitted to the pool;
    as each download finishes, its outcome is printed.

    Returns ``(img_count, total_size)`` covering only the downloads that
    completed without raising.
    """
    year, month = map(int, year_month.split('-'))
    iso_dates = potd.list_days_of_month(year, month)
    img_count = 0
    total_size = 0
    with futures.ThreadPoolExecutor(max_workers=100) as pool:
        # future -> date, so a finished future can be reported by date
        pending = {}
        for iso_date in iso_dates:
            pending[pool.submit(potd.save_one, iso_date, verbose)] = iso_date
        for finished in futures.as_completed(pending):
            iso_date = pending[finished]
            failure = finished.exception()
            if failure is None:
                size = finished.result()
                total_size += size
                img_count += 1
                print('%r OK: %r' % (iso_date, size))
            else:
                print('%r generated an exception: %s' % (iso_date, failure))
    return img_count, total_size
# When run as a script, hand control to potd.main, replacing its default
# save_month with the thread-pool implementation defined in this module.
if __name__ == '__main__':
    potd.main(save_month=save_month)

View File

@@ -0,0 +1,100 @@
"""
Wikipedia Picture of the Day (POTD) download example
Baseline synchronous example for comparison: downloads metadata and
images in the simple but slow synchronous way i.e. one after the other.
"""
import calendar
import datetime
import re
import os
import io
import time
import requests
import argparse
# Directory (a relative path) that build_save_path() prefixes to every
# saved image file name.
SAVE_DIR = 'pictures/'
# Prefix of the per-day page URL; build_page_url() appends the ISO date to it.
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
class NoPictureForDate(Exception):
    '''No Picture of the Day found for {day}'''
    # The docstring above doubles as the error-message template: save_one()
    # builds its message with NoPictureForDate.__doc__.format(day=...), so
    # the text (including the ``{day}`` placeholder) is runtime behavior.
def build_page_url(iso_date):
    """Return the URL of the POTD page for ``iso_date``.

    The date text is appended verbatim to ``POTD_BASE_URL``.
    """
    page_url = POTD_BASE_URL + iso_date
    return page_url
def fetch(url, timeout=60):
    """Perform an HTTP GET on ``url`` and return the ``requests`` response.

    ``timeout`` is passed straight to ``requests.get`` (seconds to wait for
    the server).  It is set explicitly because, when no timeout is given,
    requests waits indefinitely on a server that stops answering, which
    would block the caller forever.

    The status code is not inspected here; callers check the response.
    """
    return requests.get(url, timeout=timeout)
def extract_image_url(html):
    """Return the absolute URL of the first ``//upload.`` image in ``html``.

    The first ``src="//upload.…"`` attribute found is taken and prefixed
    with ``http:`` to turn the protocol-relative URL into an absolute one.

    Raises ValueError when ``html`` holds no such attribute, instead of
    failing with an AttributeError on the missing match object.
    """
    re_image = r'src="(//upload\..*?)"'
    match = re.search(re_image, html)
    if match is None:
        raise ValueError('no src="//upload..." image URL found in HTML')
    return 'http:' + match.group(1)
def format_date(year, month, day):
    """Return ``year-month-day`` as text.

    Month and day are zero-padded to two digits; the year is written as is.
    """
    template = '{year}-{month:02d}-{day:02d}'
    return template.format(year=year, month=month, day=day)
def list_days_of_month(year, month):
    """Return the formatted dates of every day of ``month`` in ``year``.

    The list runs from day 1 to the last day of the month, in order, with
    each entry produced by ``format_date``.
    """
    # monthrange() yields (weekday of day 1, number of days); only the
    # day count is needed here.
    _, day_count = calendar.monthrange(year, month)
    iso_dates = []
    for day in range(1, day_count + 1):
        iso_dates.append(format_date(year, month, day))
    return iso_dates
def build_save_path(iso_date, url):
    """Return the local path under ``SAVE_DIR`` for the image at ``url``.

    The file name is the last path component of ``url`` prefixed with
    ``iso_date`` and an underscore.
    """
    filename = os.path.basename(url)
    dated_name = '_'.join((iso_date, filename))
    return os.path.join(SAVE_DIR, dated_name)
def save_one(iso_date, verbose):
    """Download the Picture of the Day for ``iso_date`` and save it to disk.

    The POTD page for the date is fetched, the first ``//upload.`` image
    URL in it is extracted, and that image is written to the path given by
    ``build_save_path`` (its directory is created on demand).

    ``iso_date`` is the date as ``'YYYY-MM-DD'`` text.  When ``verbose`` is
    true the destination path is printed before writing.

    Returns the size of the saved image in bytes.

    Raises NoPictureForDate if either the page request or the image
    request does not answer with HTTP status 200.
    """
    page_url = build_page_url(iso_date)
    response = fetch(page_url)
    if response.status_code != 200:
        msg = NoPictureForDate.__doc__.format(day=iso_date)
        raise NoPictureForDate(msg)
    img_url = extract_image_url(response.text)
    response = fetch(img_url)
    if response.status_code != 200:
        # Without this check an HTTP error body would be written to disk
        # and counted as if it were the picture.
        msg = NoPictureForDate.__doc__.format(day=iso_date)
        raise NoPictureForDate(msg)
    path = build_save_path(iso_date, img_url)
    save_dir = os.path.dirname(path)
    if save_dir:
        try:
            os.makedirs(save_dir)
        except OSError:
            # The directory already existing (possibly created a moment ago
            # by a concurrent worker) is fine; anything else is an error.
            if not os.path.isdir(save_dir):
                raise
    if verbose:
        print('saving: '+path)
    with io.open(path, 'wb') as fp:
        fp.write(response.content)
    return len(response.content)
def save_month(year_month, verbose):
    """Download a month of Pictures of the Day, one after the other.

    ``year_month`` is ``'YYYY-MM'`` text; ``verbose`` is forwarded to
    ``save_one``.  Dates for which ``save_one`` raises NoPictureForDate are
    skipped silently.

    Returns ``(img_count, total_size)`` for the images that were saved.
    """
    year, month = map(int, year_month.split('-'))
    img_count = 0
    total_size = 0
    for iso_date in list_days_of_month(year, month):
        try:
            size = save_one(iso_date, verbose)
        except NoPictureForDate:
            continue
        total_size += size
        img_count += 1
    return img_count, total_size
def main(save_one=save_one, save_month=save_month):
    """Get "Picture of The Day" from English Wikipedia for a given date or month"""
    # The docstring above is also the argparse description (main.__doc__).
    # The two parameters let another module plug in its own download
    # functions while reusing this command-line driver.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument(
        'date',
        help='year, month and (optional) day in YYYY-MM-DD format')
    # NOTE(review): max_qty is parsed but never read in this function.
    parser.add_argument(
        '-q', '--max_qty', type=int,
        help='maximum number of files to download')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='display progress information')
    options = parser.parse_args()

    started = time.time()
    # A full-length date means a single day; anything else is handled as a
    # month.
    single_day = len(options.date) == len('YYYY-MM-DD')
    if not single_day:
        img_count, total_size = save_month(options.date, options.verbose)
    else:
        total_size = save_one(options.date, options.verbose)
        img_count = 1
    elapsed = time.time() - started

    summary = "images: %3d | total size: %6.1f Kbytes | elapsed time: %3ds"
    print(summary % (img_count, total_size/1024.0, elapsed))
# When run as a script, start the command-line driver with the sequential
# save_one / save_month defined in this module.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,96 @@
import unittest
import potd
class TestSequenceFunctions(unittest.TestCase):
    """Tests for the helper functions of the ``potd`` module.

    The tests that call ``potd.fetch`` request live Wikipedia / Wikimedia
    URLs; the remaining tests only exercise pure helpers.
    """

    def setUp(self):
        # Thumbnail URL that the HTML fixture is expected to yield.
        self.thumb_url = ("""http://upload.wikimedia.org/wikipedia/"""
                          """commons/thumb/f/fe/Orthographic_projection_SW.jpg/350px"""
                          """-Orthographic_projection_SW.jpg""")

    def test_build_page_url(self):
        """The page URL is the base URL followed by the ISO date."""
        date = '2014-05-01'
        result = potd.build_page_url(date)
        self.assertEqual(result, 'http://en.wikipedia.org/wiki/Template:POTD/2014-05-01')

    def test_fetch_status_code(self):
        """Fetching the page of a past date answers 200."""
        date = '2014-05-02'
        url = potd.build_page_url(date)
        response = potd.fetch(url)
        self.assertEqual(response.status_code, 200)

    def test_fetch_status_code_not_found(self):
        """Fetching the page of a far-future date answers 404."""
        date = '2100-01-01'
        url = potd.build_page_url(date)
        response = potd.fetch(url)
        self.assertEqual(response.status_code, 404)

    def test_extract_image_url(self):
        """The thumbnail URL is extracted from the HTML fixture."""
        image_url = potd.extract_image_url(HTML)
        self.assertEqual(image_url, self.thumb_url)

    def test_fetch_image_jpeg(self):
        """Fetching the thumbnail URL returns a JPEG content type."""
        response = potd.fetch(self.thumb_url)
        self.assertEqual(response.headers['content-type'], 'image/jpeg')

    def test_list_days_of_month(self):
        """A 31-day month lists all of its days in order."""
        year = 2014
        month = 5
        days = potd.list_days_of_month(year, month)
        self.assertEqual(len(days), 31)
        self.assertEqual('2014-05-01', days[0])
        self.assertEqual('2014-05-31', days[-1])

    def test_list_days_of_february(self):
        """February of a non-leap year lists 28 days."""
        year = 2014
        month = 2
        days = potd.list_days_of_month(year, month)
        self.assertEqual(len(days), 28)
        self.assertEqual('2014-02-01', days[0])
        self.assertEqual('2014-02-28', days[-1])

    def test_format_date(self):
        """Month and day are zero-padded to two digits."""
        year = 2014
        month = 2
        day = 1
        a_date = '2014-02-01'
        date = potd.format_date(year, month, day)
        self.assertEqual(a_date, date)
        self.assertEqual(potd.format_date(2010, 11, 12), '2010-11-12')

    def test_build_save_path(self):
        """The save path is SAVE_DIR + date + '_' + the URL's file name."""
        date = '2014-06-04'
        path = potd.SAVE_DIR + date + '_350px-Orthographic_projection_SW.jpg'
        self.assertEqual(path, potd.build_save_path(date, self.thumb_url))
# HTML fixture fed to potd.extract_image_url() in test_extract_image_url;
# its first src="//upload..." attribute corresponds to the thumb_url built
# in setUp.  The literal is test data: keep its text exactly as is.
HTML = (
'''<td><a href="/wiki/File:Orthographic_projection_SW.jpg" class="image"
title="Orthographic projection"><img alt="Orthographic projection"
src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fe/O'''
'''rthographic_projection_SW.jpg/350px-Orthographic_projection_SW.jpg"
width="350" height="350" srcset="//upload.wikimedia.org/wikipedia/comm'''
'''ons/thumb/f/fe/Orthographic_projection_SW.jpg/525px-
Orthographic_projection_SW.jpg 1.5x, //upload.wikimedia.org/wikipedia/
commons/thumb/f/fe/Orthographic_projection_SW.jpg/700px-
Orthographic_projection_SW.jpg 2x" data-file-width="2058" data-file-
height="2058"></a></td>
''')
# Run every test in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()