""" Wikipedia Picture of the Day (POTD) download example Note: The earliest Pictures of the Day I've found are in this page: http://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/May_2004 However, I have not found Template:POTD/YYYY-MM-DD pages earlier than this: http://en.wikipedia.org/wiki/Template:POTD/2007-01-01 For simplicity, this script only retrieves pictures starting from 2007-01-01. """ import sys import argparse import re import time import datetime import os import imghdr import warnings import requests SAVE_DIR = 'downloaded/' HTTP_PORT = 8002 POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/' #POTD_BASE_URL = 'http://127.0.0.1:{}/Template-POTD/'.format(HTTP_PORT) REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/' #LOCAL_PICT_BASE_URL = 'http://127.0.0.1:{}/'.format(HTTP_PORT) LOCAL_PICT_BASE_URL = REMOTE_PICT_BASE_URL PICT_BASE_URL = REMOTE_PICT_BASE_URL POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"') PODT_EARLIEST_TEMPLATE = '2007-01-01' RE_YEAR = r'([12]\d{3})' RE_MONTH = RE_YEAR + r'-([01]\d)' RE_DATE = RE_MONTH + r'-([0-3]\d)' ISO_DATE_FMT = '%Y-%m-%d' PICT_EXCEPTIONS = { '2013-06-15', # .webm movie [1] } #[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15 class NoPictureForDate(Exception): '''No Picture of the Day found for {iso_date}''' class NoPictureTemplateBefore(ValueError): '''Template:POTD did not exist before PODT_EARLIEST_TEMPLATE''' def get_picture_url(iso_date): page_url = POTD_BASE_URL + iso_date print(page_url) response = requests.get(page_url) pict_url = POTD_IMAGE_RE.search(response.text) if pict_url is None: raise NoPictureForDate(iso_date) return 'http:' + pict_url.group(1) def validate_date(text): try: parts = [int(part) for part in text.split('-')] except ValueError: raise ValueError('date must use YYYY, YYYY-MM or YYYY-MM-DD format') test_parts = parts[:] while len(test_parts) < 3: test_parts.append(1) date = datetime.date(*(int(part) for part in test_parts)) iso_date = date.strftime(ISO_DATE_FMT) iso_date = iso_date[:1+len(parts)*3] if iso_date < PODT_EARLIEST_TEMPLATE: raise NoPictureTemplateBefore(PODT_EARLIEST_TEMPLATE) return iso_date def gen_month_dates(iso_month): first = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT) one_day = datetime.timedelta(days=1) date = first.date() while date.month == first.month: yield date.strftime(ISO_DATE_FMT) date += one_day def gen_year_dates(iso_year): for i in range(1, 13): yield from gen_month_dates(iso_year + '-{:02d}'.format(i)) def gen_dates(iso_parts): if len(iso_parts) == 4: yield from gen_year_dates(iso_parts) elif len(iso_parts) == 7: yield from gen_month_dates(iso_parts) else: yield iso_parts def get_picture_urls(dates, verbose=False): date_urls = [] count = 0 for date in dates: try: url = get_picture_url(date) except NoPictureForDate as exc: if verbose: print('*** {!r} ***'.format(exc)) continue count += 1 if verbose: print(format(count, '3d'), end=' ') print(url.split('/')[-1]) else: print(url) date_urls.append((date, url)) return date_urls def picture_type(octets): pict_type = imghdr.what(None, octets) if pict_type is None: if (octets.startswith(b'<') and b'')): pict_type = 'svg' return pict_type def get_pictures(dates, verbose=False): urls_ok = [] try: os.makedirs(SAVE_DIR) except FileExistsError: pass for date, url in get_picture_urls(dates, verbose): if PICT_BASE_URL == LOCAL_PICT_BASE_URL: url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL) response = requests.get(url) if response.status_code != 200: warnings.warn('HTTP code {}: 
{}'.format(response.status_code, url)) continue octets = response.content if date not in PICT_EXCEPTIONS: assert picture_type(octets) is not None, url file_path = url.replace(PICT_BASE_URL, '') file_name = os.path.basename(file_path) path = os.path.join(SAVE_DIR, date.split('-')[0]) file_path = os.path.join(path, file_name) #import pdb; pdb.set_trace() try: os.makedirs(path) except FileExistsError: pass with open(file_path, 'wb') as fp: fp.write(octets) urls_ok.append(url) print(file_path) return urls_ok def parse_args(argv): parser = argparse.ArgumentParser(description=main.__doc__) date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day' parser.add_argument('date', help=date_help) parser.add_argument('-q', '--max_qty', type=int, help='maximum number of items to fetch') parser.add_argument('-u', '--url_only', action='store_true', help='get picture URLS only') parser.add_argument('-f', '--fixture_save', action='store_true', help='save data for local test fixture') parser.add_argument('-v', '--verbose', action='store_true', help='display progress information') args = parser.parse_args(argv) try: iso_parts = validate_date(args.date) except ValueError as exc: print('error:', exc.args[0]) parser.print_usage() sys.exit(2) dates = list(gen_dates(iso_parts)) if args.verbose: if len(dates) == 1: print('-> Date: ', dates[0]) else: fmt = '-> {} days: {}...{}' print(fmt.format(len(dates), dates[0], dates[-1])) return dates, args def main(argv, get_picture_urls): """Get Wikipedia "Picture of The Day" for date, month or year""" dates, args = parse_args(argv) t0 = time.time() if args.url_only: urls = get_picture_urls(dates, args.verbose) else: urls = get_pictures(dates, args.verbose) elapsed = time.time() - t0 if args.verbose: print('-> found: {} pictures | elapsed time: {:.2f}s' .format(len(urls), elapsed)) if __name__ == '__main__': main(sys.argv[1:], get_picture_urls)
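
# Example command lines (illustrative only; the file name `daypicts.py` is an
# assumption, not given by this script):
#
#   $ python3 daypicts.py 2015-01-01        # download the POTD for a single day
#   $ python3 daypicts.py 2015-01 -v        # download a whole month, verbose
#   $ python3 daypicts.py 2015 -u           # print URLs only for a whole year
#
# Downloads are written under SAVE_DIR ('downloaded/'), one subdirectory per year.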