updated wikipedia examples

This commit is contained in:
Luciano Ramalho
2015-02-02 20:17:33 -02:00
parent a38ab9548b
commit c6df60bb66
3 changed files with 130 additions and 59 deletions

View File

@@ -1,18 +1,24 @@
import sys import sys
import argparse import argparse
import os import os
import urllib
from daypicts import get_picture_url, validate_date, gen_dates import requests
from daypicts import get_picture_url, get_picture_urls
from daypicts import validate_date, gen_dates, picture_type
from daypicts import NoPictureForDate from daypicts import NoPictureForDate
from daypicts import POTD_PATH from daypicts import REMOTE_PICT_BASE_URL, PICT_EXCEPTIONS
# Root directory of the generated test fixture.
FIXTURE_DIR = 'fixture/'
# Directory handed to save_pictures() as the root of the saved picture tree.
# NOTE(review): presumably served as the document root of the local test
# HTTP server -- confirm.
FIXTURE_DOC_DIR = 'fixture/docroot/'
# Directory created by main() and handed to save_picture_urls().
FIXTURE_TEMPLATE_POTD_DIR = FIXTURE_DOC_DIR + 'Template-POTD/'
def parse_args(argv): def parse_args(argv):
parser = argparse.ArgumentParser(description=main.__doc__) parser = argparse.ArgumentParser(description=main.__doc__)
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day' date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
parser.add_argument('date', help=date_help) parser.add_argument('date', help=date_help)
parser.add_argument('-u', '--url_only', action='store_true',
help='get picture URLS only')
args = parser.parse_args(argv) args = parser.parse_args(argv)
@@ -47,18 +53,45 @@ def save_picture_urls(dates, save_path):
fp.write(snippet) fp.write(snippet)
def save_pictures(dates, save_path, verbose=False):
    """Download the pictures for ``dates`` and store them under ``save_path``.

    Each picture is written to ``save_path`` joined with the part of its URL
    that follows ``REMOTE_PICT_BASE_URL`` (percent-escapes decoded), creating
    intermediate directories as needed. The path of every file written is
    printed.

    Args:
        dates: dates forwarded to ``get_picture_urls``.
        save_path: root directory of the saved picture tree.
        verbose: forwarded to ``get_picture_urls``.

    Returns:
        List of the URLs whose content was saved.

    Raises:
        AssertionError: if a download is not a recognised picture type and
            its date is not listed in ``PICT_EXCEPTIONS``.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse``
    # submodule is loaded, so import what is needed explicitly.
    from urllib.parse import unquote

    urls_ok = []
    for date, url in get_picture_urls(dates, verbose):
        response = requests.get(url)
        octets = response.content
        # Dates in PICT_EXCEPTIONS are known not to be images, e.g. the
        # .webm movie at http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
        if date not in PICT_EXCEPTIONS and picture_type(octets) is None:
            # Explicit raise: an ``assert`` is stripped under ``python -O``.
            raise AssertionError(url)
        relative_path = url.replace(REMOTE_PICT_BASE_URL, '')
        file_path = unquote(os.path.join(save_path, relative_path))
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        # Record the URL so the returned list reflects what was saved.
        urls_ok.append(url)
        print(file_path)
    return urls_ok
def main(argv): def main(argv):
"""Build test fixture from Wikipedia "POTD" data""" """Build test fixture from Wikipedia "POTD" data"""
save_path = os.path.join(FIXTURE_DIR,POTD_PATH)
try: try:
os.makedirs(save_path) os.makedirs(FIXTURE_TEMPLATE_POTD_DIR)
except FileExistsError: except FileExistsError:
pass pass
dates, args = parse_args(argv) dates, args = parse_args(argv)
save_picture_urls(dates, save_path) if args.url_only:
save_picture_urls(dates, FIXTURE_TEMPLATE_POTD_DIR)
else:
save_pictures(dates, FIXTURE_DOC_DIR)
if __name__ == '__main__': if __name__ == '__main__':
main(sys.argv[1:]) main(sys.argv[1:])

View File

@@ -18,15 +18,23 @@ from 2007-01-01.
import sys import sys
import argparse import argparse
import re import re
import imghdr
import time import time
import datetime import datetime
import os
import imghdr
import warnings
import requests import requests
SAVE_DIR = 'pictures/' SAVE_DIR = 'downloaded/'
POTD_PATH = 'Template:POTD/'
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/' + POTD_PATH #POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
POTD_BASE_URL = 'http://127.0.0.1:8001/Template-POTD/'
REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
LOCAL_PICT_BASE_URL = 'http://127.0.0.1:8001/'
PICT_BASE_URL = LOCAL_PICT_BASE_URL
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"') POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
PODT_EARLIEST_TEMPLATE = '2007-01-01' PODT_EARLIEST_TEMPLATE = '2007-01-01'
@@ -35,12 +43,11 @@ RE_MONTH = RE_YEAR + r'-([01]\d)'
RE_DATE = RE_MONTH + r'-([0-3]\d)' RE_DATE = RE_MONTH + r'-([0-3]\d)'
ISO_DATE_FMT = '%Y-%m-%d' ISO_DATE_FMT = '%Y-%m-%d'
DATEFORMS = [ PICT_EXCEPTIONS = {
('date', re.compile('^' + RE_DATE + '$')), '2013-06-15', # .webm movie [1]
('month', re.compile('^' + RE_MONTH + '$')), }
('year', re.compile('^' + RE_YEAR + '$'))
]
#[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
class NoPictureForDate(Exception): class NoPictureForDate(Exception):
'''No Picture of the Day found for {iso_date}''' '''No Picture of the Day found for {iso_date}'''
@@ -51,7 +58,8 @@ class NoPictureTemplateBefore(ValueError):
def get_picture_url(iso_date): def get_picture_url(iso_date):
page_url = POTD_BASE_URL+iso_date page_url = POTD_BASE_URL + iso_date
print(page_url)
response = requests.get(page_url) response = requests.get(page_url)
pict_url = POTD_IMAGE_RE.search(response.text) pict_url = POTD_IMAGE_RE.search(response.text)
if pict_url is None: if pict_url is None:
@@ -59,23 +67,6 @@ def get_picture_url(iso_date):
return 'http:' + pict_url.group(1) return 'http:' + pict_url.group(1)
def get_picture(iso_date):
    """Return the raw bytes of the picture of the day for ``iso_date``."""
    return requests.get(get_picture_url(iso_date)).content
def get_picture_type(octets):
    """Return the image type of ``octets``, or ``None`` if unrecognised.

    ``imghdr`` is consulted first. Data it cannot identify is reported as
    ``'svg'`` when it looks like an SVG document: markup starting with ``<``,
    an ``<svg`` tag within the first 200 bytes and a closing ``</svg>`` tag
    at the end. Surrounding whitespace and a leading UTF-8 byte order mark
    are ignored by the SVG check.
    """
    pict_type = imghdr.what(None, octets)
    if pict_type is not None:
        return pict_type
    markup = octets.strip()
    # Editors commonly prepend a UTF-8 BOM; skip it before sniffing.
    if markup.startswith(b'\xef\xbb\xbf'):
        markup = markup[3:].lstrip()
    looks_like_svg = (markup.startswith(b'<')
                      and b'<svg' in markup[:200]
                      and markup.endswith(b'</svg>'))
    return 'svg' if looks_like_svg else None
def validate_date(text): def validate_date(text):
try: try:
parts = [int(part) for part in text.split('-')] parts = [int(part) for part in text.split('-')]
@@ -116,8 +107,8 @@ def gen_dates(iso_parts):
yield iso_parts yield iso_parts
def get_picture_urls(dates, verbose=False, save_fixture=False): def get_picture_urls(dates, verbose=False):
urls = [] date_urls = []
count = 0 count = 0
for date in dates: for date in dates:
try: try:
@@ -132,8 +123,50 @@ def get_picture_urls(dates, verbose=False, save_fixture=False):
print(url.split('/')[-1]) print(url.split('/')[-1])
else: else:
print(url) print(url)
urls.append(url) date_urls.append((date, url))
return urls return date_urls
def picture_type(octets):
    """Return the image type of ``octets``, or ``None`` if unrecognised.

    ``imghdr`` is consulted first. Data it cannot identify is reported as
    ``'svg'`` when it looks like an SVG document: markup starting with ``<``,
    an ``<svg`` tag within the first 200 bytes and a closing ``</svg>`` tag
    at the end. Surrounding whitespace and a leading UTF-8 byte order mark
    are ignored by the SVG check.
    """
    pict_type = imghdr.what(None, octets)
    if pict_type is not None:
        return pict_type
    markup = octets.strip()
    # Editors commonly prepend a UTF-8 BOM; skip it before sniffing.
    if markup.startswith(b'\xef\xbb\xbf'):
        markup = markup[3:].lstrip()
    looks_like_svg = (markup.startswith(b'<')
                      and b'<svg' in markup[:200]
                      and markup.endswith(b'</svg>'))
    return 'svg' if looks_like_svg else None
def get_pictures(dates, verbose=False):
    """Download the pictures for ``dates`` into per-year folders of SAVE_DIR.

    Each picture is stored as ``SAVE_DIR/<year>/<file name>``, where the year
    is the first ``-``-separated field of its date. Responses with an HTTP
    status other than 200 are reported with a warning and skipped. The path
    of every file written is printed.

    Args:
        dates: dates forwarded to ``get_picture_urls``.
        verbose: forwarded to ``get_picture_urls``.

    Returns:
        List of the URLs that were downloaded and saved.

    Raises:
        AssertionError: if a download is not a recognised picture type and
            its date is not listed in ``PICT_EXCEPTIONS``.
    """
    urls_ok = []
    os.makedirs(SAVE_DIR, exist_ok=True)
    for date, url in get_picture_urls(dates, verbose):
        if PICT_BASE_URL == LOCAL_PICT_BASE_URL:
            # Pictures are configured to come from the local server, so
            # point the remote URL at it.
            url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
        response = requests.get(url)
        if response.status_code != 200:
            warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
            continue
        octets = response.content
        if date not in PICT_EXCEPTIONS and picture_type(octets) is None:
            # Explicit raise: an ``assert`` is stripped under ``python -O``.
            raise AssertionError(url)
        file_name = os.path.basename(url.replace(PICT_BASE_URL, ''))
        year_dir = os.path.join(SAVE_DIR, date.split('-')[0])
        file_path = os.path.join(year_dir, file_name)
        os.makedirs(year_dir, exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        urls_ok.append(url)
        print(file_path)
    return urls_ok
def parse_args(argv): def parse_args(argv):
@@ -175,7 +208,12 @@ def main(argv, get_picture_urls):
t0 = time.time() t0 = time.time()
urls = get_picture_urls(dates, args.verbose, args.fixture_save) if args.url_only:
urls = get_picture_urls(dates, args.verbose)
else:
urls = get_pictures(dates, args.verbose)
elapsed = time.time() - t0 elapsed = time.time() - t0
if args.verbose: if args.verbose:

View File

@@ -7,13 +7,6 @@ import pytest
from daypicts import * from daypicts import *
# Sample payloads for the picture type detection tests.

# Small byte string carrying the GIF89a signature; expected type: 'gif'.
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
# Empty <svg> element with only the SVG namespace; expected type: 'svg'.
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
# The same SVG preceded by an XML declaration; expected type: 'svg'.
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
# Bytes with no known image signature; expected type: None.
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
@pytest.mark.network @pytest.mark.network
def test_get_picture_url_existing(): def test_get_picture_url_existing():
url = get_picture_url('2012-01-01') url = get_picture_url('2012-01-01')
@@ -28,19 +21,6 @@ def test_get_picture_url_not_existing():
get_picture_url('2013-09-12') get_picture_url('2013-09-12')
def test_get_picture_type_imghdr():
    """The GIF sample is reported as 'gif'."""
    detected = get_picture_type(GIF_MIN)
    assert detected == 'gif'
def test_get_picture_type_svg():
    """SVG samples are detected with and without an XML declaration."""
    for sample in (SVG_MIN, SVG_XML_DECL):
        assert get_picture_type(sample) == 'svg'
def test_get_picture_type_unknown():
    """Unrecognisable bytes yield no picture type."""
    detected = get_picture_type(NOISE)
    assert detected is None
def test_validate_full_date(): def test_validate_full_date():
parts = validate_date('2015-1-2') parts = validate_date('2015-1-2')
assert parts == '2015-01-02' assert parts == '2015-01-02'
@@ -85,3 +65,23 @@ def test_gen_year_dates_leap():
dates = list(gen_year_dates('2012')) dates = list(gen_year_dates('2012'))
assert len(dates) == 366 assert len(dates) == 366
assert dates[365] == '2012-12-31' assert dates[365] == '2012-12-31'
# Sample payloads for the picture type detection tests.

# Small byte string carrying the GIF89a signature; expected type: 'gif'.
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
# Empty <svg> element with only the SVG namespace; expected type: 'svg'.
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
# The same SVG preceded by an XML declaration; expected type: 'svg'.
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
# Bytes with no known image signature; expected type: None.
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
def test_picture_type_imghdr():
    """The GIF sample is reported as 'gif'."""
    detected = picture_type(GIF_MIN)
    assert detected == 'gif'
def test_picture_type_svg():
    """SVG samples are detected with and without an XML declaration."""
    for sample in (SVG_MIN, SVG_XML_DECL):
        assert picture_type(sample) == 'svg'
def test_picture_type_unknown():
    """Unrecognisable bytes yield no picture type."""
    detected = picture_type(NOISE)
    assert detected is None