updated wikipedia examples
This commit is contained in:
@@ -1,18 +1,24 @@
|
|||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
|
import urllib
|
||||||
|
|
||||||
from daypicts import get_picture_url, validate_date, gen_dates
|
import requests
|
||||||
|
|
||||||
|
from daypicts import get_picture_url, get_picture_urls
|
||||||
|
from daypicts import validate_date, gen_dates, picture_type
|
||||||
from daypicts import NoPictureForDate
|
from daypicts import NoPictureForDate
|
||||||
from daypicts import POTD_PATH
|
from daypicts import REMOTE_PICT_BASE_URL, PICT_EXCEPTIONS
|
||||||
|
|
||||||
FIXTURE_DIR = 'fixture/'
|
|
||||||
|
|
||||||
|
FIXTURE_DOC_DIR = 'fixture/docroot/'
|
||||||
|
FIXTURE_TEMPLATE_POTD_DIR = FIXTURE_DOC_DIR + 'Template-POTD/'
|
||||||
|
|
||||||
def parse_args(argv):
|
def parse_args(argv):
|
||||||
parser = argparse.ArgumentParser(description=main.__doc__)
|
parser = argparse.ArgumentParser(description=main.__doc__)
|
||||||
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
|
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
|
||||||
parser.add_argument('date', help=date_help)
|
parser.add_argument('date', help=date_help)
|
||||||
|
parser.add_argument('-u', '--url_only', action='store_true',
|
||||||
|
help='get picture URLS only')
|
||||||
|
|
||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
@@ -47,18 +53,45 @@ def save_picture_urls(dates, save_path):
|
|||||||
fp.write(snippet)
|
fp.write(snippet)
|
||||||
|
|
||||||
|
|
||||||
|
def save_pictures(dates, save_path, verbose=False):
    """Download the picture for each date and store it under *save_path*.

    Every URL reported by ``get_picture_urls`` is fetched and written to a
    path that mirrors the remote layout: the part of the URL that follows
    ``REMOTE_PICT_BASE_URL``, percent-decoded, joined onto *save_path*.
    The path of each file written is printed.

    Args:
        dates: dates handed to ``get_picture_urls``.
        save_path: root directory that receives the downloaded files.
        verbose: passed through to ``get_picture_urls``.

    Returns:
        List of the URLs whose content was saved.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    urls_ok = []
    for date, url in get_picture_urls(dates, verbose):
        response = requests.get(url)
        octets = response.content
        relative_path = url.replace(REMOTE_PICT_BASE_URL, '')
        file_path = unquote(os.path.join(save_path, relative_path))

        # Dates listed in PICT_EXCEPTIONS skip the image sanity check, e.g.
        # http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
        if date not in PICT_EXCEPTIONS:
            assert picture_type(octets) is not None, url

        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)

        # Record the success; previously the returned list stayed empty.
        urls_ok.append(url)
        print(file_path)
    return urls_ok
|
||||||
|
|
||||||
|
|
||||||
def main(argv):
|
def main(argv):
|
||||||
"""Build test fixture from Wikipedia "POTD" data"""
|
"""Build test fixture from Wikipedia "POTD" data"""
|
||||||
|
|
||||||
save_path = os.path.join(FIXTURE_DIR,POTD_PATH)
|
|
||||||
try:
|
try:
|
||||||
os.makedirs(save_path)
|
os.makedirs(FIXTURE_TEMPLATE_POTD_DIR)
|
||||||
except FileExistsError:
|
except FileExistsError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
dates, args = parse_args(argv)
|
dates, args = parse_args(argv)
|
||||||
|
|
||||||
save_picture_urls(dates, save_path)
|
if args.url_only:
|
||||||
|
save_picture_urls(dates, FIXTURE_TEMPLATE_POTD_DIR)
|
||||||
|
else:
|
||||||
|
save_pictures(dates, FIXTURE_DOC_DIR)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main(sys.argv[1:])
|
main(sys.argv[1:])
|
||||||
|
|||||||
@@ -18,15 +18,23 @@ from 2007-01-01.
|
|||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
import imghdr
|
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
|
import os
|
||||||
|
import imghdr
|
||||||
|
import warnings
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
SAVE_DIR = 'pictures/'
|
SAVE_DIR = 'downloaded/'
|
||||||
POTD_PATH = 'Template:POTD/'
|
|
||||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/' + POTD_PATH
|
#POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||||
|
POTD_BASE_URL = 'http://127.0.0.1:8001/Template-POTD/'
|
||||||
|
|
||||||
|
REMOTE_PICT_BASE_URL = 'http://upload.wikimedia.org/wikipedia/'
|
||||||
|
LOCAL_PICT_BASE_URL = 'http://127.0.0.1:8001/'
|
||||||
|
PICT_BASE_URL = LOCAL_PICT_BASE_URL
|
||||||
|
|
||||||
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
|
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
|
||||||
PODT_EARLIEST_TEMPLATE = '2007-01-01'
|
PODT_EARLIEST_TEMPLATE = '2007-01-01'
|
||||||
|
|
||||||
@@ -35,12 +43,11 @@ RE_MONTH = RE_YEAR + r'-([01]\d)'
|
|||||||
RE_DATE = RE_MONTH + r'-([0-3]\d)'
|
RE_DATE = RE_MONTH + r'-([0-3]\d)'
|
||||||
ISO_DATE_FMT = '%Y-%m-%d'
|
ISO_DATE_FMT = '%Y-%m-%d'
|
||||||
|
|
||||||
DATEFORMS = [
|
PICT_EXCEPTIONS = {
|
||||||
('date', re.compile('^' + RE_DATE + '$')),
|
'2013-06-15', # .webm movie [1]
|
||||||
('month', re.compile('^' + RE_MONTH + '$')),
|
}
|
||||||
('year', re.compile('^' + RE_YEAR + '$'))
|
|
||||||
]
|
|
||||||
|
|
||||||
|
#[1] http://en.wikipedia.org/wiki/Template:POTD/2013-06-15
|
||||||
|
|
||||||
class NoPictureForDate(Exception):
|
class NoPictureForDate(Exception):
|
||||||
'''No Picture of the Day found for {iso_date}'''
|
'''No Picture of the Day found for {iso_date}'''
|
||||||
@@ -51,7 +58,8 @@ class NoPictureTemplateBefore(ValueError):
|
|||||||
|
|
||||||
|
|
||||||
def get_picture_url(iso_date):
|
def get_picture_url(iso_date):
|
||||||
page_url = POTD_BASE_URL+iso_date
|
page_url = POTD_BASE_URL + iso_date
|
||||||
|
print(page_url)
|
||||||
response = requests.get(page_url)
|
response = requests.get(page_url)
|
||||||
pict_url = POTD_IMAGE_RE.search(response.text)
|
pict_url = POTD_IMAGE_RE.search(response.text)
|
||||||
if pict_url is None:
|
if pict_url is None:
|
||||||
@@ -59,23 +67,6 @@ def get_picture_url(iso_date):
|
|||||||
return 'http:' + pict_url.group(1)
|
return 'http:' + pict_url.group(1)
|
||||||
|
|
||||||
|
|
||||||
def get_picture(iso_date):
|
|
||||||
pict_url = get_picture_url(iso_date)
|
|
||||||
response = requests.get(pict_url)
|
|
||||||
octets = response.content
|
|
||||||
return octets
|
|
||||||
|
|
||||||
|
|
||||||
def get_picture_type(octets):
|
|
||||||
pict_type = imghdr.what(None, octets)
|
|
||||||
if pict_type is None:
|
|
||||||
if (octets.startswith(b'<') and
|
|
||||||
b'<svg' in octets[:200] and
|
|
||||||
octets.rstrip().endswith(b'</svg>')):
|
|
||||||
pict_type = 'svg'
|
|
||||||
return pict_type
|
|
||||||
|
|
||||||
|
|
||||||
def validate_date(text):
|
def validate_date(text):
|
||||||
try:
|
try:
|
||||||
parts = [int(part) for part in text.split('-')]
|
parts = [int(part) for part in text.split('-')]
|
||||||
@@ -116,8 +107,8 @@ def gen_dates(iso_parts):
|
|||||||
yield iso_parts
|
yield iso_parts
|
||||||
|
|
||||||
|
|
||||||
def get_picture_urls(dates, verbose=False, save_fixture=False):
|
def get_picture_urls(dates, verbose=False):
|
||||||
urls = []
|
date_urls = []
|
||||||
count = 0
|
count = 0
|
||||||
for date in dates:
|
for date in dates:
|
||||||
try:
|
try:
|
||||||
@@ -132,8 +123,50 @@ def get_picture_urls(dates, verbose=False, save_fixture=False):
|
|||||||
print(url.split('/')[-1])
|
print(url.split('/')[-1])
|
||||||
else:
|
else:
|
||||||
print(url)
|
print(url)
|
||||||
urls.append(url)
|
date_urls.append((date, url))
|
||||||
return urls
|
return date_urls
|
||||||
|
|
||||||
|
|
||||||
|
def picture_type(octets):
    """Return a short name for the image format of *octets*, or None.

    Detection is delegated to ``imghdr.what``. When that yields nothing,
    the data is reported as ``'svg'`` if it looks like an SVG document:
    it starts with ``<``, contains ``<svg`` within its first 200 bytes and
    ends with ``</svg>``. Surrounding whitespace and a leading UTF-8 byte
    order mark are ignored for the SVG check.

    Args:
        octets: the raw bytes of the picture.

    Returns:
        The type name as a string, or None when the data is not recognized.
    """
    pict_type = imghdr.what(None, octets)
    if pict_type is not None:
        return pict_type

    # Editors may emit a BOM or blank lines before the markup; neither
    # changes whether the payload is an SVG document.
    utf8_bom = b'\xef\xbb\xbf'
    markup = octets[len(utf8_bom):] if octets.startswith(utf8_bom) else octets
    markup = markup.strip()
    if (markup.startswith(b'<')
            and b'<svg' in markup[:200]
            and markup.endswith(b'</svg>')):
        return 'svg'
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_pictures(dates, verbose=False):
    """Download the picture for each date into per-year folders of SAVE_DIR.

    URLs come from ``get_picture_urls``. When ``PICT_BASE_URL`` is set to
    ``LOCAL_PICT_BASE_URL``, the ``REMOTE_PICT_BASE_URL`` prefix of each URL
    is rewritten so the picture is fetched from the local server instead.
    Responses whose status is not 200 trigger a warning and are skipped.
    Each picture is written to ``SAVE_DIR/<year>/<file name>`` and the
    resulting path is printed.

    Args:
        dates: dates handed to ``get_picture_urls``; the year folder is the
            text before the first ``-`` of each date.
        verbose: passed through to ``get_picture_urls``.

    Returns:
        List of the URLs that were fetched and saved.
    """
    urls_ok = []
    os.makedirs(SAVE_DIR, exist_ok=True)
    # The base URLs are module constants, so this check is loop-invariant.
    use_local_mirror = PICT_BASE_URL == LOCAL_PICT_BASE_URL
    for date, url in get_picture_urls(dates, verbose):
        if use_local_mirror:
            url = url.replace(REMOTE_PICT_BASE_URL, PICT_BASE_URL)
        response = requests.get(url)
        if response.status_code != 200:
            warnings.warn('HTTP code {}: {}'.format(response.status_code, url))
            continue
        octets = response.content
        # Dates listed in PICT_EXCEPTIONS skip the image sanity check.
        if date not in PICT_EXCEPTIONS:
            assert picture_type(octets) is not None, url

        file_name = os.path.basename(url.replace(PICT_BASE_URL, ''))
        year_dir = os.path.join(SAVE_DIR, date.split('-')[0])
        file_path = os.path.join(year_dir, file_name)
        os.makedirs(year_dir, exist_ok=True)
        with open(file_path, 'wb') as fp:
            fp.write(octets)
        urls_ok.append(url)
        print(file_path)
    return urls_ok
|
||||||
|
|
||||||
|
|
||||||
def parse_args(argv):
|
def parse_args(argv):
|
||||||
@@ -175,7 +208,12 @@ def main(argv, get_picture_urls):
|
|||||||
|
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
||||||
urls = get_picture_urls(dates, args.verbose, args.fixture_save)
|
if args.url_only:
|
||||||
|
urls = get_picture_urls(dates, args.verbose)
|
||||||
|
else:
|
||||||
|
urls = get_pictures(dates, args.verbose)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
|
|||||||
@@ -7,13 +7,6 @@ import pytest
|
|||||||
from daypicts import *
|
from daypicts import *
|
||||||
|
|
||||||
|
|
||||||
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
|
|
||||||
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
|
|
||||||
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
|
|
||||||
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
|
|
||||||
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.network
|
@pytest.mark.network
|
||||||
def test_get_picture_url_existing():
|
def test_get_picture_url_existing():
|
||||||
url = get_picture_url('2012-01-01')
|
url = get_picture_url('2012-01-01')
|
||||||
@@ -28,19 +21,6 @@ def test_get_picture_url_not_existing():
|
|||||||
get_picture_url('2013-09-12')
|
get_picture_url('2013-09-12')
|
||||||
|
|
||||||
|
|
||||||
def test_get_picture_type_imghdr():
|
|
||||||
assert get_picture_type(GIF_MIN) == 'gif'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_picture_type_svg():
|
|
||||||
assert get_picture_type(SVG_MIN) == 'svg'
|
|
||||||
assert get_picture_type(SVG_XML_DECL) == 'svg'
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_picture_type_unknown():
|
|
||||||
assert get_picture_type(NOISE) is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_validate_full_date():
|
def test_validate_full_date():
|
||||||
parts = validate_date('2015-1-2')
|
parts = validate_date('2015-1-2')
|
||||||
assert parts == '2015-01-02'
|
assert parts == '2015-01-02'
|
||||||
@@ -85,3 +65,23 @@ def test_gen_year_dates_leap():
|
|||||||
dates = list(gen_year_dates('2012'))
|
dates = list(gen_year_dates('2012'))
|
||||||
assert len(dates) == 366
|
assert len(dates) == 366
|
||||||
assert dates[365] == '2012-12-31'
|
assert dates[365] == '2012-12-31'
|
||||||
|
|
||||||
|
|
||||||
|
GIF_MIN = (b'GIF89a\x01\x00\x01\x00\x00\xff\x00,\x00\x00'
|
||||||
|
b'\x00\x00\x01\x00\x01\x00\x00\x02\x00;')
|
||||||
|
SVG_MIN = b'<svg xmlns="http://www.w3.org/2000/svg"></svg>'
|
||||||
|
SVG_XML_DECL = b'<?xml version="1.0" encoding="UTF-8"?>' + SVG_MIN
|
||||||
|
NOISE = b'\xb0\x0bU\xbe]L\n\x92\xbe\xc6\xf65"\xcc\xa3\xe3'
|
||||||
|
|
||||||
|
def test_picture_type_imghdr():
    """A minimal GIF is identified through the imghdr-based detection."""
    detected = picture_type(GIF_MIN)
    assert detected == 'gif'
|
||||||
|
|
||||||
|
|
||||||
|
def test_picture_type_svg():
    """SVG data is recognized with and without an XML declaration."""
    for sample in (SVG_MIN, SVG_XML_DECL):
        assert picture_type(sample) == 'svg'
|
||||||
|
|
||||||
|
|
||||||
|
def test_picture_type_unknown():
    """Bytes that match no known format yield None."""
    detected = picture_type(NOISE)
    assert detected is None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user