concurrency examples
This commit is contained in:
64
concurrency/wikipedia/build_fixture.py
Normal file
64
concurrency/wikipedia/build_fixture.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from daypicts import get_picture_url, validate_date, gen_dates
|
||||
from daypicts import NoPictureForDate
|
||||
from daypicts import POTD_PATH
|
||||
|
||||
# Root directory for generated fixture files; main() joins it with POTD_PATH
# to build the directory the per-date files are written into.
FIXTURE_DIR = 'fixture/'
|
||||
|
||||
|
||||
def parse_args(argv):
    """Parse the command line and expand the requested date into ISO dates.

    Args:
        argv: argument strings, without the program name.

    Returns:
        A ``(dates, args)`` tuple: ``dates`` is the list produced by
        ``gen_dates`` for the validated date, ``args`` is the argparse
        namespace.

    Exits with status 2 when ``validate_date`` rejects the date with a
    ``ValueError``.
    """
    parser = argparse.ArgumentParser(description=main.__doc__)
    date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
    parser.add_argument('date', help=date_help)

    args = parser.parse_args(argv)

    try:
        iso_parts = validate_date(args.date)
    except ValueError as exc:
        # parser.error() prints the usage line and the message to stderr and
        # exits with status 2, so diagnostics never end up on stdout.
        parser.error(str(exc))

    dates = list(gen_dates(iso_parts))
    if len(dates) == 1:
        print('-> Date: ', dates[0])
    else:
        fmt = '-> {} days: {}...{}'
        print(fmt.format(len(dates), dates[0], dates[-1]))

    return dates, args
|
||||
|
||||
|
||||
def save_picture_urls(dates, save_path):
    """Write one fixture file per date under *save_path*.

    For each date the picture URL is looked up with ``get_picture_url``.
    When a URL is found, the file holds it rewritten as a ``src="//..."``
    attribute; when ``NoPictureForDate`` is raised, the file holds the
    ``repr`` of that exception. Each date and its snippet are also echoed
    to stdout. The file is named after the date.
    """
    for iso_date in dates:
        try:
            snippet = get_picture_url(iso_date).replace('http://', 'src="//') + '"'
        except NoPictureForDate as missing:
            snippet = repr(missing)

        # Same output as printing the date with end=' ' and then the snippet.
        print(iso_date, snippet)

        target = os.path.join(save_path, iso_date)
        with open(target, 'w') as fp:
            fp.write(snippet)
|
||||
|
||||
|
||||
def main(argv):
    """Build test fixture from Wikipedia "POTD" data"""
    # NOTE: parse_args() passes this docstring to argparse as the --help
    # description, so it is runtime text and is deliberately left as-is.

    # Parse and validate the command line first, so a bad date or --help
    # exits before anything is created on disk.
    dates, _args = parse_args(argv)

    save_path = os.path.join(FIXTURE_DIR, POTD_PATH)
    # exist_ok=True accepts an existing directory, but still raises if the
    # path exists and is not a directory.
    os.makedirs(save_path, exist_ok=True)

    save_picture_urls(dates, save_path)
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: pass everything after the program name to main().
    cli_args = sys.argv[1:]
    main(cli_args)
|
||||
@@ -25,7 +25,8 @@ import datetime
|
||||
import requests
|
||||
|
||||
SAVE_DIR = 'pictures/'
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/Template:POTD/'
|
||||
POTD_PATH = 'Template:POTD/'
|
||||
POTD_BASE_URL = 'http://en.wikipedia.org/wiki/' + POTD_PATH
|
||||
POTD_IMAGE_RE = re.compile(r'src="(//upload\..*?)"')
|
||||
PODT_EARLIEST_TEMPLATE = '2007-01-01'
|
||||
|
||||
@@ -84,7 +85,7 @@ def validate_date(text):
|
||||
test_parts = parts[:]
|
||||
while len(test_parts) < 3:
|
||||
test_parts.append(1)
|
||||
date = datetime.datetime(*(int(part) for part in test_parts))
|
||||
date = datetime.date(*(int(part) for part in test_parts))
|
||||
iso_date = date.strftime(ISO_DATE_FMT)
|
||||
iso_date = iso_date[:1+len(parts)*3]
|
||||
if iso_date < PODT_EARLIEST_TEMPLATE:
|
||||
@@ -95,7 +96,7 @@ def validate_date(text):
|
||||
def gen_month_dates(iso_month):
|
||||
first = datetime.datetime.strptime(iso_month+'-01', ISO_DATE_FMT)
|
||||
one_day = datetime.timedelta(days=1)
|
||||
date = first
|
||||
date = first.date()
|
||||
while date.month == first.month:
|
||||
yield date.strftime(ISO_DATE_FMT)
|
||||
date += one_day
|
||||
@@ -115,6 +116,26 @@ def gen_dates(iso_parts):
|
||||
yield iso_parts
|
||||
|
||||
|
||||
def get_picture_urls(dates, verbose=False, save_fixture=False):
    """Collect the picture URL for every date that has one.

    Dates for which ``get_picture_url`` raises ``NoPictureForDate`` are
    skipped (and reported when *verbose* is true). In verbose mode each hit
    is printed as a running count plus the last path segment of the URL;
    otherwise the full URL is printed. ``save_fixture`` is accepted but not
    used inside this function.

    Returns the list of URLs found, in the order of *dates*.
    """
    found = []
    for iso_date in dates:
        try:
            picture_url = get_picture_url(iso_date)
        except NoPictureForDate as missing:
            if verbose:
                print('*** {!r} ***'.format(missing))
        else:
            found.append(picture_url)
            if not verbose:
                print(picture_url)
            else:
                # len(found) is the 1-based count of URLs found so far.
                print(format(len(found), '3d'), end=' ')
                print(picture_url.split('/')[-1])
    return found
|
||||
|
||||
|
||||
def parse_args(argv):
|
||||
parser = argparse.ArgumentParser(description=main.__doc__)
|
||||
date_help = 'YYYY-MM-DD or YYYY-MM or YYYY: year, month and day'
|
||||
@@ -123,6 +144,8 @@ def parse_args(argv):
|
||||
help='maximum number of items to fetch')
|
||||
parser.add_argument('-u', '--url_only', action='store_true',
|
||||
help='get picture URLS only')
|
||||
parser.add_argument('-f', '--fixture_save', action='store_true',
|
||||
help='save data for local test fixture')
|
||||
parser.add_argument('-v', '--verbose', action='store_true',
|
||||
help='display progress information')
|
||||
args = parser.parse_args(argv)
|
||||
@@ -145,26 +168,6 @@ def parse_args(argv):
|
||||
return dates, args
|
||||
|
||||
|
||||
def get_picture_urls(dates, verbose=False):
    """Return the picture URLs for *dates*, skipping dates with no picture.

    A date is skipped when ``get_picture_url`` raises ``NoPictureForDate``;
    the exception is printed only when *verbose* is true. Each URL found is
    printed: in verbose mode as a running count followed by the last path
    segment, otherwise as the full URL.
    """
    collected = []
    for day in dates:
        try:
            link = get_picture_url(day)
        except NoPictureForDate as no_pict:
            if verbose:
                print('*** {!r} ***'.format(no_pict))
            continue

        # The position in the result list doubles as the running count.
        position = len(collected) + 1
        if verbose:
            file_name = link.split('/')[-1]
            print(format(position, '3d'), end=' ')
            print(file_name)
        else:
            print(link)
        collected.append(link)
    return collected
|
||||
|
||||
|
||||
def main(argv, get_picture_urls):
|
||||
"""Get Wikipedia "Picture of The Day" for date, month or year"""
|
||||
|
||||
@@ -172,7 +175,7 @@ def main(argv, get_picture_urls):
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
urls = get_picture_urls(dates, args.verbose)
|
||||
urls = get_picture_urls(dates, args.verbose, args.fixture_save)
|
||||
|
||||
elapsed = time.time() - t0
|
||||
if args.verbose:
|
||||
|
||||
@@ -6,19 +6,19 @@ import sys
|
||||
import asyncio
|
||||
import aiohttp
|
||||
|
||||
from daypicts import main
|
||||
from daypicts import NoPictureForDate
|
||||
from daypicts import POTD_BASE_URL
|
||||
from daypicts import POTD_IMAGE_RE
|
||||
from daypicts import main, NoPictureForDate
|
||||
from daypicts import POTD_BASE_URL, POTD_IMAGE_RE
|
||||
|
||||
GLOBAL_TIMEOUT = 300 # seconds
|
||||
MAX_CONCURRENT_REQUESTS = 30
|
||||
|
||||
|
||||
@asyncio.coroutine
|
||||
def get_picture_url(iso_date):
|
||||
def get_picture_url(iso_date, semaphore):
|
||||
page_url = POTD_BASE_URL+iso_date
|
||||
response = yield from aiohttp.request('GET', page_url)
|
||||
text = yield from response.text()
|
||||
with (yield from semaphore):
|
||||
response = yield from aiohttp.request('GET', page_url)
|
||||
text = yield from response.text()
|
||||
pict_url = POTD_IMAGE_RE.search(text)
|
||||
if pict_url is None:
|
||||
raise NoPictureForDate(iso_date)
|
||||
@@ -27,7 +27,8 @@ def get_picture_url(iso_date):
|
||||
|
||||
@asyncio.coroutine
|
||||
def get_picture_urls(dates, verbose=False):
|
||||
tasks = [get_picture_url(date) for date in dates]
|
||||
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
|
||||
tasks = [get_picture_url(date, semaphore) for date in dates]
|
||||
urls = []
|
||||
count = 0
|
||||
# get results as jobs are done
|
||||
|
||||
@@ -7,13 +7,12 @@ from concurrent import futures
|
||||
|
||||
from daypicts import main, get_picture_url, NoPictureForDate
|
||||
|
||||
MAX_NUM_THREADS = 400
|
||||
GLOBAL_TIMEOUT = 300 # seconds
|
||||
MAX_CONCURRENT_REQUESTS = 30
|
||||
|
||||
|
||||
def get_picture_urls(dates, verbose=False):
|
||||
num_threads = min(len(dates), MAX_NUM_THREADS)
|
||||
pool = futures.ThreadPoolExecutor(num_threads)
|
||||
pool = futures.ThreadPoolExecutor(MAX_CONCURRENT_REQUESTS)
|
||||
|
||||
pending = {}
|
||||
for date in dates:
|
||||
|
||||
BIN
concurrency/wikipedia/fixture/docroot.zip
Normal file
BIN
concurrency/wikipedia/fixture/docroot.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user