Add files via upload

Peter Norvig 2024-02-20 23:01:37 -08:00 committed by GitHub
parent 1ae17ba5b6
commit d5d31f932d
5 changed files with 7754 additions and 5900 deletions


@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -31,24 +31,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reading Data: `rides` and `yearly`\n",
"# Reading Data: `rides`, `yearly`, and `daily`\n",
"\n",
"I saved a bunch of my recorded [Strava](https://www.strava.com/athletes/575579) rides, most of them longer than 25 miles, as [`bikerides.tsv`](bikerides.tsv). The columns are: the date; the year; a title; the elapsed time of the ride; the length of the ride in miles; and the total climbing in feet, e.g.: \n",
"I saved a bunch of my recorded [Strava](https://www.strava.com/athletes/575579) rides, most of them longer than 25 miles, as [`bikerides.tsv`](bikerides.tsv). The tab-separated columns are: the date; the year; a title; the elapsed time of the ride; the length of the ride in miles; and the total climbing in feet, e.g.: \n",
"\n",
" Mon, 10/5\t2020\tHalf way around the bay on bay trail\t6:26:35\t80.05\t541\n",
" Mon, 10/5/2020\tHalf way around the bay on bay trail\t6:26:35\t80.05\t541\n",
" \n",
"I parse the file into the pandas dataframe `rides`, adding derived columns for miles per hour, vertical meters climbed per hour (VAM), grade in feet per mile, grade in percent, and kilometers ridden:"
]
},
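To make the derived columns concrete, here is a standalone sketch (not a cell from the notebook) applying the arithmetic of `add_ride_columns` below to the sample ride above:

    # Sample ride: 6:26:35 elapsed, 80.05 miles, 541 feet of climbing
    hours, miles, feet = 6.44, 80.05, 541       # 6:26:35 parsed to 6.44 decimal hours
    print(round(miles / hours, 2))              # mph    12.43
    print(round(feet / hours / 3.28084))        # vam    26    (vertical meters climbed per hour)
    print(round(feet / miles))                  # fpmi   7     (feet of climb per mile)
    print(round(feet / miles * 100 / 5280, 2))  # pct    0.13  (average grade in percent)
    print(round(miles * 1.609, 2))              # kms    128.8
    print(round(feet * 0.3048))                 # meters 165   (total meters climbed)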
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"def parse_rides(lines):\n",
" \"\"\"Parse a bikerides.tsv file.\"\"\"\n",
" return drop_index(add_columns(pd.read_table(lines, comment='#',\n",
" return drop_index(add_ride_columns(pd.read_table(lines, comment='#',\n",
" converters=dict(hours=parse_hours, feet=parse_int))))\n",
"\n",
"def parse_hours(time: str) -> float: \n",
@@ -57,18 +57,20 @@
" for i, x in enumerate(reversed(time.split(':'))))\n",
" return round(hrs, 2)\n",
"\n",
"def parse_int(field: str) -> int: return int(field.replace(',', ''))\n",
"def parse_int(field: str) -> int: return int(field.replace(',', '').replace('ft', '').replace('mi', ''))\n",
"\n",
"def add_columns(rides) -> pd.DataFrame:\n",
"def add_ride_columns(rides) -> pd.DataFrame:\n",
" \"\"\"Compute new columns from existing ones.\"\"\"\n",
" mi, hr, ft = rides['miles'], rides['hours'], rides['feet']\n",
" if 'date' in rides and 'year' not in rides:\n",
" rides.insert(1, \"year\", [int(str(d).split('/')[-1]) for d in rides['date'].tolist()])\n",
" return rides.assign(\n",
" mph=round(mi / hr, 2),\n",
" vam=round(ft / hr / 3.28084),\n",
" fpm=round(ft / mi),\n",
" fpmi=round(ft / mi),\n",
" pct=round(ft / mi * 100 / 5280, 2),\n",
" kms=round(mi * 1.609, 2),\n",
" km_up=round(ft * 0.0003048, 1))\n",
" meters=round(ft * 0.3048))\n",
"\n",
"def drop_index(frame) -> pd.DataFrame:\n",
" \"\"\"Drop the index column.\"\"\"\n",
@@ -78,15 +80,17 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"rides = parse_rides(open('bikerides.tsv'))\n",
"yearly = parse_rides(open('bikeyears.tsv')).drop(columns=['date', 'title'])\n",
"\n",
"yearly = parse_rides(open('bikeyears.tsv')).drop(columns='date')\n",
"\n",
"daily = yearly.copy()\n",
"for name in 'hours miles feet kms km_up'.split():\n",
" daily[name] = round(daily[name].map(lambda x: x / 350), 3 if name == 'km_up' else 1)"
"for name in 'hours miles feet kms meters'.split():\n",
" daily[name] = round(daily[name].map(lambda x: x / (6 * 52)), 1)"
]
},
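The divisor in the `daily` loop above is 6 * 52 = 312, presumably about six riding days a week over a year. A made-up yearly total shows the scale of the resulting averages:

    days = 6 * 52                  # 312 riding days per year
    print(round(5000 / days, 1))   # a hypothetical 5,000-mile year averages 16.0 miles per riding day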
{
@@ -106,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -118,30 +122,12 @@
" title, mi, ft, *times = segment.split(',')[:5]\n",
" for time in times:\n",
" records.append((title, parse_hours(time), float(mi), parse_int(ft)))\n",
" return add_columns(pd.DataFrame(records, columns=('title', 'hours', 'miles', 'feet')))"
" return add_ride_columns(pd.DataFrame(records, columns=('title', 'hours', 'miles', 'feet')))"
]
},
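Each line of `bikesegments.csv` is split into a title, distance, climb, and up to two recorded times, and each time becomes its own record. A hypothetical line (the real file is not shown in this diff) would be unpacked like this:

    segment = 'Old La Honda,3.35,1255,21:10,23:35'
    title, mi, ft, *times = segment.split(',')[:5]
    print(title, mi, ft, times)    # Old La Honda 3.35 1255 ['21:10', '23:35']  (two records)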
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"segments = parse_segments(open('bikesegments.csv'))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"places = pd.read_table(open('bikeplaceshort.csv'), sep=',', comment='#')"
]
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@@ -160,18 +146,25 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"segments = parse_segments(open('bikesegments.csv'))\n",
"\n",
"places = drop_index(pd.read_table(open('bikeplaceshort.csv'), sep=',', comment='#'))\n",
"\n",
"tiles = drop_index(pd.DataFrame(columns='date square cluster total comment'.split(), data=[\n",
" ('06/30/2023', 13, 689, 2640, 'Rides in east Bay!9298603815'),\n",
" ('04/14/2023', 13, 630, 2595, 'Black Sands Beach connects Marin to max cluster!8891171008'),\n",
" ('03/04/2023', 13, 583, 2574, 'Almaden rides connects Gilroy to max cluster!8654437264'),\n",
" ('10/22/2022', 13, 396, 2495, 'Alviso levees to get to 13x13 max square!8003921626'),\n",
" ('10/16/2022', 12, 393, 2492, 'Milpitas ride connects East Bay to max cluster!7974994605'),\n",
" ('09/08/2022', 11, 300, 2487, 'First started tracking tiles')])\n",
" ).style.format({'comment': make_clickable, 'date': link_date})"
" ('01/01/2024', 14, 1056, 3105, 'Start of this year'),\n",
" ('12/08/2023', 14, 1042, 3084, 'Benicia ride connects East Bay and Napa clusters!10350071201'),\n",
" ('11/05/2023', 14, 932, 2914, 'Alum Rock ride gets 14x14 max square!8850905872'),\n",
" ('06/30/2023', 13, 689, 2640, 'Rides in east Bay fill in holes!9298603815'),\n",
" ('04/14/2023', 13, 630, 2595, 'Black Sands Beach low-tide hike connects Marin to max cluster!8891171008'),\n",
" ('03/04/2023', 13, 583, 2574, 'Almaden rides connects Gilroy to max cluster!8654437264'),\n",
" ('10/22/2022', 13, 396, 2495, 'Alviso levees to get to 13x13 max square!8003921626'),\n",
" ('10/16/2022', 12, 393, 2492, 'Milpitas ride connects East Bay to max cluster!7974994605'),\n",
" ('09/08/2022', 11, 300, 2487, 'First started tracking tiles')])\n",
" ).style.format({'comment': make_clickable, 'date': link_date})"
]
},
{
@@ -183,7 +176,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
@@ -216,7 +209,7 @@
" \"\"\"Given a ride distance in miles and total climb in feet, estimate time in minutes.\"\"\"\n",
" return round(60 * miles / estimator(feet / miles))\n",
"\n",
"def top(frame, field, n=20): return frame.sort_values(field, ascending=False).head(n)"
"def top(frame, field, n=20): return drop_index(frame.sort_values(field, ascending=False).head(n))"
]
},
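The `estimator` that `estimate` calls is defined outside this hunk; it evidently maps a climbing rate in feet per mile to a predicted speed in mph. A hypothetical stand-in (not the notebook's fitted curve) shows the shape of the calculation:

    def estimator(fpmi: float) -> float:
        """Made-up linear fit, for illustration only: flat rides at about 15 mph."""
        return max(6, 15 - 0.06 * fpmi)

    def estimate(miles, feet) -> int:
        """Given a ride distance in miles and total climb in feet, estimate time in minutes."""
        return round(60 * miles / estimator(feet / miles))

    print(estimate(30, 2000))   # 2000/30 is about 67 ft/mi, so about 11 mph, so about 164 minutes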
{
@@ -228,23 +221,33 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"def mapl(f, *values): return list(map(f, *values))\n",
"\n",
"def wandering(places=places, by=['pct']):\n",
" \"All those who wander are not lost.\" # Also try by=['cat', 'pct']\n",
" frame = places.sort_values(by=by, ascending=('pct' not in by))\n",
" M = 1_000_000\n",
" for i, (name, miles, county, pct) in frame.iterrows():\n",
" # Some fiddling to get the format right\n",
" p = f'{pct:.1f}' if (pct > 0.1) else f'{pct:.3f}'\n",
" mymiles = pct / 100 * miles\n",
" done = f'{rounded(mymiles)}/{rounded(miles)} mi'\n",
" togo = next((f'{rounded(target / 100 * miles - mymiles):>5} mi for {target}%' \n",
" for target in (0.02, 0.1, 0.2, 1, 2, 25, 50, 90, 99)\n",
" if mymiles < target / 100 * miles), '')\n",
" print(f'{county} {p:>5}% {name:25} {done:>15} {togo}') \n",
" F = drop_index(places.sort_values(by=by, ascending=('pct' not in by)))\n",
" pd.set_option('display.max_rows', None)\n",
" return pd.DataFrame(\n",
" {'pct': [f'{p:.1f}%' if (p > 1) else f'{p:.3f}%' for p in F['pct']],\n",
" 'county': F['county'],\n",
" 'name': F['name'],\n",
" 'total': F['miles'],\n",
" 'done': mapl(rounded, F['miles'] * F['pct'] / 100),\n",
" 'to next badge': mapl(to_go, F['pct'], F['miles'])})\n",
"\n",
"\n",
"def to_go(pct, miles, targets=(0.02, 0.1, 0.2, 1, 2, 25, 50, 90, 99)):\n",
" \"\"\"Describe next target to hit to get a badge.\"\"\"\n",
" done = pct * miles / 100\n",
" return next((f'{rounded(target / 100 * miles - done):>5} mi to {target}%' \n",
" for target in targets\n",
" if done < target / 100 * miles), \n",
" '')\n",
" \n",
"def rounded(x: float) -> str: \n",
" \"\"\"Round x to 3 spaces wide (if possible).\"\"\"\n",
@@ -263,32 +266,36 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def make_leaders(data):\n",
" \"\"\"Make a dataframe of leaders in two counties.\"\"\"\n",
" leaders = pd.DataFrame(data, columns=['Name', 'Initials', 'SMC %', 'SCC %', 'Front?'])\n",
" leaders = pd.DataFrame(data, columns=['Name', 'Initials', 'SMC %', 'SCC %'])\n",
" leaders['SMC miles'] = [round(2814 * d[2] / 100) for d in data]\n",
" leaders['SCC miles'] = [round(7569 * d[3] / 100) for d in data]\n",
" leaders['Total miles'] = leaders['SMC miles'] + leaders['SCC miles']\n",
" leaders['Avg %'] = (leaders['SMC %'] + leaders['SCC %']) / 2\n",
" return drop_index(leaders.sort_values('Avg %', ascending=False))\n",
"\n",
"leaders = make_leaders([ # Data as of Sept 20, 2023 (Name, Initials, SMC, SCC, Frontier?)\n",
" ('Barry Mann', 'BM', 76.97, 30.21, 1), ('Jason Molenda', 'JM', 7.13, 55.39, 1), \n",
" ('Peter Norvig', 'PN', 61.56, 32.8, 1), ('Brian Feinberg', 'BF', 32.5, 43.68, 1),\n",
" ('Jim Brooks', 'JB', 4.23, 44.36, 0), ('Megan Gardner', 'MG', 97.62, 8.69, 1),\n",
" ('Matthew Ring', 'MR', 78.85, 1.48, 0), ('Elliot Hoff', 'EF', 52.88, 8.14, 0)])\n",
"leaders = make_leaders([ # Data as of Jan 3, 2024 (Name, Initials, SMC, SCC)\n",
" ('Megan Gardner', 'MG', 99.01, 13.6),\n",
" ('Barry Mann', 'BM', 77.41, 30.38), \n",
" ('Peter Norvig', 'PN', 63.5, 33.0),\n",
" ('Brian Feinberg', 'BF', 32.5, 43.9),\n",
" ('Jason Molenda', 'JM', 7.56, 56.25) \n",
" ])\n",
" \n",
"def pareto_front(leaders):\n",
" ax = leaders.plot('SMC %', 'SCC %', grid=True, kind='scatter')\n",
" front = sorted((x, y) for i, (_, _, x, y, f, *_) in leaders.iterrows() if f)\n",
" ax = leaders.plot('SMC %', 'SCC %', kind='scatter')\n",
" front = sorted((x, y) for i, (_, _, x, y, *_) in leaders.iterrows())\n",
" ax.plot(*zip(*front), ':'); ax.axis('square'); grid()\n",
" ax.set_xlabel('San Mateo County %')\n",
" ax.set_ylabel('Santa Clara County %')\n",
" for i, (name, initials, x, y, *_) in leaders.iterrows():\n",
" ax.text(x - 2, y + 2, initials)\n",
" return leaders.drop(columns=['Front?'])"
" return leaders"
]
},
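The hard-coded constants in `make_leaders` above are evidently the total road miles credited for each county: 2814 for San Mateo County and 7569 for Santa Clara County. A quick check of the derived columns for one row of the data:

    smc_pct, scc_pct = 63.5, 33.0           # the 'Peter Norvig' row
    print(round(2814 * smc_pct / 100))      # SMC miles: 1787
    print(round(7569 * scc_pct / 100))      # SCC miles: 2498
    print(1787 + 2498)                      # Total miles: 4285
    print((smc_pct + scc_pct) / 2)          # Avg %: 48.25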
{
@@ -300,7 +307,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@@ -314,16 +321,14 @@
" \"\"\"The number of rides needed to reach an Eddington number target.\"\"\"\n",
" return target - sum(distances >= target)\n",
"\n",
"def Ed_gaps(rides, E_km=100, E_mi=67, N=11) -> dict:\n",
"def Ed_gaps(rides, E_km=103, E_mi=69, N=9) -> dict:\n",
" \"\"\"A table of gaps to Eddington numbers by year.\"\"\"\n",
" data = [(E_km + d, sum(rides.kms >= E_km + d), Ed_gap(rides.kms, E_km + d), \n",
" E_mi + d, sum(rides.miles >= E_mi + d), Ed_gap(rides.miles, E_mi + d))\n",
" data = [(E_km + d, Ed_gap(rides.kms, E_km + d), E_mi + d, Ed_gap(rides.miles, E_mi + d))\n",
" for d in range(N)]\n",
" df = pd.DataFrame(data, columns=['kms', 'km rides', 'kms gap', \n",
" 'miles', 'miles rides', 'miles gap'])\n",
" df = pd.DataFrame(data, columns=['kms', 'kms gap', 'miles', 'miles gap'])\n",
" return drop_index(df)\n",
"\n",
"def Ed_progress(rides, years=range(2023, 2013, -1)) -> pd.DataFrame:\n",
"def Ed_progress(rides, years=range(2024, 2013, -1)) -> pd.DataFrame:\n",
" \"\"\"A table of Eddington numbers by year, and a plot.\"\"\"\n",
" def Ed(year, unit): return Ed_number(rides[rides['year'] <= year], unit)\n",
" data = [(y, Ed(y, 'kms'), Ed(y, 'miles')) for y in years]\n",
