minor refactoring to make it easier to call shorten()

short.py appends timestamps to short.htaccess
short.py now reads files and stdin
2025-05-22 14:28:42 -03:00 · 2025-05-22 13:44:46 -03:00 · 2025-05-22 13:24:50 -03:00 · 2025-05-22 10:05:54 -03:00
4 changed files with 111 additions and 45 deletions
--- a/links/sample-urls.txt
+++ b/links/sample-urls.txt
@@ -0,0 +1,47 @@
+https://www.oreilly.com/library/view/fluent-python-2nd/9781492056348/
+https://dask.org/
+http://example.com/1572039572038573208
+http://www.unicode.org/
+https://www.techcrunch.com/2024/startup-funding-trends
+https://blog.medium.com/writing-tips-for-beginners
+https://github.com/microsoft/typescript
+https://stackoverflow.com/questions/javascript-async-await
+https://www.reddit.com/r/programming/hot
+https://docs.google.com/spreadsheets/create
+https://www.youtube.com/watch?v=dQw4w9WgXcQ
+https://www.amazon.com/dp/B08N5WRWNW
+https://support.apple.com/iphone-setup-guide
+https://www.wikipedia.org/wiki/Machine_Learning
+https://www.linkedin.com/in/johndoe123
+https://www.instagram.com/p/CxYz123AbC/
+https://twitter.com/elonmusk/status/1234567890
+https://www.facebook.com/events/987654321
+https://drive.google.com/file/d/1AbCdEfGhIjKlMnOp/view
+https://www.dropbox.com/s/qwerty123/document.pdf
+https://zoom.us/j/1234567890?pwd=abcdef
+https://calendly.com/janedoe/30min-meeting
+https://www.shopify.com/admin/products/new
+https://stripe.com/docs/api/charges/create
+https://www.paypal.com/invoice/create
+https://mailchimp.com/campaigns/dashboard
+https://analytics.google.com/analytics/web/
+https://console.aws.amazon.com/s3/buckets
+https://portal.azure.com/dashboard
+https://www.figma.com/file/AbCdEf123456/design-system
+https://www.notion.so/workspace/project-notes
+https://trello.com/b/AbCdEfGh/marketing-board
+https://slack.com/app_redirect?channel=general
+https://discord.gg/AbCdEfGh123
+https://www.twitch.tv/streamername/videos
+https://www.spotify.com/playlist/37i9dQZF1DXcBWIGoYBM5M
+https://www.netflix.com/browse/genre/83
+https://www.hulu.com/series/breaking-bad-2008
+https://www.airbnb.com/rooms/12345678
+https://www.booking.com/hotel/us/grand-plaza.html
+https://www.expedia.com/flights/search?trip=roundtrip
+https://www.uber.com/ride/request
+https://www.doordash.com/store/pizza-palace-123
+https://www.grubhub.com/restaurant/tacos-el-rey-456
+https://www.zillow.com/homes/for_sale/San-Francisco-CA
+https://www.craigslist.org/about/sites
+https://www.python.org/dev/peps/pep-0484/
--- a/links/short.htaccess
+++ b/links/short.htaccess
@@ -1 +1 @@
-# file created and managed by short.py
+# content of short.htaccess file created and managed by short.py
--- a/links/short.py
+++ b/links/short.py
@@ -1,22 +1,41 @@
 #!/usr/bin/env python3

+"""
+short.py generates unique short URLs.
+
+This script reads lines from stdin or files named as arguments, then:
+
+1. retrieves or creates new short URLs, taking into account existing RedirectTemp
+   directives in custom.htaccess or short.htaccess;
+2. appends RedirectTemp directives for newly created short URLs to short.htaccess;
+3. outputs the list of (short, long) URLs retrieved or created.
+
+"""
+
+import fileinput
 import itertools
 from collections.abc import Iterator
+from time import strftime
+
+HTACCESS_CUSTOM = 'custom.htaccess'
+HTACCESS_SHORT = 'short.htaccess'
+HTACCESS_FILES = (HTACCESS_CUSTOM, HTACCESS_SHORT)
+BASE_DOMAIN = 'fpy.li'


-def load_redirects():
+def load_redirects() -> tuple[dict, dict]:
    redirects = {}
    targets = {}
-    for filename in ('custom.htaccess', 'short.htaccess'):
+    for filename in HTACCESS_FILES:
        with open(filename) as fp:
            for line in fp:
                if line.startswith('RedirectTemp'):
                    _, short, long = line.split()
                    short = short[1:]  # Remove leading slash
-                    assert short not in redirects, f"{filename}: duplicate redirect from {short}"
-                    # custom is live since 2022, we cannot change it remove duplicate targets
-                    if not filename.startswith('custom'):
-                        assert long not in targets, f"{filename}: Duplicate redirect to {long}"
+                    assert short not in redirects, f'{filename}: duplicate redirect from {short}'
+                    # custom.htaccess is live since 2022: we can't change it to remove duplicate targets
+                    if filename != HTACCESS_CUSTOM:
+                        assert long not in targets, f'{filename}: duplicate redirect to {long}'
                    redirects[short] = long
                    targets[long] = short
    return redirects, targets
@@ -25,55 +44,51 @@ def load_redirects():
 SDIGITS = '23456789abcdefghjkmnpqrstvwxyz'


-def gen_short() -> Iterator[str]:
-    """
-    Generate every possible sequence of SDIGITS.
-    """
-    length = 1
+def gen_short(start_len=1) -> Iterator[str]:
+    """Generate every possible sequence of SDIGITS, starting with start_len"""
+    length = start_len
    while True:
        for short in itertools.product(SDIGITS, repeat=length):
            yield ''.join(short)
        length += 1


-def shorten(n: int) -> str:
-    """
-    Get Nth short URL made from SDIGITS, where 0 is the first.
-    """
-    iter_short = gen_short()
-    for _ in range(n+1):
-        short = next(iter_short)
-    return short
-
-
-def gen_free_short(redirects: dict) -> Iterator[str]:
-    """
-    Generate next available short URL.
-    """
-    for short in gen_short():
+def gen_unused_short(redirects: dict) -> Iterator[str]:
+    """Generate next available short URL of len >= 2."""
+    for short in gen_short(2):
        if short not in redirects:
            yield short


-def new_urls(urls: list[str], redirects: dict, targets: dict) -> None:
-    iter_short = gen_free_short(redirects)
-    with open('short.htaccess', 'a') as fp:
-        for url in urls:
-            assert 'fpy.li' not in url, f"{url} is a fpy.li URL"
-            if url in targets:
-                continue
-            short = next(iter_short)
-            redirects[short] = url
-            targets[url] = short
-            fp.write(f"RedirectTemp /{short} {url}\n")
-
-
-def main():
-    from random import randrange
-    urls = [f'https://example.com/{randrange(100000)}.html' for n in range(7)]
-
+def shorten(urls: list[str]) -> list[tuple[str, str]]:
+    """Return (short, long) pairs, appending directives to HTACCESS_SHORT as needed."""
    redirects, targets = load_redirects()
-    new_urls(urls, redirects, targets)
+    iter_short = gen_unused_short(redirects)
+    pairs = []
+    timestamp = strftime('%Y-%m-%d %H:%M:%S')
+    with open(HTACCESS_SHORT, 'a') as fp:
+        for long in urls:
+            assert BASE_DOMAIN not in long, f'{long} is a {BASE_DOMAIN} URL'
+            if long in targets:
+                short = targets[long]
+            else:
+                short = next(iter_short)
+                redirects[short] = long
+                targets[long] = short
+                if timestamp:
+                    fp.write(f'\n# appended: {timestamp}\n')
+                    timestamp = None
+                fp.write(f'RedirectTemp /{short} {long}\n')
+            pairs.append((short, long))
+
+    return pairs
+
+
+def main() -> None:
+    """read URLS from filename arguments or stdin"""
+    urls = [line.strip() for line in fileinput.input(encoding='utf-8')]
+    for short, long in shorten(urls):
+        print(f'{BASE_DOMAIN}/{short}\t{long}')


 if __name__ == '__main__':
--- a/ruff.toml
+++ b/ruff.toml
@@ -0,0 +1,4 @@
+line-length = 100
+[format]
+# Like Python's repr(), use single quotes for strings.
+quote-style = "single"
Author	SHA1	Message	Date
Luciano Ramalho	cf99650007	minor refactoring to make it easier to call shorten()	2025-05-22 14:28:42 -03:00
Luciano Ramalho	ec03da74ca	short.py appends timestamps to short.htaccess	2025-05-22 13:44:46 -03:00
Luciano Ramalho	5b743b5bd7	short.py now reads files and stdin	2025-05-22 13:24:50 -03:00
Luciano Ramalho	648e9f6394	Update short.py to return list of URL substitutions	2025-05-22 10:05:54 -03:00