diff --git a/build_docs.py b/build_docs.py
index d03d687..1e579be 100755
--- a/build_docs.py
+++ b/build_docs.py
@@ -23,6 +23,7 @@
from __future__ import annotations
import argparse
+import concurrent.futures
import dataclasses
import datetime as dt
import filecmp
@@ -1262,21 +1263,41 @@ def proofread_canonicals(
/3/whatsnew/3.11.html, which may not exist yet.
"""
logging.info("Checking canonical links...")
- canonical_re = re.compile(
- """"""
- )
- for file in www_root.glob("**/*.html"):
- html = file.read_text(encoding="UTF-8", errors="surrogateescape")
- canonical = canonical_re.search(html)
- if not canonical:
- continue
- target = canonical.group(1)
- if not (www_root / target).exists():
- logging.info("Removing broken canonical from %s to %s", file, target)
- html = html.replace(canonical.group(0), "")
- file.write_text(html, encoding="UTF-8", errors="surrogateescape")
- if not skip_cache_invalidation:
- purge(http, str(file).replace("/srv/docs.python.org/", ""))
+ worker_count = (os.cpu_count() or 1) + 2
+ with concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
+ futures = {
+ executor.submit(_check_canonical_rel, file, www_root)
+ for file in www_root.glob("**/*.html")
+ }
+ paths_to_purge = {
+ res.relative_to(www_root) # strip the leading /srv/docs.python.org
+ for fut in concurrent.futures.as_completed(futures)
+ if (res := fut.result()) is not None
+ }
+ if not skip_cache_invalidation:
+ purge(http, *paths_to_purge)
+
+
+def _check_canonical_rel(file: Path, www_root: Path):
+ # Check for a canonical relation link in the HTML.
+ # If one exists, ensure that the target exists
+ # or otherwise remove the canonical link element.
+ prefix = b''
+ pfx_len = len(prefix)
+ sfx_len = len(suffix)
+ html = file.read_bytes()
+ try:
+ start = html.index(prefix)
+ end = html.index(suffix, start + pfx_len)
+ except ValueError:
+ return None
+ target = html[start + pfx_len : end].decode(errors="surrogateescape")
+ if (www_root / target).exists():
+ return None
+ logging.info("Removing broken canonical from %s to %s", file, target)
+ file.write_bytes(html[:start] + html[end + sfx_len :])
+ return file
def purge(http: urllib3.PoolManager, *paths: Path | str) -> None: