from compression import gzip import concurrent.futures from pathlib import Path import html.parser import functools import argparse import json import sys import re IGNORED_ID_RE = re.compile( r""" index-\d+ | id\d+ | [_a-z]+_\d+ """, re.VERBOSE, ) class IDGatherer(html.parser.HTMLParser): def __init__(self, ids): super().__init__() self.__ids = ids def handle_starttag(self, tag, attrs): for name, value in attrs: if name == 'id': if not IGNORED_ID_RE.fullmatch(value): self.__ids.add(value) def get_ids_from_file(path): ids = set() gatherer = IDGatherer(ids) with path.open(encoding='utf-8') as file: while chunk := file.read(4096): gatherer.feed(chunk) return ids def gather_ids(htmldir, *, verbose_print): if not htmldir.joinpath('objects.inv').exists(): raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory') if sys._is_gil_enabled: pool = concurrent.futures.ProcessPoolExecutor() else: pool = concurrent.futures.ThreadPoolExecutor() tasks = {} for path in htmldir.glob('**/*.html'): relative_path = path.relative_to(htmldir) if '_static' in relative_path.parts: continue if 'whatsnew' in relative_path.parts: continue tasks[relative_path] = pool.submit(get_ids_from_file, path=path) ids_by_page = {} for relative_path, future in tasks.items(): verbose_print(relative_path) ids = future.result() ids_by_page[str(relative_path)] = ids verbose_print(f' - {len(ids)} ids found') common = set.intersection(*ids_by_page.values()) verbose_print(f'Filtering out {len(common)} common ids') for key, page_ids in ids_by_page.items(): ids_by_page[key] = sorted(page_ids - common) return ids_by_page def do_check(baseline, checked, excluded, *, verbose_print): successful = True for name, baseline_ids in sorted(baseline.items()): try: checked_ids = checked[name] except KeyError: successful = False print(f'{name}: (page missing)') print() else: missing_ids = set(baseline_ids) - set(checked_ids) if missing_ids: missing_ids = { a for a in missing_ids if not IGNORED_ID_RE.fullmatch(a) and (name, a) not in excluded } if missing_ids: successful = False for missing_id in sorted(missing_ids): print(f'{name}: {missing_id}') print() return successful def main(argv): parser = argparse.ArgumentParser() parser.add_argument( '-v', '--verbose', action='store_true', help='print out more information', ) subparsers = parser.add_subparsers(dest='command', required=True) collect = subparsers.add_parser( 'collect', help='collect IDs from a set of HTML files' ) collect.add_argument( 'htmldir', type=Path, help='directory with HTML documentation' ) collect.add_argument( '-o', '--outfile', help='File to save the result in; default /html-ids.json.gz', ) check = subparsers.add_parser('check', help='check two archives of IDs') check.add_argument( 'baseline_file', type=Path, help='file with baseline IDs' ) check.add_argument('checked_file', type=Path, help='file with checked IDs') check.add_argument( '-x', '--exclude-file', type=Path, help='file with IDs to exclude from the check', ) args = parser.parse_args(argv[1:]) if args.verbose: verbose_print = functools.partial(print, file=sys.stderr) else: def verbose_print(*args, **kwargs): """do nothing""" if args.command == 'collect': ids = gather_ids(args.htmldir, verbose_print=verbose_print) if args.outfile is None: args.outfile = args.htmldir / 'html-ids.json.gz' with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile: json.dump({'ids_by_page': ids}, zfile) if args.command == 'check': with gzip.open(args.baseline_file) as zfile: baseline = json.load(zfile)['ids_by_page'] with gzip.open(args.checked_file) as zfile: checked = json.load(zfile)['ids_by_page'] excluded = set() if args.exclude_file: with open(args.exclude_file, encoding='utf-8') as file: for line in file: line = line.strip() if line and not line.startswith('#'): name, sep, excluded_id = line.partition(':') if sep: excluded.add((name.strip(), excluded_id.strip())) if do_check(baseline, checked, excluded, verbose_print=verbose_print): verbose_print('All OK') else: sys.stdout.flush() print( 'ERROR: Removed IDs found', 'The above HTML IDs were removed from the documentation, ' + 'resulting in broken links. Please add them back.', sep='\n', file=sys.stderr, ) if args.exclude_file: print(f'Alternatively, add them to {args.exclude_file}.') sys.exit(1) if __name__ == '__main__': main(sys.argv)