#!/usr/bin/env python3
"""
Script to identify docstring formatting issues in Python files.

Checks for:

1. Bullet lists (lines starting with *, -, +) without blank line before them
2. Numbered lists (lines starting with digits and .) without blank line before them
3. Code blocks (lines starting with >>>) without blank line before them
4. reStructuredText directives (lines starting with ..) without blank line before them

These are common reStructuredText/Sphinx formatting issues that can cause
documentation to render incorrectly.

The script attempts to avoid false positives by:

- Skipping content inside literal blocks (after :: markers)
- Ignoring items that follow Sphinx field markers (:param:, :Example:, etc.)
- Handling Python interactive session output (lines between >>> prompts)
- Recognizing indented continuations

Known limitations:

- May flag some valid trailing >>> prompts in code examples
- Line numbers are computed as "docstring start line + offset inside the
  docstring", so they drift when a docstring contains escape sequences that
  expand to newlines; the regex fallback is best-effort as well
- Some complex nested structures may not be handled perfectly

Usage::

    python check_docstring_formatting.py [paths...]
    python check_docstring_formatting.py -v [paths...]

If no paths are specified, defaults to ../python relative to this script.
"""

import argparse
import ast
import os
import re
import sys
from pathlib import Path

# Constructs that reStructuredText requires a blank line in front of.
# Order matters: the first pattern that matches a line decides its description.
_BLOCK_PATTERNS = (
    (re.compile(r'^\s*[\*\-\+]\s+'), 'bullet list item'),
    (re.compile(r'^\s*\d+\.\s+'), 'numbered list item'),
    (re.compile(r'^\s*>>>'), 'code block'),
    (re.compile(r'^\s*\.\.\s+'), 'reStructuredText directive'),
)

# Sphinx field markers (":param x:", ":Example:", ...) that may directly
# introduce a list or a code block.
_SPHINX_FIELD_RE = re.compile(r'^\s*:[A-Za-z_][A-Za-z0-9_]*:')

_LIST_DESCRIPTIONS = ('bullet list item', 'numbered list item')
_TRIPLE_QUOTES = ('"""', "'''")

# String prefixes (lower-cased) that may legally precede a docstring's quotes.
_DOCSTRING_PREFIXES = ('', 'r', 'u')

_GLOB_CHARS = '*?['

# Bound used when scanning backwards for the list a line may belong to.
_LIST_LOOKBACK = 10


def _indent_of(line):
    """Return the number of leading whitespace characters in *line*."""
    return len(line) - len(line.lstrip())


def _matches_any_pattern(line):
    """Return True if *line* starts any construct listed in _BLOCK_PATTERNS."""
    return any(regex.match(line) for regex, _ in _BLOCK_PATTERNS)


def _describe_line(line):
    """Return the description of the first pattern matching *line*, or None."""
    for regex, description in _BLOCK_PATTERNS:
        if regex.match(line):
            return description
    return None


def _find_docstring_opener(stripped):
    """
    Locate a triple quote that plausibly opens a docstring on a stripped line.

    Returns a ``(quote, index)`` tuple, or None when the line has no triple
    quote or the quote follows code (for example an assignment).
    """
    positions = [(stripped.find(q), q) for q in _TRIPLE_QUOTES if q in stripped]
    if not positions:
        return None

    # The left-most triple quote is the one that opens the string.
    idx, quote = min(positions)
    before = stripped[:idx].strip()

    # Accept a bare opener, a raw/unicode prefix, or a one-line "def f(): ...".
    if before.lower() in _DOCSTRING_PREFIXES or before.endswith(':'):
        return quote, idx
    return None


def get_docstrings_from_file_regex(filepath, content):
    """
    Fallback docstring extraction by line scanning, used when AST parsing fails.

    This handles files that use syntax the running interpreter cannot parse
    (for example match statements on an older Python).

    Args:
        filepath: Path of the file being scanned. Unused; kept so the
            signature parallels get_docstrings_from_file.
        content: Full text of the file.

    Returns:
        A list of ``(start_line, docstring_text, 'Unknown')`` tuples, where
        start_line is the 1-based line holding the opening quotes. The owner
        name is always 'Unknown' because no parsing is done. Empty or
        whitespace-only strings are skipped, and a string still open at end
        of file is dropped.
    """
    docstrings = []
    quote_style = None  # None while outside a docstring
    docstring_start = 0
    docstring_lines = []

    for line_number, line in enumerate(content.split('\n'), start=1):
        stripped = line.strip()

        if quote_style is not None:
            # Inside a multi-line docstring: collect until the closing quotes.
            if quote_style not in stripped:
                docstring_lines.append(line)
                continue

            docstring_lines.append(line[:line.find(quote_style)])
            full_docstring = '\n'.join(docstring_lines)
            if full_docstring.strip():
                docstrings.append((docstring_start, full_docstring, 'Unknown'))
            quote_style = None
            docstring_lines = []
            continue

        opener = _find_docstring_opener(stripped)
        if opener is None:
            continue

        quote, idx = opener
        after_start = stripped[idx + 3:]
        if quote in after_start:
            # Docstring opens and closes on the same line.
            docstring_content = after_start[:after_start.find(quote)]
            if docstring_content.strip():
                docstrings.append((line_number, docstring_content, 'Unknown'))
        else:
            quote_style = quote
            docstring_start = line_number
            docstring_lines = [after_start]

    return docstrings


def get_docstrings_from_file(filepath):
    """
    Extract all docstrings from a Python file with their line numbers.

    Args:
        filepath: Path (str or Path) of the file to read as UTF-8.

    Returns:
        A list of ``(start_line, docstring_text, owner_name)`` tuples.
        start_line is the line of the docstring statement, docstring_text is
        the raw (un-dedented) docstring, and owner_name is the function or
        class name, or 'module'. Returns an empty list, after printing a
        warning to stderr, when the file cannot be read or decoded. Falls
        back to get_docstrings_from_file_regex when the source cannot be
        parsed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Warning: could not read {filepath}: {exc}", file=sys.stderr)
        return []

    try:
        tree = ast.parse(content, filename=str(filepath))
    except (SyntaxError, ValueError):
        # SyntaxError: syntax newer than the running interpreter.
        # ValueError: raised by some Python versions for source with NUL bytes.
        return get_docstrings_from_file_regex(filepath, content)

    docstrings = []
    documentable = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)
    for node in ast.walk(tree):
        if not isinstance(node, documentable):
            continue

        # clean=False keeps the text line-for-line as written in the source so
        # offsets inside the docstring map back to file lines.
        docstring = ast.get_docstring(node, clean=False)
        if not docstring:
            continue

        # A docstring is by definition the first statement of the body. Using
        # its own lineno also gives the right answer for module docstrings
        # that follow a shebang or comment header.
        line_num = node.body[0].lineno
        node_name = 'module' if isinstance(node, ast.Module) else node.name
        docstrings.append((line_num, docstring, node_name))

    return docstrings


def _continues_or_nests_list(lines, index, current_indent):
    """
    Decide whether ``lines[index]`` belongs to a list that is already open.

    Scans backwards over a bounded window, skipping blank lines and lines
    indented deeper than the current one. The first line found at the same or
    a smaller indent decides: a pattern match means the current item continues
    that list or is nested under it, anything else is plain introductory text.
    """
    # The stop value is exclusive, so line 0 is never examined.
    for j in range(index - 1, max(0, index - _LIST_LOOKBACK), -1):
        candidate = lines[j]
        if not candidate.strip():
            continue
        if _indent_of(candidate) <= current_indent:
            return _matches_any_pattern(candidate)
    return False


def check_docstring_formatting(docstring):
    """
    Check for formatting issues in a docstring.

    Args:
        docstring: Raw docstring text (not dedented).

    Returns:
        A list of ``(line_offset, issue_description)`` tuples. line_offset is
        the 1-based index of the offending line inside the docstring, so the
        docstring's first line has offset 1. At most one issue is reported
        per line.
    """
    issues = []
    lines = docstring.split('\n')

    in_literal_block = False
    literal_block_indent = 0

    # The first line sits next to the opening quotes and is never checked.
    for i in range(1, len(lines)):
        line = lines[i]
        prev_line = lines[i - 1]
        current_indent = _indent_of(line)

        # A line ending in "::" opens a literal block.
        if prev_line.rstrip().endswith('::'):
            in_literal_block = True
            literal_block_indent = _indent_of(prev_line)

        # The first non-blank line that is not indented deeper than the "::"
        # line closes the literal block.
        if in_literal_block and line.strip() and current_indent <= literal_block_indent:
            in_literal_block = False

        if in_literal_block:
            continue

        description = _describe_line(line)
        if description is None:
            continue

        prev_stripped = prev_line.strip()
        if not prev_stripped:
            continue  # Properly preceded by a blank line

        prev_indent = _indent_of(prev_line)

        if description == 'code block':
            # Prompts after another prompt, after a "..." continuation, or
            # after any indented line are treated as part of a running
            # interactive session.
            if prev_stripped.startswith(('...', '>>>')) or prev_indent > 0:
                continue

        if _matches_any_pattern(prev_line):
            continue  # Previous line is itself a list item / prompt / directive
        if _SPHINX_FIELD_RE.match(prev_line):
            continue  # Directly under a Sphinx field such as ":Example:"
        if current_indent > prev_indent:
            continue  # Indented continuation of the previous line
        if (description in _LIST_DESCRIPTIONS
                and _continues_or_nests_list(lines, i, current_indent)):
            continue  # Follows wrapped text of an earlier item, or is nested

        issues.append((i + 1, f"{description} without blank line before it"))

    return issues


def find_python_files(root_dir):
    """Return a list of all ``*.py`` files found recursively under *root_dir*."""
    root = Path(root_dir)
    return list(root.rglob('*.py'))


def find_issues_in_file(filepath):
    """
    Run the formatting checks on every docstring of one file.

    Returns:
        A list of ``(file_line, issue_description, owner_name)`` tuples in
        discovery order.
    """
    file_issues = []
    for doc_line_num, docstring, node_name in get_docstrings_from_file(filepath):
        for line_offset, issue_desc in check_docstring_formatting(docstring):
            # line_offset is 1-based within the docstring and doc_line_num is
            # the line of the docstring's first line, hence the "- 1".
            abs_line = doc_line_num + line_offset - 1
            file_issues.append((abs_line, issue_desc, node_name))
    return file_issues


def _expand_glob(pattern):
    """Expand a glob pattern; relative patterns are resolved against the cwd."""
    pattern_path = Path(pattern)
    if pattern_path.is_absolute():
        # Path.glob() rejects absolute patterns, so glob from the anchor.
        anchor = Path(pattern_path.anchor)
        return list(anchor.glob(str(pattern_path.relative_to(anchor))))
    return list(Path.cwd().glob(pattern))


def _python_files_for(path):
    """Return the Python files denoted by an existing path, or None if invalid."""
    if path.is_dir():
        return find_python_files(path)
    if path.is_file() and path.suffix == '.py':
        return [path]
    return None


def _collect_files(paths):
    """Turn command-line path arguments (files, directories, globs) into files."""
    files_to_check = []
    for path_str in paths:
        path = Path(path_str)
        has_glob_chars = any(c in path_str for c in _GLOB_CHARS)

        # A path that exists literally wins over a glob interpretation, so
        # file names containing "[" can still be passed directly.
        if has_glob_chars and not path.exists():
            matches = _expand_glob(path_str)
            if not matches:
                print(f"Warning: {path_str} did not match any files", file=sys.stderr)
            for match in matches:
                # Non-Python matches of a glob are skipped silently.
                files_to_check.extend(_python_files_for(match) or [])
            continue

        found = _python_files_for(path)
        if found is None:
            print(f"Warning: {path_str} is not a valid Python file or directory", file=sys.stderr)
        else:
            files_to_check.extend(found)
    return files_to_check


def main(argv=None):
    """
    Command-line entry point.

    Args:
        argv: Argument list to parse. None (the default) makes argparse read
            sys.argv[1:].

    Returns:
        1 if any issue was found, 0 otherwise. Exits with status 1 via
        sys.exit when no paths are given and the default directory is missing.
    """
    parser = argparse.ArgumentParser(
        description='Check Python docstrings for formatting issues.'
    )
    parser.add_argument(
        'paths',
        nargs='*',
        help='Files or directories to check (default: python/ directory)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Show all files being checked, not just files with issues'
    )
    args = parser.parse_args(argv)

    # Determine what to check
    if args.paths:
        files_to_check = _collect_files(args.paths)
    else:
        # resolve() first: with a bare relative __file__, .parent.parent
        # would otherwise collapse to "." instead of the parent directory.
        script_dir = Path(__file__).resolve().parent
        python_dir = script_dir.parent / 'python'
        if not python_dir.exists():
            print(f"Error: Directory {python_dir} does not exist", file=sys.stderr)
            sys.exit(1)
        # Exclude the examples subfolder. Only components below python_dir
        # are inspected so an ancestor directory named "examples" is harmless.
        files_to_check = [
            f for f in find_python_files(python_dir)
            if 'examples' not in f.relative_to(python_dir).parts
        ]

    if args.verbose:
        print("Checking Python files...")
        print("=" * 80)

    files_with_issues = 0
    total_issues = 0

    # set(): overlapping arguments must not report the same file twice.
    for py_file in sorted(set(files_to_check)):
        if args.verbose:
            print(f"Checking {py_file}...", end='', flush=True)

        file_issues = find_issues_in_file(py_file)

        if file_issues:
            files_with_issues += 1
            total_issues += len(file_issues)

            if args.verbose:
                print(f" {len(file_issues)} issue(s) found")
            else:
                print(f"{py_file}: {len(file_issues)} issue(s) found")

            for line_num, issue_desc, node_type in sorted(file_issues):
                print(f" Line ~{line_num} ({node_type}): {issue_desc}")
        elif args.verbose:
            print(" OK")

    if args.verbose:
        print("=" * 80)

    if total_issues > 0:
        print(f"\nFound {total_issues} issue(s) in {files_with_issues} file(s)")
        return 1

    print("No issues found!")
    return 0


if __name__ == '__main__':
    sys.exit(main())