chore: CI worfklows for linting docs#4076
Conversation
|
@d-v-b @chuckwondo this adds a fair amount of config/workflow files, so a review on the direction here would be welcome but not necessarily required |
| for line in text[directive_start:].splitlines()[1:]: | ||
| if line.strip() == "": | ||
| continue | ||
| if not line.startswith((" ", "\t")): | ||
| break | ||
| if MEMBERS_DISABLED_RE.match(line): | ||
| return True | ||
| return False |
There was a problem hiding this comment.
This is perhaps more readable:
| for line in text[directive_start:].splitlines()[1:]: | |
| if line.strip() == "": | |
| continue | |
| if not line.startswith((" ", "\t")): | |
| break | |
| if MEMBERS_DISABLED_RE.match(line): | |
| return True | |
| return False | |
| return any( | |
| MEMBERS_DISABLED_RE.match(line) | |
| for line in text[directive_start:].splitlines()[1:] | |
| if line.startswith((" ", "\t")) and line.strip() | |
| ) |
However, I cannot tell (yet, I'll keep reviewing) if text is the entirety of a python file, in which case, slicing it from some point to the end of the file for every directive encountered in the file seems highly inefficient (but perhaps isn't a noticeable performance problem, or I need to keep looking at your code to see this is not the case).
Although, I suspect that the suggestion above doesn't address the potentially larger issue of how this script is architected. In the end, however, this might be sufficient since this is simply a CI script, but I'll make more significant suggestions elsewhere.
| REPO_ROOT = Path(__file__).parent.parent.resolve() | ||
| API_DOCS_ROOT = REPO_ROOT / "docs" / "api" |
There was a problem hiding this comment.
I suggest you make this a CLI argument instead of hard-coding it here. Then modify the github workflow to call this script with "docs/api" as the argument since it will be invoked from the root of the repo.
| for md_file in sorted(API_DOCS_ROOT.rglob("*.md")): | ||
| text = md_file.read_text(encoding="utf-8") | ||
| for match in DIRECTIVE_RE.finditer(text): | ||
| obj = resolve(match.group("target")) | ||
| if obj is None: | ||
| continue | ||
| documented.add(id(obj)) | ||
| if isinstance(obj, ModuleType) and not members_disabled(text, match.start()): | ||
| member_names = getattr(obj, "__all__", None) or [ | ||
| name for name in dir(obj) if not name.startswith("_") | ||
| ] | ||
| for name in member_names: | ||
| member = getattr(obj, name, None) | ||
| if member is not None: | ||
| documented.add(id(member)) | ||
| return documented |
There was a problem hiding this comment.
This is nearly indecipherable, so it may be annoying to maintain.
I suggest refactoring this to take a pipeline approach, which I believe will not only make it more readable, but also allow for greater efficiency/speed (related to my comment on the members_disabled function), which seems to be a motivating factor based on your comments elsewhere.
A pipeline approach should allow you to avoid repeatedly splitting each file's text into individual lines every time a directive is encountered.
| missing = [] | ||
| for name in zarr.__all__: | ||
| if name in EXEMPT_EXPORTS: | ||
| continue | ||
| if id(getattr(zarr, name)) not in documented: | ||
| missing.append(name) | ||
| return sorted(missing) |
There was a problem hiding this comment.
Use the forcecomprehension, LukeMax!
| missing = [] | |
| for name in zarr.__all__: | |
| if name in EXEMPT_EXPORTS: | |
| continue | |
| if id(getattr(zarr, name)) not in documented: | |
| missing.append(name) | |
| return sorted(missing) | |
| return sorted( | |
| name | |
| for name in zarr.__all__ | |
| if name not in EXEMPT_EXPORTS | |
| and id(getattr(zarr, name)) not in documented | |
| ) |
| if not API_DOCS_ROOT.exists(): | ||
| raise FileNotFoundError(f"{API_DOCS_ROOT} does not exist.") |
There was a problem hiding this comment.
As I mentioned in another comment, take the docs root as an argument, rather than hard-coding it.
| lines = [ | ||
| f"Found {len(missing)} public export(s) in zarr.__all__ missing from the API " | ||
| "reference (docs/api/):\n", | ||
| ] | ||
| lines.extend(f" - zarr.{name}" for name in missing) | ||
| lines.append( | ||
| "\nAdd a `::: zarr.<name>` page under docs/api/zarr/ (and register it in " | ||
| "mkdocs.yml and docs/api/zarr/index.md), or -- if the export is intentionally " | ||
| "undocumented -- add it to EXEMPT_EXPORTS in this script with a reason." | ||
| ) | ||
| raise ValueError("\n".join(lines)) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() |
There was a problem hiding this comment.
I suggest following the pattern you used in lint_docs.py, where you print to stderr and use a non-zero exit code, rather than raising.
| findings: list[Finding] = [] | ||
| doc_nodes = (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef) | ||
| for node in ast.walk(tree): | ||
| if not isinstance(node, doc_nodes): | ||
| continue | ||
| docstring = ast.get_docstring(node, clean=False) | ||
| if not docstring: | ||
| continue | ||
| # node.body[0].value is the docstring literal; its lineno is the line the string | ||
| # opens on, so content line i maps to source line (start + i). | ||
| start = node.body[0].value.lineno # type: ignore[attr-defined] | ||
| for offset, line in enumerate(docstring.splitlines()): | ||
| findings.extend( | ||
| Finding(path, start + offset, category, line) for category in _scan_line(line) | ||
| ) | ||
| return findings |
There was a problem hiding this comment.
Haven't tested this, but I would argue this is more readable. I encourage you to embrace comprehensions over nested for loops with embedded conditionals. While the logic is essentially the same, the comprehension syntax tends to be more readable (even more natural-language-like).
To avoid having the comprehensions themselves become incomprehensible, I've also created 2 comprehensions in place of the single nested for loops.
There are several other places in this script where I encourage you to do similar refactorings to aid in readability and maintainability.
| findings: list[Finding] = [] | |
| doc_nodes = (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef) | |
| for node in ast.walk(tree): | |
| if not isinstance(node, doc_nodes): | |
| continue | |
| docstring = ast.get_docstring(node, clean=False) | |
| if not docstring: | |
| continue | |
| # node.body[0].value is the docstring literal; its lineno is the line the string | |
| # opens on, so content line i maps to source line (start + i). | |
| start = node.body[0].value.lineno # type: ignore[attr-defined] | |
| for offset, line in enumerate(docstring.splitlines()): | |
| findings.extend( | |
| Finding(path, start + offset, category, line) for category in _scan_line(line) | |
| ) | |
| return findings | |
| doc_nodes = (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef) | |
| docstring_start_pairs = [ | |
| # node.body[0].value is the docstring literal; its lineno is the line the string | |
| # opens on, so content line i maps to source line (start + i). | |
| (docstring, start = node.body[0].value.lineno) | |
| for node in ast.walk(tree) | |
| if isinstance(node, doc_nodes) | |
| if docstring := ast.get_docstring(node, clean=False) | |
| ] | |
| return [ | |
| Finding(path, start + offset, category, line) | |
| for docstring, start in docstring_start_pairs | |
| for offset, line in enumerate(docstring.splitlines()) | |
| for category in _scan_line(line) | |
| ] |
This PR adds a few scripts/workflows to:
TODO:
docs/user-guide/*.mdchanges/