generate-wandb-python-reference/process_sdk_markdown.py at main · mdlinville/generate-wandb-python-reference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
"""
Post-process lazydocs-generated markdown: remove entire classes, methods,
functions, optionally `__init__` methods, **and individual attribute bullets**
flagged with
    <!-- lazydoc-ignore-class-attributes -->
"""
import os
import re
import argparse
import glob
from typing import List, Tuple


class MarkdownCleaner:
    """Clean up markdown text using a fixed set of compiled regexes.

    Cleaning happens in two stages (see ``clean_text``): unconditional
    substitutions, then removal of regex-matched blocks whose text contains
    one of the ``lazydoc-ignore`` HTML-comment markers.
    """

    # ------------------------------------------------------------------ #
    def __init__(self):
        # Stage 1: unconditional substitutions, applied in list order.
        # <a ...>text</a>  ->  text  (may span several lines)
        strip_anchor_tags = re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL)
        # Module title: drop the last dotted component of the quoted name.
        trim_module_title = re.compile(r'(# <kbd>module</kbd> `[\w\.]+)\.[\w]+`')
        # "**Global Variables**" section, up to the next header-like line.
        drop_global_variables = re.compile(
            r"\*\*Global Variables\*\*\n[-]+\n(?:(?!## |# <kbd>)[\s\S])*\n",
            re.MULTILINE
        )
        # <b>text</b>  ->  text  (single line only; no DOTALL here)
        strip_bold_tags = re.compile(r'<b>(.*?)</b>')
        # The "automatically generated via lazydocs" footer and its rule.
        drop_lazydocs_footer = re.compile(
            r'---\n+_This file was automatically generated via '
            r'\[lazydocs\]\([^)]+\)._\n*'
        )
        # "####" plus any following whitespace becomes "### ".
        promote_h4 = re.compile(r'####\s*')

        self.patterns: List[Tuple[re.Pattern, str]] = [
            (strip_anchor_tags, r'\1'),
            (trim_module_title, r'\1`'),
            (drop_global_variables, ""),
            (strip_bold_tags, r'\1'),
            (drop_lazydocs_footer, ""),
            (promote_h4, r'### '),
        ]

        # Stage 2: candidate blocks; each is dropped only when it contains
        # the marker that clean_text pairs it with.
        # Generic block: class header, method/function, or property section.
        self.block_pattern = re.compile(
            r"(?s)(## <kbd>class</kbd> `.*?`|"
            r"### <kbd>(?:method|function)</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?|"
            r"### <kbd>property</kbd> .*?\n\n.*?)(?=\n## |\n### |\Z)"
        )
        # A class section, running to the next class header or end of text.
        self.class_pattern = re.compile(r"(?s)## <kbd>class</kbd> `.*?`.*?(?=\n## <kbd>class</kbd>|$)")
        # A top-level function section with its fenced signature and body.
        self.function_pattern = re.compile(r"(?s)## <kbd>function</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\Z)")
        # An __init__ method section directly preceded by the init marker;
        # unlike the others, this one is removed unconditionally on match.
        self.init_pattern = re.compile(
            r"(?s)<!-- lazydoc-ignore-init: internal -->\s*"
            r"### <kbd>method</kbd> `.*?__init__.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\n### |\Z)"
        )
        # A classmethod section with its fenced signature and body.
        self.classmethod_pattern = re.compile(
            r"(?s)### <kbd>classmethod</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\n### |\Z)"
        )

        # One bullet item (indented by at most three spaces) together with
        # everything up to the next such bullet, the next header, or EOF.
        self.attr_block_pattern = re.compile(
            r"(?sm)^( {0,3}- .*?)"
            r"(?=\n {0,3}- |\n## |\n### |\Z)"
        )

    # ------------------------------------------------------------------ #
    def clean_text(self, markdown_text: str) -> str:
        """Return ``markdown_text`` with all cleaning passes applied.

        Order matters and is fixed: the simple substitutions run first, then
        the marker-driven block removals, then the ``__init__`` removal, and
        finally the attribute-bullet removal.
        """
        result = markdown_text

        # Stage 1: plain regex substitutions.
        for regex, replacement in self.patterns:
            result = regex.sub(replacement, result)

        # Stage 2: each marker is paired with the block shape it applies to.
        marker_passes = (
            ("<!-- lazydoc-ignore: internal -->", self.block_pattern),
            ("<!-- lazydoc-ignore-class: internal -->", self.class_pattern),
            ("<!-- lazydoc-ignore-function: internal -->", self.function_pattern),
            ("<!-- lazydoc-ignore-classmethod: internal -->", self.classmethod_pattern),
        )
        for marker, regex in marker_passes:
            result = self._remove_ignored_blocks(result, marker, regex)

        # The init pattern already embeds its marker, so a plain sub suffices.
        result = self.init_pattern.sub("", result)

        # Attribute bullets carrying the inline marker go last.
        return self._remove_ignored_blocks(
            result,
            "<!-- lazydoc-ignore-class-attributes -->",
            self.attr_block_pattern,
        )

    # ------------------------------------------------------------------ #
    def _remove_ignored_blocks(self, text: str, token: str, pattern: re.Pattern) -> str:
        """Delete every match of ``pattern`` in ``text`` that contains ``token``.

        Matches that do not contain the token are left exactly as they were.
        """
        def replacement(found: re.Match) -> str:
            block = found.group(0)
            if token in block:
                return ""
            return block

        return pattern.sub(replacement, text)


# ----------------------------------------------------------------------#
def process_text(markdown_text: str) -> str:
    """Clean ``markdown_text`` with a freshly constructed ``MarkdownCleaner``."""
    cleaner = MarkdownCleaner()
    return cleaner.clean_text(markdown_text)


def main(args):
    """Clean every ``*.md`` file in the output directory, rewriting it in place.

    Args:
        args: Parsed command-line namespace. Only ``output_directory`` is
            read; it is joined onto the current working directory to build
            the glob pattern.
    """
    md_glob = os.path.join(os.getcwd(), args.output_directory, "*.md")
    for filename in glob.glob(md_glob):
        # Pin the encoding so the result does not depend on the platform's
        # locale default (assumes the generated markdown is UTF-8).
        with open(filename, "r", encoding="utf-8") as f:
            text = f.read()

        # Clean BEFORE reopening for writing: mode "w" truncates the file as
        # soon as it is opened, so a failure during cleaning would otherwise
        # leave an empty file behind.
        cleaned = process_text(text)

        with open(filename, "w", encoding="utf-8") as f:
            f.write(cleaned)


if __name__ == "__main__":
    # Command-line entry point: one optional flag naming the directory
    # (resolved by main) whose markdown files get cleaned.
    cli = argparse.ArgumentParser(description="Post‑process lazydocs markdown.")
    cli.add_argument(
        "--output_directory",
        default="wandb_sdk_docs",
        help="Directory containing markdown files to process",
    )
    main(cli.parse_args())