website/generate_post.py at main · MachinaIO/website · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
"""
generate_post.py

Usage:
    python generate_post.py <markdown_file.md> <output.html>

This script converts a markdown file to HTML content suitable for insertion
inside a <div class="content"> element, preserving LaTeX formatting.
"""

import sys
import re
import yaml
import markdown
from pathlib import Path
import os

def parse_markdown(path):
    """Parse a markdown file into YAML front matter and markdown body.

    Args:
        path: Path of the markdown file; it is read as UTF-8 text.

    Returns:
        A ``(front_matter, body)`` tuple. ``front_matter`` is the mapping
        loaded from a leading ``---``-delimited YAML block, or ``{}`` when the
        file has no such block or the block does not load as a mapping (for
        example because it is empty). ``body`` is the remaining markdown text
        with stray ``</p>``-only lines removed.
    """
    # Use a context manager so the handle is closed deterministically rather
    # than whenever the garbage collector gets to it.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split YAML front-matter
    fm_pattern = r'^---\s*\n(.*?)\n---\s*\n(.*)$'
    m = re.match(fm_pattern, text, flags=re.S)
    if m:
        fm = yaml.safe_load(m.group(1))
        # safe_load yields None for an empty block and a scalar/list for other
        # non-mapping YAML; callers rely on dict methods such as .get().
        if not isinstance(fm, dict):
            fm = {}
        body_md = m.group(2)
    else:
        fm = {}
        body_md = text

    # **Sanitize**: remove any literal stray </p> lines in the markdown
    body_md = re.sub(r'(?m)^\s*</p>\s*$', '', body_md)

    return fm, body_md

def _escape_math_for_html(tex):
    """Escape HTML-special characters in LaTeX source embedded in HTML.

    ``<`` and ``>`` are always replaced by entities. ``&`` is replaced unless
    it already starts an entity-like sequence (``&name;`` or ``&#123;``), so
    math that was authored with entities is not double-escaped.
    """
    tex = re.sub(r'&(?!#?\w+;)', '&amp;', tex)
    return tex.replace('<', '&lt;').replace('>', '&gt;')


def convert_markdown_to_html(markdown_content):
    """Convert markdown content to HTML, preserving LaTeX blocks.

    Display math (``$$...$$``) and inline math (``$...$``) are swapped for
    alphanumeric placeholders before the markdown conversion so the converter
    does not interpret characters inside the formulas, and are put back
    afterwards, wrapped in their original delimiters.

    Args:
        markdown_content: Markdown source text.

    Returns:
        The HTML produced by the ``markdown`` package (``extra`` extension)
        with the math restored. HTML-special characters inside the restored
        LaTeX are entity-escaped so a formula such as ``$a<b$`` cannot be
        parsed as the start of a tag.
    """
    # Save display math blocks ($$...$$)
    display_math_blocks = []

    def save_display_math(match):
        display_math_blocks.append(match.group(1))
        return f"DISPLAYMATH{len(display_math_blocks)-1}PLACEHOLDER"

    # Save inline math blocks ($...$)
    inline_math_blocks = []

    def save_inline_math(match):
        inline_math_blocks.append(match.group(1))
        return f"INLINEMATH{len(inline_math_blocks)-1}PLACEHOLDER"

    # Replace LaTeX blocks with placeholders. Display math goes first so that
    # its "$$" delimiters are not consumed by the inline pattern.
    content_with_placeholders = re.sub(
        r'\$\$(.*?)\$\$', save_display_math, markdown_content, flags=re.DOTALL
    )
    content_with_placeholders = re.sub(
        r'\$([^\$]+?)\$', save_inline_math, content_with_placeholders
    )

    # Initialize markdown converter with extensions
    md = markdown.Markdown(extensions=['extra'])

    # Convert markdown to HTML
    html_content = md.convert(content_with_placeholders)

    def restore_math(match):
        kind = match.group(1)
        index = int(match.group(2))
        blocks = display_math_blocks if kind == 'DISPLAY' else inline_math_blocks
        if index >= len(blocks):
            # Placeholder-looking text that was never generated here; keep it.
            return match.group(0)
        math = _escape_math_for_html(blocks[index])
        if kind == 'DISPLAY':
            return f"$$\n{math}\n$$"
        return f"${math}$"

    # Restore every placeholder in one pass over the HTML instead of one
    # str.replace scan per formula. The index pattern only accepts the
    # canonical decimal form that the save_* callbacks emit.
    return re.sub(
        r'(DISPLAY|INLINE)MATH(0|[1-9][0-9]*)PLACEHOLDER',
        restore_math,
        html_content,
    )

def generate_content_html(md_path):
    """Generate the HTML content for a markdown file, for use in a content div.

    The post title is taken from the ``title`` front-matter key, falling back
    to the file name without its extension. If the first ``#``-prefixed line
    of the markdown body repeats that title, the line is dropped before
    conversion so the title is not duplicated.

    Args:
        md_path: Path of the markdown file.

    Returns:
        The converted HTML as a string.
    """
    # Parse the markdown file
    front_matter, markdown_content = parse_markdown(md_path)

    # Defensive: tolerate front matter that did not load as a mapping.
    if not isinstance(front_matter, dict):
        front_matter = {}

    # Get title from front matter or use filename. YAML may load the title as
    # a non-string (e.g. ``title: 2024`` is an int) or as None (``title:``),
    # so normalise it to text before comparing.
    title = front_matter.get('title')
    if title is None:
        title = Path(md_path).stem
    title = str(title).strip()

    # Remove duplicate title from markdown content if it exists: look at the
    # first line starting with '#' and compare its text with the title.
    title_pattern = re.compile(r'^#\s*(.*?)\s*$', re.MULTILINE)
    first_heading_match = title_pattern.search(markdown_content)

    if first_heading_match and first_heading_match.group(1).strip() == title:
        # Cut out exactly the matched heading, then trim surrounding whitespace.
        start, end = first_heading_match.span()
        markdown_content = (markdown_content[:start] + markdown_content[end:]).strip()

    # Convert markdown to HTML
    return convert_markdown_to_html(markdown_content)

def main(md_path, out_path):
    """Render the markdown file at md_path and write the HTML to out_path.

    The output file is written as UTF-8, and a one-line confirmation naming
    both paths is printed afterwards.
    """
    rendered = generate_content_html(md_path)

    with open(out_path, mode='w', encoding='utf-8') as sink:
        sink.write(rendered)

    print(f"Generated HTML content from {md_path}: {out_path}")

if __name__ == '__main__':
    # Exactly two positional arguments are expected: input and output paths.
    if len(sys.argv) != 3:
        print("Usage: python generate_post.py <markdown.md> <output.html>")
        sys.exit(1)

    _, md_path, out_path = sys.argv

    # A bare filename that is not found as given is retried inside the
    # "posts" directory; the original path is kept if that does not exist
    # either.
    if not (os.path.exists(md_path) or os.path.dirname(md_path)):
        posts_path = os.path.join("posts", md_path)
        if os.path.exists(posts_path):
            md_path = posts_path

    main(md_path, out_path)