| # Lint as: python3 |
| # pylint: disable=g-direct-third-party-import |
| # Copyright 2026 The Bazel Authors. All rights reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| """A tool for converting .html/.md(x) docs to valid .mdx files.""" |
| |
| import os |
| import re |
| import sys |
| |
| from absl import app |
| from absl import flags |
| import markdownify |
| |
| |
# Shared absl flag registry; the values are read in main().
FLAGS = flags.FLAGS

# Both directories default to None and are marked as required below, so the
# tool refuses to run unless the caller supplies them explicitly.
flags.DEFINE_string(
    name="in_dir",
    default=None,
    help=(
        "Absolute path of the input directory (where .html and .md(x) files "
        "should be read from)."
    ),
)
flags.DEFINE_string(
    name="out_dir",
    default=None,
    help=(
        "Absolute path of the output directory (where .mdx files should be"
        " written to)."
    ),
)
flags.mark_flags_as_required(["in_dir", "out_dir"])
| |
| |
| _HEADING_RE = re.compile(r"^# (.+)$", re.MULTILINE) |
| _TEMPLATE_RE = re.compile(r"^\{%.+$\n", re.MULTILINE) |
| _TAG_RE = re.compile(r"\s?\{:[^}]+\}") |
| _HTML_LINK_RE = re.compile(r"\]\(([^)]+)\.html") |
| _METADATA_PATTERN = re.compile( |
| "^((Project|Book):.+\n)", re.MULTILINE |
| ) |
| _TITLE_RE = re.compile(r"^title: '", re.MULTILINE) |
| _HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL) |
| _ANGLE_BRACKET_LINK_RE = re.compile(r"<(https?://[^>]+)>") |
| _HTML_PRE_PATTERN = re.compile(r"(?:<pre>)(.*?)(?:</pre>)") |
| _HTML_STYLE_PATTERN = re.compile(r"^</?style>", re.MULTILINE) |
| _MD_FRONT_MATTER_PATTERN = re.compile(r"^---", re.MULTILINE) |
| |
| # Across code blocks and similar pre-formatted blocks, these |
| # characters must be converted to HTML entities so they don't |
| # look like JavaScript blocks. |
| _REPLACED_JS_CHARACTERS = { |
| "{": "{", |
| "}": "}", |
| } |
| |
| # Inside code blocks, these characters need to be converted |
| # to HTML entities to prevent parser errors. |
| _REPLACED_CODE_CHARACTERS = { |
| "<": "<", |
| ">": ">", |
| **_REPLACED_JS_CHARACTERS, |
| } |
| |
| |
| def _escape_chars(text, replacements): |
| """Escapes characters in a string. |
| |
| Args: |
| text: str; string that needs characters escaped. |
| replacements: dict[str, str]; a dictionary mapping characters to escape with |
| their replacements. |
| |
| Returns: |
| The escaped version of `text`. |
| """ |
| for c in replacements.keys(): |
| text = text.replace(c, replacements[c]) |
| return text |
| |
| |
class AcornSafeMarkdownConverter(markdownify.MarkdownConverter):
  """Custom converter that produces Acorn-parsable MDX output."""

  def convert_code(self, node, text, parent_tags):
    """Escape sensitive characters in code blocks so they're not mishandled.

    Args:
      node: the element being converted; forwarded to the base class.
      text: str; the already converted content of the element.
      parent_tags: forwarded to the base class unchanged.

    Returns:
      The base class's conversion result with every character listed in
      `_REPLACED_CODE_CHARACTERS` replaced.
    """
    text = super().convert_code(node, text, parent_tags)
    return _escape_chars(text, _REPLACED_CODE_CHARACTERS)

  def escape(self, text, parent_tags):
    """Custom escape handling.

    Args:
      text: str; text to escape. Falsy values are returned untouched.
      parent_tags: forwarded to the base class unchanged.

    Returns:
      The base class's escaped text, with underscores inside words unescaped
      again and every character listed in `_REPLACED_CODE_CHARACTERS`
      replaced.
    """
    if not text:
      return text
    escaped = super().escape(text, parent_tags)

    # Unescape underscores that are in the middle of words. Lookarounds are
    # used so that the neighbouring word characters are not consumed by the
    # match; otherwise the second underscore of "a\_b\_c" would stay escaped
    # because "b" was already part of the previous match.
    escaped = re.sub(r"(?<=\w)\\_(?=\w)", "_", escaped)
    return _escape_chars(escaped, _REPLACED_CODE_CHARACTERS)
| |
| |
def _convert_directory(root_dir, mdx_dir):
  """Walks `root_dir` and writes an .mdx counterpart of each document.

  The directory layout below `root_dir` is recreated below `mdx_dir`. Files
  whose extension is not .html, .md or .mdx are skipped.

  Args:
    root_dir: str; full path of the directory with .html/.md(x) files (input).
    mdx_dir: str; full path of the directory where .mdx files should be
      created (output).
  """
  convertible = (".html", ".md", ".mdx")

  for src_dir, _, filenames in os.walk(root_dir):
    # Mirror the current sub-directory in the output tree, even when it
    # contains nothing to convert.
    out_dir = os.path.join(mdx_dir, os.path.relpath(src_dir, start=root_dir))
    os.makedirs(out_dir, exist_ok=True)

    for filename in filenames:
      stem, suffix = os.path.splitext(filename)
      if suffix in convertible:
        _convert_file(
            os.path.join(src_dir, filename),
            os.path.join(out_dir, f"{stem}.mdx"),
        )
| |
| |
def _convert_file(src, dest):
  """Reads `src`, converts its content and writes the result to `dest`.

  Both files are handled as UTF-8 so that the result does not depend on the
  locale of the machine running the tool.

  Args:
    src: str; path of the .html/.md(x) file to read.
    dest: str; path of the .mdx file to write.
  """
  with open(src, "rt", encoding="utf-8") as f:
    content = f.read()

  # Convert before opening the destination: opening with "w" truncates the
  # file, so a failing conversion would otherwise leave an empty file behind.
  converted = _transform(src, content)

  with open(dest, "wt", encoding="utf-8") as f:
    f.write(converted)
| |
| |
def _transform(path, content):
  """Runs the full conversion pipeline on the content of one file.

  Args:
    path: str; path of the source file. Only its extension is inspected, to
      decide whether the HTML -> Markdown step is needed.
    content: str; raw content of the source file.

  Returns:
    The converted content.
  """
  cleaned = _pre_markdown_transforms(content)
  if path.endswith(".html"):
    cleaned = _html2md(cleaned)
  return _post_markdown_transforms(cleaned)
| |
| |
def _html2md(content):
  """Converts HTML to Markdown with ATX-style headings.

  Args:
    content: str; HTML source.

  Returns:
    The output of `AcornSafeMarkdownConverter` for `content`.
  """
  converter = AcornSafeMarkdownConverter(heading_style="ATX")
  return converter.convert(content)
| |
| |
def _pre_markdown_transforms(content):
  """Transforms applied to all sources before any markdown conversion.

  In order: "{: ...}" tags and HTML comments are removed, up to two
  "Project:"/"Book:" lines are dropped and leading whitespace is stripped,
  lines starting with "{%" are removed, and finally the braces inside <pre>
  blocks are escaped.

  Args:
    content: str; content of an HTML or .md file.

  Returns:
    The file with invalid content removed.
  """
  no_tags = _TAG_RE.sub("", content)
  no_comments = _HTML_COMMENT_RE.sub("", no_tags)
  # Remove Project: and Book: lines
  no_metadata = _METADATA_PATTERN.sub("", no_comments, count=2).lstrip()
  no_templates = _TEMPLATE_RE.sub("", no_metadata)
  # DOTALL must be passed as `flags`. The third positional parameter of
  # Pattern.sub() is `count`, so passing re.DOTALL there silently limited the
  # number of replacements to 16 and never enabled multi-line matching.
  return re.sub(
      _HTML_PRE_PATTERN.pattern,
      _escape_chars_in_pre_blocks,
      no_templates,
      flags=re.DOTALL,
  )
| |
| |
def _post_markdown_transforms(content):
  """Transforms applied to all sources after any markdown conversion.

  In order: ".html" is dropped from link targets (see `_fix_link`), angle
  brackets around bare URLs are removed, trailing whitespace is stripped,
  runs of blank lines are collapsed to a single blank line, the first "# "
  heading becomes a front matter title unless a title line already exists,
  text before the front matter is dropped and style sections are removed.

  Args:
    content: str; content of a converted .mdx file.

  Returns:
    The content as fully valid .mdx.
  """
  no_html_links = _HTML_LINK_RE.sub(_fix_link, content)
  no_angle_links = _ANGLE_BRACKET_LINK_RE.sub(r"\1", no_html_links)
  # Strip trailing whitespace first: whitespace-only lines turn into empty
  # lines, which the next step can then collapse as well.
  no_trailing_whitespaces = _remove_trailing_whitespaces(no_angle_links)
  # Collapse any run of three or more newlines. A single
  # str.replace("\n\n\n", "\n\n") pass is not enough because it leaves longer
  # runs only partially collapsed (e.g. four newlines become three).
  no_double_empty_lines = re.sub(r"\n{3,}", "\n\n", no_trailing_whitespaces)
  fixed_headings = (
      no_double_empty_lines
      if _TITLE_RE.search(no_double_empty_lines)
      else _HEADING_RE.sub(_fix_title_heading, no_double_empty_lines, count=1)
  )
  front_matter_first = _remove_anything_before_front_matter(fixed_headings)
  return _remove_style_sections(front_matter_first)
| |
| |
| def _remove_trailing_whitespaces(content): |
| lines = (l.rstrip() for l in content.split("\n")) |
| return "\n".join(lines) |
| |
| |
| def _fix_title_heading(m): |
| title = m.group(1).replace("'", "\\'") |
| return f"---\ntitle: '{title}'\n---" |
| |
| |
def _remove_anything_before_front_matter(content):
  """Drops whatever precedes the first line that starts with "---".

  Args:
    content: str; content of a converted file.

  Returns:
    `content` starting at the first "---" line, or `content` unchanged when
    it already starts with "---\\n" or contains no such line.
  """
  if content.startswith("---\n"):
    return content

  pieces = _MD_FRONT_MATTER_PATTERN.split(content, maxsplit=1)
  if len(pieces) > 1:
    # The delimiter itself is consumed by split(), so put it back.
    return f"---{pieces[1]}"

  # Technically this only affects files that we need for the old site,
  # so the better solution would be to stop generating them.
  return pieces[0]
| |
| |
def _remove_style_sections(content):
  """Removes every <style>...</style> section from `content`.

  Only tags located at the beginning of a line are recognised. The text that
  follows a removed section has its leading whitespace stripped.

  Args:
    content: str; content of a converted file.

  Returns:
    `content` without its style sections. The input is returned unchanged when
    it has no style tag, or when the number of tags is odd: in that case it is
    unclear which part is style content, so nothing is removed.
  """
  # Splitting on the tags yields alternating chunks: outside, inside,
  # outside, ... A balanced document therefore has an odd number of chunks.
  parts = _HTML_STYLE_PATTERN.split(content)
  if len(parts) == 1 or len(parts) % 2 == 0:
    return content

  kept = [parts[0]]
  kept.extend(outside.lstrip() for outside in parts[2::2])
  return "".join(kept)
| |
| |
def _escape_chars_in_pre_blocks(matches):
  """Rebuilds a matched <pre> block with its curly braces escaped.

  Only the characters of `_REPLACED_JS_CHARACTERS` are replaced. < and > are
  left alone on purpose, because some <pre> blocks contain valid HTML
  elements (e.g. links).

  Args:
    matches: re.Match; an object matching a <pre> block and its content.

  Returns:
    The <pre> block with properly escaped content.
  """
  inner = matches.group(1)
  return "<pre>" + _escape_chars(inner, _REPLACED_JS_CHARACTERS) + "</pre>"
| |
| |
| def _fix_link(m): |
| raw = m.group(1) |
| # Only keep .html extension for external links. |
| if raw.startswith("http://") or raw.startswith("https://"): |
| return m.group(0) |
| |
| return f"]({raw}" |
| |
| |
| def _fail(msg): |
| print(msg, file=sys.stderr) |
| exit(1) |
| |
| |
def main(unused_argv):
  """Validates the directory flags and converts the input tree.

  Args:
    unused_argv: positional command-line arguments; ignored.
  """
  # The input directory is checked first; _fail() terminates the process, so
  # at most one error is reported.
  for directory in (FLAGS.in_dir, FLAGS.out_dir):
    if not os.path.isdir(directory):
      _fail(f"{directory} is not a directory")

  _convert_directory(FLAGS.in_dir, FLAGS.out_dir)
| |
| |
if __name__ == "__main__":
  # app.run() parses sys.argv into FLAGS itself before calling main(). Parsing
  # explicitly beforehand is redundant and bypasses absl's usage-error
  # handling, so a bad or missing flag would surface as a raw exception.
  app.run(main)