scripts/docs/docs2mdx.py - bazel - Git at Google

 # Lint as: python3
 # pylint: disable=g-direct-third-party-import
 # Copyright 2026 The Bazel Authors. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """A tool for converting .html/.md(x) docs to valid .mdx files."""

 import os
 import re
 import sys

 from absl import app
 from absl import flags
 import markdownify


 FLAGS = flags.FLAGS

 # Command-line interface: two mandatory string flags naming the directory to
 # read sources from and the directory to write converted files to.
 flags.DEFINE_string(
     name="in_dir",
     default=None,
     help=(
         "Absolute path of the input directory (where .html and .md(x) files "
         "should be read from)."
     ),
 )
 flags.DEFINE_string(
     name="out_dir",
     default=None,
     help=(
         "Absolute path of the output directory (where .mdx files should be"
         " written to)."
     ),
 )
 flags.mark_flag_as_required("in_dir")
 flags.mark_flag_as_required("out_dir")


 # A line consisting of "# " followed by text; group 1 is the heading text.
 _HEADING_RE = re.compile(r"^# (.+)$", re.MULTILINE)
 # A whole line (including its newline) that starts with "{%".
 _TEMPLATE_RE = re.compile(r"^\{%.+$\n", re.MULTILINE)
 # A "{: ...}" tag together with one optional preceding whitespace character.
 _TAG_RE = re.compile(r"\s?\{:[^}]+\}")
 # The "](target.html" part of a Markdown link; group 1 is the target without
 # the ".html" extension.
 _HTML_LINK_RE = re.compile(r"\]\(([^)]+)\.html")
 # A newline-terminated line that starts with "Project:" or "Book:".
 _METADATA_PATTERN = re.compile(
     "^((Project|Book):.+\n)", re.MULTILINE
 )
 # The start of a single-quoted "title:" entry at the beginning of a line.
 _TITLE_RE = re.compile(r"^title: '", re.MULTILINE)
 # An HTML comment; DOTALL lets it span several lines.
 _HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
 # An http(s) URL wrapped in angle brackets; group 1 is the bare URL.
 _ANGLE_BRACKET_LINK_RE = re.compile(r"<(https?://[^>]+)>")
 # A <pre>...</pre> block; group 1 is the text between the tags. Compiled
 # without re.DOTALL, so as written "." does not match newlines.
 _HTML_PRE_PATTERN = re.compile(r"(?:<pre>)(.*?)(?:</pre>)")
 # An opening or closing style tag at the beginning of a line.
 _HTML_STYLE_PATTERN = re.compile(r"^</?style>", re.MULTILINE)
 # Three dashes at the beginning of a line (front matter delimiter).
 _MD_FRONT_MATTER_PATTERN = re.compile(r"^---", re.MULTILINE)

 # Across code blocks and similar pre-formatted blocks, these
 # characters must be converted to HTML entities so they don't
 # look like JavaScript blocks.
 _REPLACED_JS_CHARACTERS = {
     "{": "&lcub;",
     "}": "&rcub;",
 }

 # Inside code blocks, these characters need to be converted
 # to HTML entities to prevent parser errors.
 _REPLACED_CODE_CHARACTERS = {
     "<": "&lt;",
     ">": "&gt;",
     **_REPLACED_JS_CHARACTERS,
 }


 def _escape_chars(text, replacements):
   """Escapes characters in a string.

   Args:
     text: str; string that needs characters escaped.
     replacements: dict[str, str]; a dictionary mapping characters to escape with
       their replacements.

   Returns:
     The escaped version of `text`.
   """
   for c in replacements.keys():
     text = text.replace(c, replacements[c])
   return text


 class AcornSafeMarkdownConverter(markdownify.MarkdownConverter):
   """Custom converter that produces Acorn-parsable MDX output.

   On top of the inherited conversion, "<", ">", "{" and "}" are written as
   HTML entities both in code and in regular (escaped) text.
   """

   def convert_code(self, node, text, parent_tags):
     """Escape sensitive characters in code blocks so they're not mishandled."""
     text = super().convert_code(node, text, parent_tags)
     return _escape_chars(text, _REPLACED_CODE_CHARACTERS)

   def escape(self, text, parent_tags):
     """Custom escape handling.

     Applies the inherited escaping, restores underscores that sit between two
     word characters, and finally replaces the characters listed in
     `_REPLACED_CODE_CHARACTERS` with their entities.

     Args:
       text: str; the text to escape. Empty or None values are returned as is.
       parent_tags: passed through unchanged to the parent implementation.

     Returns:
       The escaped text.
     """
     if not text:
       return text
     escaped = super().escape(text, parent_tags)

     # Unescape underscores that are in the middle of words. Lookarounds keep
     # the neighbouring characters out of the match, so every underscore in
     # e.g. a\_b\_c is handled; a pattern that consumes its neighbours skips
     # the second underscore whenever a single character separates the two.
     escaped = re.sub(r"(?<=\w)\\_(?=\w)", "_", escaped)
     return _escape_chars(escaped, _REPLACED_CODE_CHARACTERS)


 def _convert_directory(root_dir, mdx_dir):
   """Mirrors a docs tree, writing one .mdx file per convertible source file.

   Every directory found below `root_dir` is recreated under `mdx_dir`. Each
   .html, .md or .mdx file is converted into an .mdx file with the same base
   name at the corresponding location; other files are ignored.

   Args:
       root_dir: str; full path of the directory with .html/.md(x) files (input).
       mdx_dir: str; full path of the directory where .mdx files should be
         created (output).
   """
   convertible = (".html", ".md", ".mdx")
   for walk_dir, _, names in os.walk(root_dir):
     out_dir = os.path.join(mdx_dir, os.path.relpath(walk_dir, start=root_dir))
     os.makedirs(out_dir, exist_ok=True)

     for name in names:
       stem, suffix = os.path.splitext(name)
       if suffix in convertible:
         _convert_file(
             os.path.join(walk_dir, name),
             os.path.join(out_dir, f"{stem}.mdx"),
         )


 def _convert_file(src, dest):
   """Converts a single source file and writes the result to `dest`.

   Both files are handled as UTF-8 regardless of the platform's locale. The
   destination is only opened once the conversion has succeeded, so a failing
   transform cannot leave a truncated output file behind.

   Args:
     src: str; path of the .html/.md(x) file to read.
     dest: str; path of the .mdx file to write.
   """
   with open(src, "rt", encoding="utf-8") as f:
     converted = _transform(src, f.read())

   with open(dest, "wt", encoding="utf-8") as f:
     f.write(converted)


 def _transform(path, content):
   """Runs the full conversion pipeline on the text of one file.

   Args:
     path: str; path of the source file; only its ".html" suffix is inspected.
     content: str; raw text of the source file.

   Returns:
     The text after the pre-markdown transforms, the HTML-to-Markdown
     conversion (for .html sources only) and the post-markdown transforms.
   """
   cleaned = _pre_markdown_transforms(content)
   if path.endswith(".html"):
     cleaned = _html2md(cleaned)
   return _post_markdown_transforms(cleaned)


 def _html2md(content):
   """Converts HTML to Markdown with the ATX heading style."""
   converter = AcornSafeMarkdownConverter(heading_style="ATX")
   return converter.convert(content)


 def _pre_markdown_transforms(content):
   """Transforms applied to all sources before any markdown conversion.

   In order: removes "{: ...}" tags, HTML comments, up to two "Project:" /
   "Book:" metadata lines (followed by stripping leading whitespace) and lines
   starting with "{%", then escapes curly braces inside <pre> blocks.

   Args:
     content: str; content of an HTML or .md file.

   Returns:
     The file with invalid content removed.
   """
   no_tags = _TAG_RE.sub("", content)
   no_comments = _HTML_COMMENT_RE.sub("", no_tags)
   # Remove Project: and Book: lines
   no_metadata = _METADATA_PATTERN.sub("", no_comments, count=2).lstrip()
   no_templates = _TEMPLATE_RE.sub("", no_metadata)

   # The flags of a compiled pattern are fixed at compile time, and the third
   # positional argument of Pattern.sub() is `count`: passing re.DOTALL there
   # only caps the number of substitutions. Recompile with DOTALL instead so
   # that <pre> blocks spanning several lines are matched as well.
   pre_block_re = re.compile(
       _HTML_PRE_PATTERN.pattern, _HTML_PRE_PATTERN.flags | re.DOTALL
   )
   return pre_block_re.sub(_escape_chars_in_pre_blocks, no_templates)


 def _post_markdown_transforms(content):
   """Transforms applied to all sources after any markdown conversion.

   In order: drops the ".html" extension from internal links, unwraps
   angle-bracket links, strips trailing whitespace, collapses runs of blank
   lines into a single blank line, turns the first "# " heading into front
   matter (unless a title entry already exists), removes anything preceding
   the front matter and removes style sections.

   Args:
     content: str; content of a converted .mdx file.

   Returns:
     The content as fully valid .mdx.
   """
   no_html_links = _HTML_LINK_RE.sub(_fix_link, content)
   no_angle_links = _ANGLE_BRACKET_LINK_RE.sub(r"\1", no_html_links)
   # Strip trailing whitespace first: whitespace-only lines become empty and
   # therefore have to count as blank lines in the next step.
   no_trailing_whitespaces = _remove_trailing_whitespaces(no_angle_links)
   # Collapse every run of blank lines in one go; a single str.replace() pass
   # leaves runs of four or more newlines only partially reduced.
   no_double_empty_lines = re.sub(r"\n{3,}", "\n\n", no_trailing_whitespaces)
   fixed_headings = (
       no_double_empty_lines
       if _TITLE_RE.search(no_double_empty_lines)
       else _HEADING_RE.sub(_fix_title_heading, no_double_empty_lines, count=1)
   )
   front_matter_first = _remove_anything_before_front_matter(fixed_headings)
   return _remove_style_sections(front_matter_first)


 def _remove_trailing_whitespaces(content):
   lines = (l.rstrip() for l in content.split("\n"))
   return "\n".join(lines)


 def _fix_title_heading(m):
   title = m.group(1).replace("'", "\\'")
   return f"---\ntitle: '{title}'\n---"


 def _remove_anything_before_front_matter(content):
   """Drops everything that precedes the first line starting with "---".

   Args:
     content: str; text that may contain a front matter block.

   Returns:
     The text from the first line-initial "---" onwards, or `content` itself
     when there is no such line.
   """
   marker = _MD_FRONT_MATTER_PATTERN.search(content)
   if marker is None:
     # Technically this only affects files that we need for the old site,
     # so the better solution would be to stop generating them.
     return content

   return content[marker.start():]


 def _remove_style_sections(content):
   """Removes every style section from the content.

   A section is delimited by style tags (opening or closing) located at the
   beginning of a line. Text following a removed section has its leading
   whitespace stripped. If the last tag has no partner, everything after it is
   treated as belonging to the unterminated section and is dropped.

   Args:
     content: str; text that may contain style sections.

   Returns:
     The text without style tags and without the text enclosed by them.
   """
   parts = _HTML_STYLE_PATTERN.split(content)
   if len(parts) == 1:
     return content

   # Splitting on the tags yields alternating pieces: even indices hold text
   # outside of style sections, odd indices hold the text enclosed by them.
   kept = [parts[0]]
   kept.extend(outside.lstrip() for outside in parts[2::2])
   return "".join(kept)


 def _escape_chars_in_pre_blocks(matches):
   """Rebuilds a matched <pre> block with its curly braces escaped.

   Only the characters listed in `_REPLACED_JS_CHARACTERS` are replaced; < and
   > are left alone because some <pre> blocks contain valid HTML elements
   (e.g. links).

   Args:
     matches: re.Match; an object matching a <pre> block and its content.

   Returns:
     The <pre> block with properly escaped content.
   """
   inner = matches.group(1)
   return "<pre>" + _escape_chars(inner, _REPLACED_JS_CHARACTERS) + "</pre>"


 def _fix_link(m):
   raw = m.group(1)
   # Only keep .html extension for external links.
   if raw.startswith("http://") or raw.startswith("https://"):
     return m.group(0)

   return f"]({raw}"


 def _fail(msg):
   print(msg, file=sys.stderr)
   exit(1)


 def main(unused_argv):
   """Checks that both directory flags name directories, then converts.

   Args:
     unused_argv: positional command-line arguments; ignored.
   """
   for directory in (FLAGS.in_dir, FLAGS.out_dir):
     if not os.path.isdir(directory):
       _fail(f"{directory} is not a directory")

   _convert_directory(FLAGS.in_dir, FLAGS.out_dir)


 # Script entry point: only runs when the file is executed directly.
 if __name__ == "__main__":
   # NOTE(review): app.run() is expected to parse the command line itself;
   # confirm that parsing the flags here beforehand is still needed.
   FLAGS(sys.argv)
   app.run(main)
	# Lint as: python3
	# pylint: disable=g-direct-third-party-import
	# Copyright 2026 The Bazel Authors. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""A tool for converting .html/.md(x) docs to valid .mdx files."""

	import os
	import re
	import sys

	from absl import app
	from absl import flags
	import markdownify


	FLAGS = flags.FLAGS

	flags.DEFINE_string(
	"in_dir",
	None,
	"Absolute path of the input directory (where .html and .md(x) files "
	"should be read from).",
	)
	flags.DEFINE_string(
	"out_dir",
	None,
	"Absolute path of the output directory (where .mdx files should be"
	" written to).",
	)
	flags.mark_flag_as_required("in_dir")
	flags.mark_flag_as_required("out_dir")


	_HEADING_RE = re.compile(r"^# (.+)$", re.MULTILINE)
	_TEMPLATE_RE = re.compile(r"^\{%.+$\n", re.MULTILINE)
	_TAG_RE = re.compile(r"\s?\{:[^}]+\}")
	_HTML_LINK_RE = re.compile(r"\]\(([^)]+)\.html")
	_METADATA_PATTERN = re.compile(
	"^((Project\|Book):.+\n)", re.MULTILINE
	)
	_TITLE_RE = re.compile(r"^title: '", re.MULTILINE)
	_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
	_ANGLE_BRACKET_LINK_RE = re.compile(r"<(https?://[^>]+)>")
	_HTML_PRE_PATTERN = re.compile(r"(?:<pre>)(.*?)(?:</pre>)")
	_HTML_STYLE_PATTERN = re.compile(r"^</?style>", re.MULTILINE)
	_MD_FRONT_MATTER_PATTERN = re.compile(r"^---", re.MULTILINE)

	# Across code blocks and similar pre-formatted blocks, these
	# characters must be converted to HTML entities so they don't
	# look like JavaScript blocks.
	_REPLACED_JS_CHARACTERS = {
	"{": "{",
	"}": "}",
	}

	# Inside code blocks, these characters need to be converted
	# to HTML entities to prevent parser errors.
	_REPLACED_CODE_CHARACTERS = {
	"<": "<",
	">": ">",
	**_REPLACED_JS_CHARACTERS,
	}


	def _escape_chars(text, replacements):
	"""Escapes characters in a string.

	Args:
	text: str; string that needs characters escaped.
	replacements: dict[str, str]; a dictionary mapping characters to escape with
	their replacements.

	Returns:
	The escaped version of `text`.
	"""
	for c in replacements.keys():
	text = text.replace(c, replacements[c])
	return text


	class AcornSafeMarkdownConverter(markdownify.MarkdownConverter):
	"""Custom converter that produces Acorn-parsable MDX output."""

	def convert_code(self, node, text, parent_tags):
	"""Escape sensitive characters in code blocks so they're not mishandled."""
	text = super().convert_code(node, text, parent_tags)
	return _escape_chars(text, _REPLACED_CODE_CHARACTERS)

	def escape(self, text, parent_tags):
	"""Custom escape handling."""
	if not text:
	return text
	escaped = super().escape(text, parent_tags)

	# Unescape underscores that are in the middle of words.
	escaped = re.sub(r"(\w)\\_(\w)", r"\1_\2", escaped)
	return _escape_chars(escaped, _REPLACED_CODE_CHARACTERS)


	def _convert_directory(root_dir, mdx_dir):
	"""Converts all .html and .md(x) files to .mdx files.

	Args:
	root_dir: str; full path of the directory with .html/.md(x) files (input).
	mdx_dir: str; full path of the directory where .mdx files should be
	created (output).
	"""
	for curr_dir, _, files in os.walk(root_dir):
	rel = os.path.relpath(curr_dir, start=root_dir)
	dest_dir = os.path.join(mdx_dir, rel)
	os.makedirs(dest_dir, exist_ok=True)

	for fname in files:
	basename, ext = os.path.splitext(fname)
	if ext not in (".html", ".md", ".mdx"):
	continue

	src = os.path.join(curr_dir, fname)
	dest = os.path.join(dest_dir, f"{basename}.mdx")

	_convert_file(src, dest)


	def _convert_file(src, dest):
	with open(src, "rt") as f:
	content = f.read()

	with open(dest, "wt") as f:
	f.write(_transform(src, content))


	def _transform(path, content):
	content = _pre_markdown_transforms(content)
	md = _html2md(content) if path.endswith(".html") else content
	return _post_markdown_transforms(md)


	def _html2md(content):
	return AcornSafeMarkdownConverter(heading_style="ATX").convert(content)


	def _pre_markdown_transforms(content):
	"""Transforms applied to all sources before any markdown conversion.

	Args:
	content: str; content of an HTML or .md file.

	Returns:
	The file with invalid content removed.
	"""
	no_tags = _TAG_RE.sub("", content)
	no_comments = _HTML_COMMENT_RE.sub("", no_tags)
	# Remove Project: and Book: lines
	no_metadata = _METADATA_PATTERN.sub("", no_comments, count=2).lstrip()
	no_templates = _TEMPLATE_RE.sub("", no_metadata)
	return _HTML_PRE_PATTERN.sub(
	_escape_chars_in_pre_blocks,
	no_templates,
	re.DOTALL,
	)


	def _post_markdown_transforms(content):
	"""Transforms applied to all sources after any markdown conversion.

	Args:
	content: str; content of a converted .mdx file.

	Returns:
	The content as fully valid .mdx.
	"""
	no_html_links = _HTML_LINK_RE.sub(_fix_link, content)
	no_angle_links = _ANGLE_BRACKET_LINK_RE.sub(r"\1", no_html_links)
	no_double_empty_lines = no_angle_links.replace("\n\n\n", "\n\n")
	no_trailing_whitespaces = _remove_trailing_whitespaces(no_double_empty_lines)
	fixed_headings = (
	no_trailing_whitespaces
	if _TITLE_RE.search(no_trailing_whitespaces)
	else _HEADING_RE.sub(_fix_title_heading, no_trailing_whitespaces, count=1)
	)
	front_matter_first = _remove_anything_before_front_matter(fixed_headings)
	return _remove_style_sections(front_matter_first)


	def _remove_trailing_whitespaces(content):
	lines = (l.rstrip() for l in content.split("\n"))
	return "\n".join(lines)


	def _fix_title_heading(m):
	title = m.group(1).replace("'", "\\'")
	return f"---\ntitle: '{title}'\n---"


	def _remove_anything_before_front_matter(content):
	if content.startswith("---\n"):
	return content

	parts = _MD_FRONT_MATTER_PATTERN.split(content, maxsplit=1)
	if len(parts) == 1:
	# Technically this only affects files that we need for the old site,
	# so the better solution would be to stop generating them.
	return parts[0]

	return f"---{parts[1]}"


	def _remove_style_sections(content):
	m = _HTML_STYLE_PATTERN.search(content)
	if not m:
	return content

	parts = _HTML_STYLE_PATTERN.split(content)
	return f"{parts[0]}{parts[2].lstrip()}"


	def _escape_chars_in_pre_blocks(matches):
	"""Escapes characters in <pre> blocks that cause mdx parse errors.

	Because some <pre> blocks contain valid HTML elements (e.g. links), < and >
	are not escaped.

	Args:
	matches: re.Match; an object matching a <pre> block and its content.

	Returns:
	The <pre> block with properly escaped content.
	"""
	content = _escape_chars(matches.group(1), _REPLACED_JS_CHARACTERS)
	return f"<pre>{content}</pre>"


	def _fix_link(m):
	raw = m.group(1)
	# Only keep .html extension for external links.
	if raw.startswith("http://") or raw.startswith("https://"):
	return m.group(0)

	return f"]({raw}"


	def _fail(msg):
	print(msg, file=sys.stderr)
	exit(1)


	def main(unused_argv):
	if not os.path.isdir(FLAGS.in_dir):
	_fail(f"{FLAGS.in_dir} is not a directory")
	if not os.path.isdir(FLAGS.out_dir):
	_fail(f"{FLAGS.out_dir} is not a directory")

	_convert_directory(FLAGS.in_dir, FLAGS.out_dir)


	if __name__ == "__main__":
	FLAGS(sys.argv)
	app.run(main)