Avoid Perl dependency in test-setup.sh
Fixes #4691.
PiperOrigin-RevId: 230308181
diff --git a/tools/test/generate-xml.sh b/tools/test/generate-xml.sh
old mode 100644
new mode 100755
index 3a3de60..cb292ec
--- a/tools/test/generate-xml.sh
+++ b/tools/test/generate-xml.sh
@@ -22,15 +22,83 @@
EXIT_CODE="$4"
# Keep this in sync with test-setup.sh!
+function encode_stream {
+ # Reads stdin and writes stdout, replacing invalid XML characters and the sequence that would end a CDATA block.
+ # We do this in four steps:
+ #
+ # 1. Add a single whitespace character to the end of every line
+ #
+ # 2. Replace every sequence of legal characters followed by an illegal
+ # character *or* followed by a legal character at the end of the line with
+ # the same sequence of legal characters followed by a question mark
+ # character (replacing the illegal or last character). Since this will
+ # always replace the last character in a line with a question mark, we
+ # make sure to append a whitespace in step #1.
+ #
+ # A character is legal if it is a valid UTF-8 character that is allowed in
+ # an XML file (this excludes a few control codes, but otherwise allows
+ # most UTF-8 characters).
+ #
+ # We can't use sed in UTF-8 mode, because it would fail on the first
+ # illegal character. Instead, we have to match legal characters by their
+ # 8-bit binary sequences, and also switch sed to an 8-bit mode.
+ #
+ # The legal UTF codepoint ranges are 9,a,d,20-d7ff,e000-fffd,10000-10ffff,
+ # which results in the following 8-bit binary UTF-8 matchers:
+ # [\x9\xa\xd\x20-\x7f] <--- (9,A,D,20-7F)
+ # [\xc0-\xdf][\x80-\xbf] <--- (0080-07FF)
+ # [\xe0-\xec][\x80-\xbf][\x80-\xbf] <--- (0800-CFFF)
+ # [\xed][\x80-\x9f][\x80-\xbf] <--- (D000-D7FF)
+ # [\xee][\x80-\xbf][\x80-\xbf] <--- (E000-EFFF)
+ # [\xef][\x80-\xbe][\x80-\xbf] <--- (F000-FFBF)
+ # [\xef][\xbf][\x80-\xbd] <--- (FFC0-FFFD)
+ # [\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf] <--- (010000-10FFFF)
+ # NOTE(review): the expression below differs from this table: it accepts all of [\xee-\xef][\x80-\xbf][\x80-\xbf] (FFFE and FFFF included), and its only 4-byte alternative is [\xf0][\x80-\x8f][\x80-\xbf][\x80-\xbf], so well-formed encodings of 010000-10FFFF (lead byte \xf0 with second byte \x90-\xbf, or lead bytes \xf1-\xf4) do not match and are replaced; confirm whether this is intended.
+ # We omit \xa below since sed already splits the input into lines. \xd is omitted as well, so carriage returns are replaced like illegal characters.
+ #
+ # 3. Remove the last character in the line, which we expect to be a
+ # question mark (that was originally added as a whitespace in step #1).
+ #
+ # 4. Replace the string ']]>' with ']]>]]<![CDATA[>' to prevent escaping the
+ # surrounding CDATA block.
+ #
+ # Sed supports the necessary operations as of version 4.4, but not in all
+ # earlier versions. Specifically, we have found that sed 4.1.5 is not 8-bit
+ # safe even when set to an 8-bit locale.
+ #
+ # OSX sed does not support escape sequences (\xhh), use echo as workaround.
+ #
+ # Alternatives considered:
+ # Perl - We originally used Perl, but wanted to avoid the dependency.
+ # Recent versions of Perl now error on invalid utf-8 characters.
+ # tr - tr only replaces single-byte sequences, so cannot handle utf-8.
+ LC_ALL=C sed -E \
+ -e 's/.*/& /g' \
+ -e 's/(('\
+"$(echo -e '[\x9\x20-\x7f]')|"\
+"$(echo -e '[\xc0-\xdf][\x80-\xbf]')|"\
+"$(echo -e '[\xe0-\xec][\x80-\xbf][\x80-\xbf]')|"\
+"$(echo -e '[\xed][\x80-\x9f][\x80-\xbf]')|"\
+"$(echo -e '[\xee-\xef][\x80-\xbf][\x80-\xbf]')|"\
+"$(echo -e '[\xf0][\x80-\x8f][\x80-\xbf][\x80-\xbf]')"\
+')*)./\1?/g' \
+ -e 's/(.*)\?/\1/g' \
+ -e 's|]]>|]]>]]<![CDATA[>|g'
+}
+
function encode_as_xml {
- if [[ -f "$1" ]]; then
- # Replace invalid XML characters and invalid sequence in CDATA
- # cf. https://stackoverflow.com/a/7774512/4717701
- perl -CSDA -pe's/[^\x9\xA\xD\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/?/g;' "$1" \
- | sed 's|]]>|]]>]]<![CDATA[>|g'
+ if [ -f "$1" ]; then
+ cat "$1" | encode_stream
fi
}
+# For testing: when TEST_LOG is "-", we only perform the encoding step (stdin to
+# stdout) and exit with status 0. We intentionally ignore the rest of the parameters.
+if [ "$TEST_LOG" == "-" ]; then
+ encode_stream
+ exit 0
+fi
+
test_name="${TEST_BINARY#./}"
errors=0
error_msg=""
@@ -46,14 +114,17 @@
fi
FAILED=0
-ENCODED_LOG="$(encode_as_xml "${TEST_LOG}")" || FAILED=1
-cat <<EOF >${XML_OUTPUT_FILE}
+ENCODED_LOG="$(encode_as_xml "${TEST_LOG}")" || FAILED=$?
+cat >"${XML_OUTPUT_FILE}" <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<testsuites>
-<testsuite name="${test_name}" tests="1" failures="0" errors="${errors}">
- <testcase name="${test_name}" status="run" duration="${DURATION_IN_SECONDS}" time="${DURATION_IN_SECONDS}">${error_msg}</testcase>
- <system-out><![CDATA[${ENCODED_LOG}]]></system-out>
-</testsuite>
+ <testsuite name="${test_name}" tests="1" failures="0" errors="${errors}">
+ <testcase name="${test_name}" status="run" duration="${DURATION_IN_SECONDS}" time="${DURATION_IN_SECONDS}">${error_msg}</testcase>
+ <system-out>
+Generated test.log (if the file is not UTF-8, then this may be unreadable):
+<![CDATA[${ENCODED_LOG}]]>
+ </system-out>
+ </testsuite>
</testsuites>
EOF
exit "$FAILED"