Fix ctx.actions.write to write as ISO 8859-1 so it is symetric with BUILD and .bzl input parsing.
Internally BUILD and .bzl files are read as uninterpreted raw byte strings and
not as UTF-8, so we should write content back out the same way. For example,
if a BUILD file contains:
```
...
notice = "Copyright © 2019 Acme LLC",
```
Bazel will store the copyright symbol as two distinct octets ([0xc2, 0xa9]).
When writing that with an action, the same two octets should be emitted.
The behavior fixed by this change is to not interpret each character as
a standalone Unicode code point to be encoded as UTF-8.
While this change is appropriate for strings read from BUILD files, there is still a discrepancy in handling file paths with non Latin1 characters on Windows file systems. That is described in https://github.com/bazelbuild/bazel/issues/11602.
While this change does produce an observable behavior change, I am considering it a bug fix rather than an incompatible change because the previous behavior was both wrong and also (as described in #11602) inconsistent across different OSes.
Closes https://github.com/bazelbuild/bazel/issues/10174
RELNOTES:
Fix behavior of ctx.actions.write so content is written without an incorrect encoding to UTF-8.
See https://github.com/bazelbuild/bazel/issues/10174 for details.
PiperOrigin-RevId: 318056290
diff --git a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
index 5cb26b4..15aa1e8 100644
--- a/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
+++ b/src/main/java/com/google/devtools/build/lib/analysis/actions/FileWriteAction.java
@@ -14,7 +14,7 @@
package com.google.devtools.build.lib.analysis.actions;
-import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
import com.google.common.annotations.VisibleForTesting;
import com.google.devtools.build.lib.actions.ActionExecutionContext;
@@ -39,7 +39,12 @@
/**
* Action to write a file whose contents are known at analysis time.
*
- * <p>The output is always UTF-8 encoded.
+ * <p>The output file is generally encoded as UTF-8, but by an unusual path. BUILD files and
+ * directory entries, which are actually UTF-8, are misinterpreted by Bazel as Latin1, so that most
+ * Strings within the build language use this unusual representation. FileWriteAction writes those
+ * Strings out again as Latin1.
+ *
+ * <p>TODO(b/146554973): Change this implementation when that is addressed.
*
* <p>TODO(bazel-team): Choose a better name to distinguish this class from {@link
* BinaryFileWriteAction}.
@@ -171,7 +176,7 @@
final int uncompressedSize;
CompressedString(String chars) {
- byte[] dataToCompress = chars.getBytes(UTF_8);
+ byte[] dataToCompress = chars.getBytes(ISO_8859_1);
ByteArrayOutputStream byteStream = new ByteArrayOutputStream(dataToCompress.length);
try (GZIPOutputStream zipStream = new GZIPOutputStream(byteStream)) {
zipStream.write(dataToCompress);
@@ -202,7 +207,7 @@
// This should be impossible since we're reading from a byte array.
throw new RuntimeException(e);
}
- return new String(uncompressedBytes, UTF_8);
+ return new String(uncompressedBytes, ISO_8859_1);
}
}
@@ -216,6 +221,11 @@
*
* <p>Note that if the string is lazily computed or compressed, calling this method will force its
* computation or decompression. No attempt is made by FileWriteAction to cache the result.
+ *
+ * <p>Note that the content is a not a normal Java String. When Bazel parses BUILD files, it
+ * misinterprets the bytes as Latin1, so a code point with a 3-byte UTF-8 encoding will take 3
+ * chars internally. To reverse this process, you must encode this string as Latin1, giving you
+ * back the correct UTF-8 encoding of the original input.
*/
public String getFileContents() {
return fileContents.toString();
@@ -235,7 +245,7 @@
return new DeterministicWriter() {
@Override
public void writeOutputFile(OutputStream out) throws IOException {
- out.write(getFileContents().getBytes(UTF_8));
+ out.write(getFileContents().getBytes(ISO_8859_1));
}
};
}
diff --git a/src/test/shell/integration/loading_phase_test.sh b/src/test/shell/integration/loading_phase_test.sh
index f9fd9e7..9c33363 100755
--- a/src/test/shell/integration/loading_phase_test.sh
+++ b/src/test/shell/integration/loading_phase_test.sh
@@ -498,4 +498,96 @@
done
}
+# Test that actions.write correctly emits a UTF-8 encoded attribute value as
+# UTF-8.
+function test_actions_write_utf8_attribute() {
+ local -r pkg="${FUNCNAME}"
+ mkdir -p "$pkg" || fail "could not create \"$pkg\""
+
+ cat >"${pkg}/def.bzl" <<'EOF'
+def _write_attribute_impl(ctx):
+ ctx.actions.write(
+ output = ctx.outputs.out,
+ # adding a NL at the end to make the diff below easier
+ content = ctx.attr.text + '\n',
+ )
+ return []
+
+write_attribute = rule(
+ implementation = _write_attribute_impl,
+ attrs = {
+ "text": attr.string(),
+ "out": attr.output(),
+ },
+)
+EOF
+
+ cat >"${pkg}/BUILD" <<'EOF'
+load(":def.bzl", "write_attribute")
+write_attribute(
+ name = "text_with_non_latin1_chars",
+ # U+41, U+2117, U+4E16, U+1F63F (1,2,3,4-byte UTF-8 encodings), 10 bytes.
+ text = "A©世😿",
+ out = "out",
+)
+EOF
+ bazel build "${pkg}:text_with_non_latin1_chars" || fail "Expected build to succeed"
+ diff $(bazel info "${PRODUCT_NAME}-bin")/$pkg/out <(echo 'A©世😿') || fail 'diff failed'
+}
+
+# Test that actions.write emits a file name containing non-Latin1 characters as
+# a UTF-8 encoded string.
+function test_actions_write_not_latin1_path() {
+ # TODO(https://github.com/bazelbuild/bazel/issues/11602): Enable after that is fixed.
+ if $is_windows ; then
+ echo 'Skipping test_actions_write_not_latin1_path on Windows. See #11602'
+ return
+ fi
+
+ local -r pkg="${FUNCNAME}"
+ mkdir -p "$pkg" || fail "could not create \"$pkg\""
+
+ filename='A©世😿.file' # see above for an explanation.
+ echo hello >"${pkg}/${filename}"
+
+ cat >"${pkg}/def.bzl" <<'EOF'
+def _write_paths_impl(ctx):
+ # srcs is a list, but we only expect one entry.
+ if len(ctx.attr.srcs) != 1:
+ fail('expected exactly 1 file for srcs. got %d' % len(ctx.attr.srcs))
+ file_name = ctx.attr.srcs[0].label.name
+ ctx.actions.write(
+ output = ctx.outputs.out,
+ content = file_name,
+ )
+ return []
+
+write_paths = rule(
+ implementation = _write_paths_impl,
+ attrs = {
+ "srcs": attr.label_list(allow_files=True),
+ "out": attr.output(),
+ },
+)
+EOF
+
+ cat >"${pkg}/BUILD" <<'EOF'
+load(":def.bzl", "write_paths")
+write_paths(
+ name = "path_with_non_latin1",
+ # Use a glob to ensure that the value is read from the file system and not
+ # out of BUILD.
+ srcs = glob(["*.file"]),
+ out = "paths.txt",
+)
+EOF
+
+ bazel build "${pkg}:path_with_non_latin1" >output 2>&1 || (
+ echo '== build output'
+ cat output
+ fail "Expected build to succeed"
+ )
+ assert_contains "^${filename}$" $(bazel info "${PRODUCT_NAME}-bin")/$pkg/paths.txt
+}
+
run_suite "Integration tests of ${PRODUCT_NAME} using loading/analysis phases."