| // Copyright 2024 The Bazel Authors. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.google.devtools.build.lib.util; |
| |
| import static java.nio.charset.StandardCharsets.ISO_8859_1; |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| |
| import com.google.common.base.Preconditions; |
| import com.google.devtools.build.lib.unsafe.StringUnsafe; |
| import java.lang.reflect.Field; |
| import java.nio.charset.Charset; |
| |
| /** |
| * Utility functions for reencoding strings between Bazel's internal raw byte encoding and regular |
| * Java strings. |
| * |
| * <p>Bazel needs to support the following two setups: |
| * |
| * <ul> |
| * <li>Standard setup: file paths, command-line arguments, environment variables, BUILD and .bzl |
| * files are all encoded in UTF-8, on Linux, macOS or Windows. |
| * <li>Legacy setup: file paths, command-line arguments, environment variables, BUILD and .bzl |
| * files are all encoded in <i>some</i> consistent superset of ASCII, on Linux, with the |
| * en_US.ISO-8859-1 locale available on the host. In particular, this setup allows any byte |
| * sequence to appear in a file path and be referenced in a BUILD file. |
| * </ul> |
| * |
| * <p>Bazel achieves this by forcing an en_US.ISO-8859-1 locale on Unix when available, which due to |
| * the byte-based nature of Unix APIs allows all Java (N)IO functions to treat strings as raw byte |
| * sequences (a Latin-1 character is equivalent to an unconstrained byte value). On macOS, where the |
| * JVM forces UTF-8 encoding for any kind of system interaction, as well as on Windows, where system |
| * APIs are all restricted to valid Unicode strings, Bazel has to reencode strings to Unicode before |
| * passing them to the JVM (and vice versa). Since BUILD and .bzl files are always read into Latin-1 |
| * strings (file encodings are not forced by the JVM) and are assumed to be encoded in UTF-8 (unless |
| * the Latin-1 locale is available), Bazel has to reencode the strings to UTF-8 so that they match |
| * up with the Starlark contents of these files (e.g. file paths mentioned in a BUILD file). |
| * |
| * <p>While allowing the user a great deal of flexibility, this requires great care when {@link |
| * String}s are passed into or out of Bazel via Java standard library functions or external APIs. |
| * The following three different types of strings need to be distinguished as if they were different |
| * Java types: |
| * |
| * <ul> |
| * <li>Internal strings: All strings retained by Bazel and used in its inner layers are expected |
| * to be raw byte sequences stored in Latin-1 {@link String}s. With Java's compact string |
| * representation, this means that the Latin-1 bytes are stored directly in the internal byte |
| * array {@link String#value} and the {@link String#coder} is {@link String#LATIN1}. |
| * <li>Unicode strings: Regular Java strings, which are always Unicode. A common example is a |
| * {@code string} field in a protobuf message. |
| * <li>Platform strings: Strings that are passed to or returned from Java (N)IO functions or as |
| * command-line arguments or environment variables to the {@code java} binary at startup or |
| * processes started via {@link java.lang.ProcessBuilder}. These strings are encoded and |
| * decoded by the JVM according to its default native encoding, which is given by the {@code |
| * sun.jnu.encoding} system property. With the current JDK version (21), this is: |
| * <ul> |
| * <li>UTF-8 on macOS; |
| * <li>determined by the active code page on Windows (Cp1252 on US Windows, can be set to |
| * UTF-8 by the user); |
| * <li>determined by the current locale on Linux (forced to en_US.ISO-8859-1 by the client |
| * if available, otherwise usually UTF-8); |
| * <li>determined by the current locale on OpenBSD, which is always UTF-8. |
| * </ul> |
| * As a result, there are two cases to consider: |
| * <ul> |
| * <li>On Linux with a Latin-1 locale, platform strings are identical to internal strings |
| * and Java (N)IO functions can be used to operate with Unix API on a raw byte level. |
| * <li>In all other cases, platform strings are a subset of Unicode strings. |
| * </ul> |
| * </ul> |
| * |
| * <p>The static methods in this class efficiently reencode {@link String}s between these three |
| * "types". Crucially, since ASCII strings are encoded identically in ISO-8859-1 and UTF-8, such |
| * strings do not need to be reencoded. |
| */ |
| public final class StringEncoding { |
| |
| static { |
| try { |
| Field compactStrings = String.class.getDeclaredField("COMPACT_STRINGS"); |
| compactStrings.setAccessible(true); |
| Preconditions.checkState( |
| (boolean) compactStrings.get(null), "Bazel requires -XX:+CompactStrings"); |
| } catch (NoSuchFieldException | IllegalAccessException e) { |
| throw new IllegalStateException(e); |
| } |
| } |
| |
| /** |
| * Transforms an internal string into a platform string as efficiently as possible. |
| * |
| * <p>See the class documentation for more information on the different types of strings. |
| */ |
| public static String internalToPlatform(String s) { |
| return needsReencodeForPlatform(s) |
| ? new String(StringUnsafe.getInternalStringBytes(s), UTF_8) |
| : s; |
| } |
| |
| /** |
| * Transforms a platform string into an internal string as efficiently as possible. |
| * |
| * <p>See the class documentation for more information on the different types of strings. |
| */ |
| public static String platformToInternal(String s) { |
| return needsReencodeForPlatform(s) |
| ? StringUnsafe.newInstance(s.getBytes(UTF_8), StringUnsafe.LATIN1) |
| : s; |
| } |
| |
| /** |
| * Transforms an internal string into a Unicode string as efficiently as possible. |
| * |
| * <p>See the class documentation for more information on the different types of strings. |
| */ |
| public static String internalToUnicode(String s) { |
| return needsReencodeForUnicode(s) |
| ? new String(StringUnsafe.getInternalStringBytes(s), UTF_8) |
| : s; |
| } |
| |
| /** |
| * Transforms a Unicode string into an internal string as efficiently as possible. |
| * |
| * <p>See the class documentation for more information on the different types of strings. |
| */ |
| public static String unicodeToInternal(String s) { |
| return needsReencodeForUnicode(s) |
| ? StringUnsafe.newInstance(s.getBytes(UTF_8), StringUnsafe.LATIN1) |
| : s; |
| } |
| |
| /** |
| * The {@link Charset} with which the JVM encodes any strings passed to or returned from Java |
| * (N)IO functions, command-line arguments or environment variables. |
| */ |
| private static final boolean SUN_JNU_ENCODING_IS_ISO_8859_1 = |
| Charset.forName(System.getProperty("sun.jnu.encoding")).equals(ISO_8859_1); |
| |
| /** |
| * This only exists for RemoteWorker, which uses JavaIoFileSystem with Unicode strings and thus |
| * shouldn't be subject to any reencoding. |
| */ |
| private static final boolean BAZEL_UNICODE_STRINGS = |
| Boolean.getBoolean("bazel.internal.UnicodeStrings"); |
| |
| private static boolean needsReencodeForPlatform(String s) { |
| if (SUN_JNU_ENCODING_IS_ISO_8859_1 && OS.getCurrent() == OS.LINUX) { |
| // In this case, platform strings encode raw bytes and are thus identical to internal strings. |
| return false; |
| } |
| // Otherwise, platform strings are a subset of Unicode strings. |
| return needsReencodeForUnicode(s); |
| } |
| |
| private static boolean needsReencodeForUnicode(String s) { |
| if (BAZEL_UNICODE_STRINGS) { |
| return false; |
| } |
| return !StringUnsafe.isAscii(s); |
| } |
| |
| private StringEncoding() {} |
| } |