blob: 1e42d7706dbc29a2a2a23fb58560a21ba3cb4fbb [file] [log] [blame]
// Copyright 2018 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.unsafe;
import com.google.common.base.Ascii;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.Arrays;
/**
* Provides direct access to the string implementation used by JDK9.
*
* <p>As of JDK9, a string is two fields: <code>byte coder</code>, and <code>byte[] value</code>.
* The <code>coder</code> field has value 0 if the encoding is LATIN-1, and 1 if the encoding is
* UTF-16 (the classic JDK8 encoding).
*
* <p>The <code>value</code> field contains the actual bytes.
*/
public final class StringUnsafe {
  /** Coder value reported by {@link #getCoder} for strings whose bytes are Latin-1 encoded. */
  public static final byte LATIN1 = 0;

  /** Coder value reported by {@link #getCoder} for strings whose bytes are UTF-16 encoded. */
  public static final byte UTF16 = 1;

  /** Handle on the non-public {@code String(byte[], byte)} constructor. */
  private static final MethodHandle STRING_CTOR;

  /** Handle on {@code java.lang.StringCoding#hasNegatives(byte[], int, int)}. */
  private static final MethodHandle HAS_NEGATIVES;

  /** Accessor for the {@code value} field of {@link String}. */
  private static final VarHandle VALUE_FIELD;

  /** Accessor for the {@code coder} field of {@link String}. */
  private static final VarHandle CODER_FIELD;

  static {
    // Resolve everything into locals first and publish to the final fields only once all of the
    // reflective lookups have succeeded.
    MethodHandle hasNegativesHandle;
    MethodHandle stringCtorHandle;
    VarHandle valueField;
    VarHandle coderField;
    try {
      MethodHandles.Lookup ownLookup = MethodHandles.lookup();

      Method hasNegativesMethod =
          Class.forName("java.lang.StringCoding")
              .getDeclaredMethod("hasNegatives", byte[].class, int.class, int.class);
      hasNegativesMethod.setAccessible(true);
      hasNegativesHandle = ownLookup.unreflect(hasNegativesMethod);

      Constructor<String> sharingCtor =
          String.class.getDeclaredConstructor(byte[].class, byte.class);
      sharingCtor.setAccessible(true);
      stringCtorHandle = ownLookup.unreflectConstructor(sharingCtor);

      // The fields are reached through a lookup that has private access to String itself.
      MethodHandles.Lookup insideString = MethodHandles.privateLookupIn(String.class, ownLookup);
      valueField = insideString.unreflectVarHandle(String.class.getDeclaredField("value"));
      coderField = insideString.unreflectVarHandle(String.class.getDeclaredField("coder"));
    } catch (ReflectiveOperationException e) {
      throw new IllegalStateException(e);
    }
    HAS_NEGATIVES = hasNegativesHandle;
    STRING_CTOR = stringCtorHandle;
    VALUE_FIELD = valueField;
    CODER_FIELD = coderField;
  }

  /**
   * Reads the coder of the given string.
   *
   * @param obj the string to inspect
   * @return the value of the string's {@code coder} field; compare against {@link #LATIN1} and
   *     {@link #UTF16}
   */
  public static byte getCoder(String obj) {
    // The cast is required: it fixes the call-site return type of the signature-polymorphic get.
    byte coder = (byte) CODER_FIELD.get(obj);
    return coder;
  }

  /**
   * Exposes the byte array backing the given string; its encoding is the one reported by {@link
   * #getCoder}.
   *
   * <p>This is unsafe: the internal representation may differ between JDK versions, and the
   * returned array is the string's own storage rather than a copy, so it must never be mutated.
   *
   * @param obj the string to inspect
   * @return the string's internal {@code value} array
   */
  public static byte[] getByteArray(String obj) {
    byte[] backing = (byte[]) VALUE_FIELD.get(obj);
    return backing;
  }

  /**
   * Exposes the backing byte array of a string that is expected to use Bazel's internal encoding
   * (see {@link com.google.devtools.build.lib.util.StringEncoding}).
   *
   * <p>This is unsafe: the internal representation may differ between JDK versions, and the
   * returned array must never be mutated.
   *
   * @param obj the internal string to inspect
   * @return the string's internal {@code value} array
   * @throws IllegalArgumentException if the string's coder is not {@link #LATIN1}
   */
  public static byte[] getInternalStringBytes(String obj) {
    // The coder check doubles as a fast path and a sanity check: an internal string is always
    // Latin-1 coded, so any other coder means it was built from a non-ASCII string that never
    // went through conversion to the internal encoding.
    if (getCoder(obj) == LATIN1) {
      return getByteArray(obj);
    }
    // Only an ASCII marker is appended when truncating, so the encoding is left alone.
    String shortened = Ascii.truncate(obj, 1000, "...");
    String byteDump = Arrays.toString(getByteArray(shortened));
    throw new IllegalArgumentException(
        String.format(
            "Expected internal string with Latin-1 coder, got: %s (%s)", shortened, byteDump));
  }

  /**
   * Tells whether every character of the given string is ASCII.
   *
   * @param obj the string to test
   * @return {@code true} if the string is Latin-1 coded and none of its bytes is negative
   */
  public static boolean isAscii(String obj) {
    // The scan is delegated to java.lang.StringCoding#hasNegatives, a JVM intrinsic. In a JMH
    // benchmark on a machine with 512-bit SIMD registers it was 5x as fast as a naive loop over
    // getByteArray(obj), and that loop was itself 5x as fast as
    // obj.chars().anyMatch(c -> c > 0x7F).
    if (getCoder(obj) == LATIN1) {
      byte[] latin1Bytes = getByteArray(obj);
      boolean anyNegative;
      try {
        anyNegative = (boolean) HAS_NEGATIVES.invokeExact(latin1Bytes, 0, latin1Bytes.length);
      } catch (Throwable t) {
        // hasNegatives itself does not throw, so reaching this point is unexpected.
        throw new IllegalStateException(t);
      }
      return !anyNegative;
    }
    // ASCII is a subset of Latin-1, so a string with any other coder holds non-ASCII characters.
    return false;
  }

  /**
   * Builds a string directly on top of the given byte array, without copying it.
   *
   * <p>The resulting string and the caller share the same array instance, so the array must not
   * be modified once it has been passed to this method.
   *
   * @param bytes the encoded content that becomes the string's storage
   * @param coder the encoding of {@code bytes}, either {@link #LATIN1} or {@link #UTF16}
   * @return a new string backed by {@code bytes}
   */
  public static String newInstance(byte[] bytes, byte coder) {
    final String created;
    try {
      created = (String) STRING_CTOR.invokeExact(bytes, coder);
    } catch (Throwable e) {
      // The constructor is not expected to throw, so treat any failure as a broken invariant.
      String failure = "Could not instantiate string: " + Arrays.toString(bytes) + ", " + coder;
      throw new IllegalStateException(failure, e);
    }
    return created;
  }

  /** Static utility holder; never instantiated. */
  private StringUnsafe() {}
}