// blob: ae5cb8a280dd2c66e6adb3d38d74c090d9f5d2bb [file] [log] [blame]
// Copyright 2024 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.util;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.TruthJUnit.assume;
import static com.google.devtools.build.lib.util.StringEncoding.internalToPlatform;
import static com.google.devtools.build.lib.util.StringEncoding.internalToUnicode;
import static com.google.devtools.build.lib.util.StringEncoding.platformToInternal;
import static com.google.devtools.build.lib.util.StringEncoding.unicodeToInternal;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.devtools.build.lib.unsafe.StringUnsafe;
import com.google.testing.junit.testparameterinjector.TestParameter;
import com.google.testing.junit.testparameterinjector.TestParameterInjector;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import org.junit.Test;
import org.junit.runner.RunWith;
/**
 * Tests for the conversion helpers statically imported from {@code StringEncoding}: {@code
 * unicodeToInternal}, {@code internalToUnicode}, {@code platformToInternal} and {@code
 * internalToPlatform}.
 *
 * <p>Throughout these tests the expected internal form of a Unicode string is its UTF-8 byte
 * sequence re-read as Latin-1, one char per byte.
 */
@RunWith(TestParameterInjector.class)
public class StringEncodingTest {
  /** Charset named by the {@code sun.jnu.encoding} system property of the running JVM. */
  public static final Charset SUN_JNU_ENCODING =
      Charset.forName(System.getProperty("sun.jnu.encoding"));

  /** Inputs that every conversion under test is expected to hand back as the same object. */
  private static final String[] SAME_INSTANCE_SAMPLES = {"", "hello"};

  /** Non-ASCII Unicode inputs used by the fixed-value conversion tests. */
  private static final String[] NON_ASCII_SAMPLES = {"hällo", "hållo", "h👋llo"};

  @Test
  public void testUnicodeToInternal() {
    for (String unchanged : SAME_INSTANCE_SAMPLES) {
      assertThat(unicodeToInternal(unchanged)).isSameInstanceAs(unchanged);
    }
    for (String unicode : NON_ASCII_SAMPLES) {
      assertThat(unicodeToInternal(unicode)).isEqualTo(utf8BytesAsLatin1(unicode));
    }
  }

  @Test
  public void testInternalToUnicode() {
    for (String unchanged : SAME_INSTANCE_SAMPLES) {
      assertThat(internalToUnicode(unchanged)).isSameInstanceAs(unchanged);
    }
    for (String unicode : NON_ASCII_SAMPLES) {
      assertThat(internalToUnicode(utf8BytesAsLatin1(unicode))).isEqualTo(unicode);
    }
  }

  @Test
  public void testPlatformToInternal() {
    boolean expectIdentity = isLinuxWithLatin1Encoding();
    for (String unchanged : SAME_INSTANCE_SAMPLES) {
      assertThat(platformToInternal(unchanged)).isSameInstanceAs(unchanged);
    }
    if (!expectIdentity) {
      // Platform strings are treated like Unicode strings here.
      for (String unicode : NON_ASCII_SAMPLES) {
        assertThat(platformToInternal(unicode)).isEqualTo(utf8BytesAsLatin1(unicode));
      }
      return;
    }
    // Linux with a Latin-1 locale: the very same object is expected back.
    for (String raw : latin1RawSamples()) {
      assertThat(platformToInternal(raw)).isSameInstanceAs(raw);
    }
  }

  @Test
  public void testInternalToPlatform() {
    boolean expectIdentity = isLinuxWithLatin1Encoding();
    for (String unchanged : SAME_INSTANCE_SAMPLES) {
      assertThat(internalToPlatform(unchanged)).isSameInstanceAs(unchanged);
    }
    if (!expectIdentity) {
      // Platform strings are treated like Unicode strings here.
      for (String unicode : NON_ASCII_SAMPLES) {
        assertThat(internalToPlatform(utf8BytesAsLatin1(unicode))).isEqualTo(unicode);
      }
      return;
    }
    // Linux with a Latin-1 locale: the very same object is expected back.
    for (String raw : latin1RawSamples()) {
      assertThat(internalToPlatform(raw)).isSameInstanceAs(raw);
    }
  }

  @Test
  public void testPlatformToInternal_roundtrip(
      @TestParameter({"ascii", "äöüÄÖÜß", "🌱", "羅勒罗勒学名"}) String s) {
    // Skip inputs the platform charset cannot represent at all.
    assume().that(canEncode(s, SUN_JNU_ENCODING)).isTrue();

    String internalForm = platformToInternal(s);
    // In the internal encoding, raw bytes are encoded as Latin-1.
    assertThat(StringUnsafe.getInstance().getCoder(internalForm)).isEqualTo(StringUnsafe.LATIN1);

    String backOnPlatform = internalToPlatform(internalForm);
    if (!StringUnsafe.getInstance().isAscii(s)) {
      assertThat(backOnPlatform).isEqualTo(s);
      return;
    }
    // ASCII input is expected to survive the round trip as the identical object.
    assertThat(backOnPlatform).isSameInstanceAs(s);
  }

  @Test
  public void testPlatformToInternal_rawBytesRoundtrip() {
    // Not valid UTF-8
    byte[] rawBytes = {0x00, 0x7F, (byte) 0x80, (byte) 0xFE, (byte) 0xFF};
    assertThat(canDecode(rawBytes, UTF_8)).isFalse();

    // Roundtripping raw bytes through the internal encoding requires Linux and a Latin-1 locale.
    assume().that(OS.getCurrent()).isEqualTo(OS.LINUX);
    assume().that(SUN_JNU_ENCODING).isEqualTo(ISO_8859_1);

    String platformForm = new String(rawBytes, ISO_8859_1);
    String internalForm = platformToInternal(platformForm);
    assertThat(internalForm).isSameInstanceAs(platformForm);

    String backOnPlatform = internalToPlatform(internalForm);
    assertThat(backOnPlatform).isSameInstanceAs(internalForm);
  }

  @Test
  public void testUnicodeToInternal_roundtrip(
      @TestParameter({"ascii", "äöüÄÖÜß", "🌱", "羅勒罗勒学名"}) String s) {
    String internalForm = unicodeToInternal(s);
    // In the internal encoding, raw bytes are encoded as Latin-1.
    assertThat(StringUnsafe.getInstance().getCoder(internalForm)).isEqualTo(StringUnsafe.LATIN1);

    String backAsUnicode = internalToUnicode(internalForm);
    if (!StringUnsafe.getInstance().isAscii(s)) {
      assertThat(backAsUnicode).isEqualTo(s);
      return;
    }
    // ASCII input is expected to survive the round trip as the identical object.
    assertThat(backAsUnicode).isSameInstanceAs(s);
  }

  /** Returns the UTF-8 bytes of {@code unicode} re-read as a Latin-1 string. */
  private static String utf8BytesAsLatin1(String unicode) {
    byte[] utf8 = unicode.getBytes(UTF_8);
    return new String(utf8, ISO_8859_1);
  }

  /**
   * Tells whether {@code sun.jnu.encoding} is Latin-1 and the current OS is Linux, the
   * configuration in which the platform conversion tests expect their argument back unchanged.
   */
  private static boolean isLinuxWithLatin1Encoding() {
    return SUN_JNU_ENCODING.equals(ISO_8859_1) && OS.getCurrent() == OS.LINUX;
  }

  /**
   * Builds fresh Latin-1 strings for the identity checks: the non-ASCII samples in their
   * UTF-8-as-Latin-1 form, followed by a byte sequence that is not valid Unicode.
   */
  private static String[] latin1RawSamples() {
    return new String[] {
      utf8BytesAsLatin1(NON_ASCII_SAMPLES[0]),
      utf8BytesAsLatin1(NON_ASCII_SAMPLES[1]),
      utf8BytesAsLatin1(NON_ASCII_SAMPLES[2]),
      // Not valid Unicode.
      new String(new byte[] {(byte) 0xFF, (byte) 0xFE, 0x01}, ISO_8859_1),
    };
  }

  /** Returns whether a fresh encoder of {@code charset} encodes {@code s} without error. */
  private static boolean canEncode(String s, Charset charset) {
    boolean encodable = true;
    try {
      charset.newEncoder().encode(CharBuffer.wrap(s));
    } catch (CharacterCodingException e) {
      encodable = false;
    }
    return encodable;
  }

  /** Returns whether a fresh decoder of {@code charset} decodes {@code bytes} without error. */
  private static boolean canDecode(byte[] bytes, Charset charset) {
    boolean decodable = true;
    try {
      charset.newDecoder().decode(ByteBuffer.wrap(bytes));
    } catch (CharacterCodingException e) {
      decodable = false;
    }
    return decodable;
  }
}