blob: 47577aed9081db3b0124cec1362fc96ac932ab16 [file] [log] [blame] [edit]
// Part of the Crubit project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef CRUBIT_SUPPORT_RUST_STD_CHAR_H_
#define CRUBIT_SUPPORT_RUST_STD_CHAR_H_
#include <array>
#include <cstddef>
#include <cstdint>
#include <optional>
#include "absl/base/optimization.h"
#include "absl/types/span.h"
#include "support/annotations.h"
#include "support/rs_std/str_ref.h"
namespace rs_std {
namespace internal {
// A call to this function is used to trigger a compiler error when a `char_`
// is constructed from a `uint32_t` that is not a valid Unicode code point.
//
// It is intentionally not-`constexpr` so that calls to it from a constexpr
// context will result in a compiler error.
inline void CharArgumentMustBeUnicodeCodePoint() {}
} // namespace internal
// The names like `from_u32` and `from_u32_unchecked` mimic Rust names and
// therefore don't need to be named the same as ordinary C++ names. See:
// https://google.github.io/styleguide/cppguide.html#Exceptions_to_Naming_Rules
// `rs_std::char_` is a C++ representation of the `char` type from Rust.
// `rust_builtin_type_abi_assumptions.md` documents the ABI compatibility of
// these types.
class CRUBIT_INTERNAL_RUST_TYPE("char") CRUBIT_INTERNAL_SAME_ABI char_ final {
public:
// Creates a default `char_` - one that represents ASCII NUL character.
//
// Providing the default constructor helps to ensure that the `value_` always
// effectively stores a C++ equivalent of a well-defined Rust's `u32` value
// (and never has a `MaybeUninit<u32>` value). See also the P2723R1 proposal
// for C++ which argues that zero-initialization may mitigate 10% of exploits.
constexpr char_() noexcept = default;
// Constant-time implicit constructor which converts a Unicode scalar value
// `uint32_t` into an `rs_std::char_`. This function performs compile-time
// validation that the `uint32_t` is a valid Unicode scalar value.
explicit consteval char_(char32_t c) noexcept : value_(c) {
if (!IsValidCodePoint(c)) {
internal::CharArgumentMustBeUnicodeCodePoint();
}
}
// Converts a `uint32_t` into a `rs_std::char_`.
//
// Note that not all valid `uint32_t`s are valid `rs_std::char_`s.
// `from_u32` will return `std::nullopt` if the input is not a valid value for
// a `rs_std::char_`.
//
// See also
// https://doc.rust-lang.org/reference/behavior-considered-undefined.html
// which documents that undefined behavior may result in presence of "A value
// in a char which is a surrogate or above char::MAX."
//
// This function mimics Rust's `char::from_u32`:
// https://doc.rust-lang.org/std/primitive.char.html#method.from_u32
static constexpr std::optional<char_> from_u32(char32_t c) noexcept {
if (!IsValidCodePoint(c)) {
return std::nullopt;
}
return from_u32_unchecked(c);
}
constexpr char_(const char_&) = default;
constexpr char_& operator=(const char_&) = default;
explicit constexpr operator std::uint32_t() const noexcept { return value_; }
friend constexpr bool operator==(char_ lhs, char_ rhs) noexcept;
friend constexpr auto operator<=>(char_ lhs, char_ rhs) noexcept;
// The highest valid code point a char can have, '\u{10FFFF}'.
//
// This constant mimics Rust's `char::MAX`:
// https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX
static const char_ MAX;
// Encodes the UTF-8 representation of this `char_` into the given
// `output_buffer`.
//
// The `output_buffer` must be large enough to hold the UTF-8 representation
// of this `char_`, which may be 1, 2, 3, or 4 bytes.
//
// This function mimics Rust's `char::encode_utf8`:
// https://doc.rust-lang.org/std/primitive.char.html#method.encode_utf8
StrRef encode_utf8(absl::Span<uint8_t> output_buffer) const;
// Returns the number of bytes required to UTF-8-encode this `char_`.
//
// This function mimics Rust's `char::len_utf8`:
// https://doc.rust-lang.org/std/primitive.char.html#method.len_utf8
constexpr size_t len_utf8() const noexcept {
if (value_ < 0x80) {
return 1;
} else if (value_ < 0x800) {
return 2;
} else if (value_ < 0x10000) {
return 3;
} else {
return 4;
}
}
private:
struct UnsafePromiseUnicode {};
// This function mimics Rust's `char::from_u32_unchecked`:
// https://doc.rust-lang.org/std/primitive.char.html#method.from_u32_unchecked
//
// TODO(b/254095482): Figure out how to annotate/expose unsafe functions in
// C++ and then make this method public.
static constexpr char_ from_u32_unchecked(uint32_t value) noexcept {
return char_(value, UnsafePromiseUnicode{});
}
static inline constexpr bool IsValidCodePoint(uint32_t c) noexcept {
// TODO(lukasza): Consider using slightly more efficient checks similarly
// to how `char_try_from_u32` is implemented in Rust standard library.
if (ABSL_PREDICT_FALSE(c > 0x10ffff)) {
// Value greater than Rust's `char::MAX`:
// https://doc.rust-lang.org/std/primitive.char.html#associatedconstant.MAX
return false;
}
if (ABSL_PREDICT_FALSE(c >= 0xd800 && c <= 0xdfff)) {
// Surrogate characters.
return false;
}
return true;
}
explicit constexpr char_(std::uint32_t value, UnsafePromiseUnicode)
: value_(value) {}
// See "layout tests" comments in `char_test.cc` for explanation why
// `char32_t` is not used.
std::uint32_t value_ = '\0';
};
// Definition of `char_::MAX` - it can't be defined and declared within the
// `class` definition, because before `char_` is fully defined the compiler
// complains that `constexpr` variable cannot have non-literal type
// 'const char_'.
constexpr char_ char_::MAX = char_::from_u32_unchecked(0x10ffff);
CRUBIT_DO_NOT_BIND constexpr bool operator==(char_ lhs, char_ rhs) noexcept {
return lhs.value_ == rhs.value_;
}
CRUBIT_DO_NOT_BIND constexpr auto operator<=>(char_ lhs, char_ rhs) noexcept {
return lhs.value_ <=> rhs.value_;
}
// Support automatic stringification with absl::StrCat and absl::StrFormat.
// This will append a UTF-8-encoded representation of `char_` to the `sink`.
//
// Note: this interface does not actually depend on absl. However, we may
// have difficulty if we ever want to upstream this interface to Rust itself.
// If that happens, we can consider adding this type as an optional
// dependency of absl, or otherwise extend the absl stringification mechanism
// in order to understand this type.
template <typename Sink>
void AbslStringify(Sink& sink, const char_& c) {
std::array<uint8_t, 4> buffer;
StrRef str = c.encode_utf8(absl::MakeSpan(buffer));
sink.Append(str.to_string_view());
}
} // namespace rs_std
#endif // CRUBIT_SUPPORT_RS_STD_CHAR_H_