// blob: 0b043893033adbcfb5b819e60a95dbb35c274efb [file] [log] [blame]
// Part of the Crubit project, under the Apache License v2.0 with LLVM
// Exceptions. See /LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
use arc_anyhow::{anyhow, ensure, Result};
use proc_macro2::{Ident, TokenStream};
use quote::{format_ident, quote, ToTokens};
use std::collections::{BTreeSet, HashSet};
use std::rc::Rc;
use std::sync::LazyLock;
pub fn is_cpp_reserved_keyword(ident: &str) -> bool {
RESERVED_CC_KEYWORDS.contains(ident)
}
/// Formats a C++ (qualified) identifier. Returns an error when `ident` is a C++
/// reserved keyword or is an invalid identifier.
pub fn format_cc_ident(ident: &str) -> Result<TokenStream> {
ensure!(!ident.is_empty(), "Empty string is not a valid C++ identifier");
// C++ doesn't have an equivalent of
// https://doc.rust-lang.org/rust-by-example/compatibility/raw_identifiers.html and therefore
// an error is returned when `ident` is a C++ reserved keyword.
ensure!(
!RESERVED_CC_KEYWORDS.contains(ident),
"`{}` is a C++ reserved keyword and can't be used as a C++ identifier",
ident
);
// https://en.cppreference.com/w/cpp/language/identifiers says that "A valid identifier must
// begin with a non-digit character (Latin letter, underscore, or Unicode
// character of class XID_Start)". One motivation for this check is to
// explicitly catch names of tuple fields (e.g. `some_tuple.0`).
let first_char = ident.chars().next().expect("!is_empty checked above");
ensure!(
unicode_ident::is_xid_start(first_char) || first_char == '_',
"The following character can't be used as a start of a C++ identifier: {first_char}",
);
ident.parse().map_err(
// Explicitly mapping the error via `anyhow!`, because `LexError` is not `Sync`
// (required for `anyhow::Error` to implement `From<LexError>`) and
// therefore we can't just use `?`.
|lex_error| anyhow!("Can't format `{ident}` as a C++ identifier: {lex_error}"),
)
}
/// Builds an `Ident` that can be used in Rust source code, switching to the
/// raw form (`r#...`) when `ident` is a Rust keyword.
///
/// # Panics
///
/// Panics if `ident` is empty or is otherwise an invalid identifier.
pub fn make_rs_ident(ident: &str) -> Ident {
    // TODO(https://github.com/dtolnay/syn/pull/1098): Remove the hardcoded list once syn recognizes
    // 2018 and 2021 keywords.
    const KEYWORDS_ESCAPED_UNCONDITIONALLY: [&str; 4] = ["async", "await", "try", "dyn"];

    // Anything that `syn` refuses to parse as a plain identifier is also
    // emitted in the raw form.
    let needs_raw_form = KEYWORDS_ESCAPED_UNCONDITIONALLY.contains(&ident)
        || syn::parse_str::<syn::Ident>(ident).is_err();
    if needs_raw_form {
        format_ident!("r#{}", ident)
    } else {
        format_ident!("{}", ident)
    }
}
/// Replaces characters that may not appear in a C++ or Rust identifier with
/// escape sequences.
///
/// The escaping is injective: different inputs always produce different
/// outputs, so unique symbols remain unique after escaping. Apart from that
/// guarantee the exact scheme is somewhat arbitrary; treat it as an
/// implementation detail and do not depend on it.
///
/// The result can be used as part of Rust and/or C++ identifiers. Note that in
/// practice Rust symbols do contain `$` and `.` characters, for example:
///
/// ```text
/// _ZN58_$LT$rust_out..Point$u20$as$u20$core..default..Default$GT$7default17h144069f0ad7be325E
/// ```
pub fn escape_non_identifier_chars(symbol: &str) -> String {
    // Haphazardly chosen from a single symbol encountered in practice (the one
    // quoted in the doc comment above), in which about 16 characters (`_`, `$`
    // and `.`) needed escaping.
    const EXTRA_CAPACITY_PREDICTION: usize = 20;

    let mut escaped = String::with_capacity(symbol.len() + EXTRA_CAPACITY_PREDICTION);
    for (byte_offset, ch) in symbol.char_indices() {
        // Only the very first character sits at byte offset 0.
        let is_leading = byte_offset == 0;
        match ch {
            '_' => escaped.push_str("_u"),
            '$' => escaped.push_str("_d"),
            '.' => escaped.push_str("_p"),
            // `is_xid_start` rejects `'_'`, which is fine because `'_'` has
            // already been handled by an earlier arm.
            _ if is_leading && unicode_ident::is_xid_start(ch) => escaped.push(ch),
            _ if !is_leading && unicode_ident::is_xid_continue(ch) => escaped.push(ch),
            // Everything else becomes `_x` followed by the code point as 8
            // zero-padded lowercase hex digits.
            _ => escaped.push_str(&format!("_x{:08x}", u32::from(ch))),
        }
    }
    escaped
}
/// A qualifier such as `foo::bar::baz`, stored as its individual components.
/// Every component is either the name of a C++ namespace or the name of a
/// Rust module.
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
pub struct NamespaceQualifier(pub Vec<Rc<str>>);

impl NamespaceQualifier {
    /// Builds a `NamespaceQualifier` out of a sequence of component names.
    pub fn new<T: Into<Rc<str>>>(iter: impl IntoIterator<Item = T>) -> Self {
        // TODO(b/258265044): Catch most (all if possible) error conditions early. For
        // example:
        // - Panic early if any strings are empty, or are not Rust identifiers
        // - Report an error early if any strings are C++ reserved keywords
        // This may make `format_for_cc`, `format_with_cc_body`, and
        // `format_namespace_bound_cc_tokens` infallible.
        let components: Vec<Rc<str>> = iter.into_iter().map(|name| name.into()).collect();
        Self(components)
    }

    /// Formats the qualifier for Rust as `foo::bar::baz::`, escaping Rust
    /// keywords as needed.
    pub fn format_for_rs(&self) -> TokenStream {
        let rs_segments = self.0.iter().map(|segment| make_rs_ident(segment));
        quote! { #(#rs_segments::)* }
    }

    /// Formats the qualifier for C++ as `foo::bar::baz::`.
    ///
    /// Returns an error when a component can't be formatted as a C++
    /// identifier (e.g. because it is a C++ reserved keyword).
    pub fn format_for_cc(&self) -> Result<TokenStream> {
        self.cc_idents().map(|cc_segments| quote! { #(#cc_segments::)* })
    }

    /// Wraps `body` into `namespace foo::bar::baz { ... }`, with `attributes`
    /// placed in front of the `namespace` keyword.
    ///
    /// An empty qualifier returns `body` unchanged (and ignores
    /// `attributes`). Returns an error when a component can't be formatted as
    /// a C++ identifier.
    pub fn format_with_cc_body(
        &self,
        body: TokenStream,
        attributes: Vec<TokenStream>,
    ) -> Result<TokenStream> {
        // Nothing to wrap the body into at the top level.
        if self.0.is_empty() {
            return Ok(body);
        }

        let cc_segments = self.cc_idents()?;
        Ok(quote! {
            __NEWLINE__ #(#attributes)* namespace #(#cc_segments)::* { __NEWLINE__
                #body
            __NEWLINE__ } __NEWLINE__
        })
    }

    /// Formats every component with `format_cc_ident`, stopping at (and
    /// returning) the first error.
    pub fn cc_idents(&self) -> Result<Vec<TokenStream>> {
        let mut cc_segments = Vec::with_capacity(self.0.len());
        for segment in &self.0 {
            cc_segments.push(format_cc_ident(segment)?);
        }
        Ok(cc_segments)
    }
}
/// `CcInclude` represents a single `#include ...` directive in C++.
///
/// The declaration order of the variants matters: the derived `Ord` compares
/// the variant first (in declaration order) and only then the payload, and
/// `format_cc_includes` renders a `BTreeSet<CcInclude>` in that order.
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum CcInclude {
/// Represents a system header, e.g., `cstddef`, which will be included with
/// angle brackets.
SystemHeader(&'static str),
/// Represents an Abseil header, e.g., `base/no_destructor.h`. The Abseil
/// directory prefix is prepended when the include is rendered.
AbseilHeader(&'static str),
/// Represents a user header, which will be included with quotes.
UserHeader(Rc<str>),
/// Represents an `#include` for Crubit C++ support library headers. The
/// first field is the format specifier for what comes after `#include`
/// (its `{header}` placeholder is substituted when the include is
/// rendered); the second field is the path of the support library header.
SupportLibHeader(Rc<str>, Rc<str>),
}
impl CcInclude {
/// Creates a `CcInclude` that represents `#include <cstddef>` and provides
/// C++ types like `std::size_t` or `std::ptrdiff_t`. See
/// https://en.cppreference.com/w/cpp/header/cstddef
pub fn cstddef() -> Self {
Self::SystemHeader("cstddef")
}
/// Creates a `CcInclude` that represents `#include <cstdint>` and provides
/// C++ types like `std::int16_t` or `std::uint32_t`. See
/// https://en.cppreference.com/w/cpp/header/cstdint
pub fn cstdint() -> Self {
Self::SystemHeader("cstdint")
}
/// Creates a `CcInclude` that represents `#include <memory>`.
/// See https://en.cppreference.com/w/cpp/header/memory
pub fn memory() -> Self {
Self::SystemHeader("memory")
}
/// Creates a `CcInclude` that represents `#include <utility>` and provides
/// C++ functions like `std::move` and C++ types like `std::tuple`.
/// See https://en.cppreference.com/w/cpp/header/utility
pub fn utility() -> Self {
Self::SystemHeader("utility")
}
/// Creates a `CcInclude` that represents `#include <type_traits>` and
/// provides C++ APIs like `std::is_trivially_copy_constructible_v`.
/// See https://en.cppreference.com/w/cpp/header/type_traits
pub fn type_traits() -> Self {
Self::SystemHeader("type_traits")
}
/// Creates a user include: `#include "some/path/to/header.h"`.
pub fn user_header(path: Rc<str>) -> Self {
Self::UserHeader(path)
}
/// Creates an abseil include: `#include "third_party/absl/foo.h"`.
pub fn absl_header(path: &'static str) -> Self {
Self::AbseilHeader(path)
}
/// Creates a support library header include based on the specified format.
/// E.g., `\"{header}\"` and `hdr.h` produces `#include "hdr.h"`.
pub fn support_lib_header(format: Rc<str>, path: Rc<str>) -> Self {
Self::SupportLibHeader(format, path)
}
}
impl ToTokens for CcInclude {
    /// Appends `__HASH_TOKEN__ include <target> __NEWLINE__` to `tokens`,
    /// where the target depends on the variant:
    /// - `SystemHeader`: the path between `<` and `>`,
    /// - `AbseilHeader`: a string literal of the path behind the Abseil prefix,
    /// - `UserHeader`: a string literal of the path,
    /// - `SupportLibHeader`: the format with `{header}` replaced by the path.
    ///
    /// # Panics
    ///
    /// Panics if a system header path or an expanded support library format
    /// can't be parsed into a `TokenStream`.
    fn to_tokens(&self, tokens: &mut TokenStream) {
        // First work out what follows `#include`...
        let target: TokenStream = match self {
            Self::SystemHeader(path) => {
                let path: TokenStream = path
                    .parse()
                    .expect("`pub` API of `CcInclude` guarantees validity of system includes");
                quote! { < #path > }
            }
            Self::AbseilHeader(path) => {
                // TODO(b/368434564): Remove the copybara transform here.
                // In google, these are prefixed with "third_party/".
                // copybara:strip_begin
                const PREFIX: &str = "third_party/absl";
                /* copybara:strip_end_and_replace
                const PREFIX: &str = "absl";
                */
                let path = format!("{PREFIX}/{path}");
                quote! { #path }
            }
            Self::UserHeader(path) => quote! { #path },
            Self::SupportLibHeader(format, path) => format
                .replace("{header}", path)
                .parse::<TokenStream>()
                .expect("Failed to parse support lib `#include` path"),
        };
        // ...and then emit the directive itself in one place.
        quote! { __HASH_TOKEN__ include #target __NEWLINE__ }.to_tokens(tokens)
    }
}
/// Renders every `CcInclude` of `set_of_includes` (in the iteration order of
/// the set), trying to follow the guidance from
/// [the Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html#Names_and_Order_of_Includes).
///
/// An extra `__NEWLINE__` is emitted only where a `SystemHeader` is
/// immediately followed by a `UserHeader`.
pub fn format_cc_includes(set_of_includes: &BTreeSet<CcInclude>) -> TokenStream {
    let mut rendered = TokenStream::default();
    let mut remaining = set_of_includes.iter().peekable();
    while let Some(current) = remaining.next() {
        current.to_tokens(&mut rendered);

        // Separate the system headers from the user headers with an empty line.
        let leaves_system_headers = matches!(current, CcInclude::SystemHeader(_));
        let enters_user_headers = matches!(remaining.peek(), Some(CcInclude::UserHeader(_)));
        if leaves_system_headers && enters_user_headers {
            quote! { __NEWLINE__ }.to_tokens(&mut rendered)
        }
    }
    rendered
}
/// The set of C++ reserved keywords, built lazily on first use.
///
/// The entries are based on https://en.cppreference.com/w/cpp/keyword
static RESERVED_CC_KEYWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // a
        "alignas", "alignof", "and", "and_eq", "asm",
        "atomic_cancel", "atomic_commit", "atomic_noexcept", "auto",
        // b
        "bitand", "bitor", "bool", "break",
        // c
        "case", "catch", "char", "char8_t", "char16_t", "char32_t",
        "class", "compl", "concept", "const", "consteval", "constexpr",
        "constinit", "const_cast", "continue", "co_await", "co_return", "co_yield",
        // d
        "decltype", "default", "delete", "do", "double", "dynamic_cast",
        // e
        "else", "enum", "explicit", "export", "extern",
        // f
        "false", "float", "for", "friend",
        // g
        "goto",
        // i
        "if", "inline", "int",
        // l
        "long",
        // m
        "mutable",
        // n
        "namespace", "new", "noexcept", "not", "not_eq", "nullptr",
        // o
        "operator", "or", "or_eq",
        // p
        "private", "protected", "public",
        // r
        "reflexpr", "register", "reinterpret_cast", "requires", "return",
        // s
        "short", "signed", "sizeof", "static", "static_assert", "static_cast",
        "struct", "switch", "synchronized",
        // t
        "template", "this", "thread_local", "throw", "true", "try",
        "typedef", "typeid", "typename",
        // u
        "union", "unsigned", "using",
        // v
        "virtual", "void", "volatile",
        // w
        "wchar_t", "while",
        // x
        "xor", "xor_eq",
    ])
});
/// Unit tests for the identifier, namespace and `#include` formatting helpers
/// of this file.
#[cfg(test)]
pub mod tests {
    use super::*;
    use googletest::prelude::*;
    use itertools::Itertools;
    use quote::quote;
    use token_stream_matchers::{assert_cc_matches, assert_rs_matches};
    use token_stream_printer::cc_tokens_to_formatted_string_for_tests;

    #[gtest]
    fn test_format_cc_ident_basic() {
        assert_cc_matches!(format_cc_ident("foo").unwrap(), quote! { foo });
    }

    #[gtest]
    fn test_format_cc_ident_exotic_xid_start() {
        assert_cc_matches!(format_cc_ident("Łukasz").unwrap(), quote! { Łukasz });
    }

    #[gtest]
    fn test_format_cc_ident_underscore() {
        assert_cc_matches!(format_cc_ident("_").unwrap(), quote! { _ });
    }

    /// A Rust keyword is not special in C++ and is passed through as is.
    #[gtest]
    fn test_format_cc_ident_reserved_rust_keyword() {
        assert_cc_matches!(format_cc_ident("impl").unwrap(), quote! { impl });
    }

    #[gtest]
    fn test_format_cc_ident_reserved_cc_keyword() {
        let err = format_cc_ident("reinterpret_cast").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("`reinterpret_cast`"));
        assert!(msg.contains("C++ reserved keyword"));
    }

    #[gtest]
    fn test_format_cc_ident_unparseable_identifier() {
        let err = format_cc_ident("foo)").unwrap_err();
        let msg = err.to_string();
        assert!(msg.contains("Can't format `foo)` as a C++ identifier"));
        assert!(msg.contains("cannot parse"));
    }

    #[gtest]
    fn test_format_cc_ident_unqualified_identifiers() {
        // https://en.cppreference.com/w/cpp/language/identifiers#Unqualified_identifiers
        // These may appear in `IR::Func::name`.
        assert_cc_matches!(format_cc_ident("operator==").unwrap(), quote! { operator== });
        assert_cc_matches!(format_cc_ident("operator new").unwrap(), quote! { operator new });
        // This may appear in `IR::Record::cc_name` (although in practice these will
        // be namespace-qualified most of the time).
        assert_cc_matches!(format_cc_ident("MyTemplate<int>").unwrap(), quote! { MyTemplate<int> });
    }

    /// https://en.cppreference.com/w/cpp/language/identifiers#Qualified_identifiers
    ///
    /// This may appear in `IR::Record::cc_name`, or in
    /// `__crubit::annotate(cpp_type=...)`.
    #[gtest]
    fn test_format_cc_ident_qualified_identifiers() {
        assert_cc_matches!(
            format_cc_ident("std::vector<int>").unwrap(),
            quote! { std::vector<int> }
        );
    }

    #[gtest]
    fn test_format_cc_ident_empty() {
        let err = format_cc_ident("").unwrap_err();
        let msg = err.to_string();
        assert_eq!(msg, "Empty string is not a valid C++ identifier");
    }

    #[gtest]
    fn test_format_cc_ident_invalid_first_char() {
        let tests = vec![
            // `0` and `1` are field names in `struct RustStruct(i32, u16)`.
            "0",
            // `~MyClass` is a valid unqualified identifier in C++, but it is okay if
            // `format_cc_ident` rejects it, because `format_cc_ident` is not used to format
            // destructor names.
            "~MyClass",
            // We used to trim leading and/or trailing whitespace, but stricter validation
            // of leading whitespace seems desirable.
            r#" operator "" _km "#,
            " foo",
            // Other tests
            "(foo",
            "(foo)",
        ];
        for test in tests.into_iter() {
            let err = format_cc_ident(test).unwrap_err();
            let actual_msg = err.to_string();
            let c = test.chars().next().unwrap();
            let expected_msg = format!(
                "The following character can't be used as a start of a C++ identifier: {c}"
            );
            assert_eq!(actual_msg, expected_msg);
        }
    }

    #[gtest]
    fn test_make_rs_ident_basic() {
        let id = make_rs_ident("foo");
        assert_rs_matches!(quote! { #id }, quote! { foo });
    }

    /// A C++ keyword is not special in Rust and is passed through as is.
    #[gtest]
    fn test_make_rs_ident_reserved_cc_keyword() {
        let id = make_rs_ident("reinterpret_cast");
        assert_rs_matches!(quote! { #id }, quote! { reinterpret_cast });
    }

    #[gtest]
    fn test_make_rs_ident_reserved_rust_keyword() {
        let id = make_rs_ident("impl");
        assert_rs_matches!(quote! { #id }, quote! { r#impl });
    }

    #[gtest]
    #[should_panic]
    fn test_make_rs_ident_unfinished_group() {
        make_rs_ident("(foo"); // No closing `)`.
    }

    #[gtest]
    #[should_panic]
    fn test_make_rs_ident_empty() {
        make_rs_ident("");
    }

    #[gtest]
    fn test_cc_include_to_tokens_for_system_header() {
        let include = CcInclude::cstddef();
        assert_cc_matches!(
            quote! { #include },
            quote! {
                __HASH_TOKEN__ include <cstddef>
            }
        );
    }

    #[gtest]
    fn test_cc_include_to_tokens_for_user_header() {
        let include = CcInclude::user_header("some/path/to/header.h".into());
        assert_cc_matches!(
            quote! { #include },
            quote! {
                __HASH_TOKEN__ include "some/path/to/header.h"
            }
        );
    }

    /// System headers sort before user headers; within a kind, includes sort
    /// by path.
    #[gtest]
    fn test_cc_include_ord() {
        let cstddef = CcInclude::cstddef();
        let memory = CcInclude::memory();
        let a = CcInclude::user_header("a.h".into());
        let b = CcInclude::user_header("b.h".into());
        assert!(cstddef < memory);
        assert!(cstddef < a);
        assert!(cstddef < b);
        assert!(memory < a);
        assert!(memory < b);
        assert!(a < b);
    }

    #[gtest]
    fn test_format_cc_includes() {
        let includes = [
            CcInclude::cstddef(),
            CcInclude::memory(),
            CcInclude::user_header("a.h".into()),
            CcInclude::user_header("b.h".into()),
        ]
        .into_iter()
        .collect::<BTreeSet<_>>();
        let tokens = format_cc_includes(&includes);
        let actual =
            cc_tokens_to_formatted_string_for_tests(quote! { __NEWLINE__ #tokens }).unwrap();
        // NOTE(review): `format_cc_includes` emits an extra `__NEWLINE__` between the
        // last system header and the first user header; confirm whether the expected
        // text below should contain an empty line after `#include <memory>`.
        assert_eq!(
            actual,
            r#"
#include <cstddef>
#include <memory>
#include "a.h"
#include "b.h"
"#
        );
    }

    #[gtest]
    fn test_namespace_qualifier_empty() {
        let ns = NamespaceQualifier::new::<&str>([]);
        let actual_rs = ns.format_for_rs();
        assert!(actual_rs.is_empty());
        let actual_cc = ns.format_for_cc().unwrap();
        assert!(actual_cc.is_empty());
    }

    #[gtest]
    fn test_namespace_qualifier_basic() {
        let ns = NamespaceQualifier::new(["foo", "bar"]);
        let actual_rs = ns.format_for_rs();
        assert_rs_matches!(actual_rs, quote! { foo::bar:: });
        let actual_cc = ns.format_for_cc().unwrap();
        assert_cc_matches!(actual_cc, quote! { foo::bar:: });
    }

    /// `impl` is a Rust keyword: it is escaped for Rust and passed through for
    /// C++.
    #[gtest]
    fn test_namespace_qualifier_reserved_rust_keyword() {
        let ns = NamespaceQualifier::new(["foo", "impl", "bar"]);
        let actual_rs = ns.format_for_rs();
        assert_rs_matches!(actual_rs, quote! { foo :: r#impl :: bar :: });
        let actual_cc = ns.format_for_cc().unwrap();
        assert_cc_matches!(actual_cc, quote! { foo::impl::bar:: });
    }

    /// `reinterpret_cast` is a C++ keyword: it is passed through for Rust and
    /// rejected for C++.
    #[gtest]
    fn test_namespace_qualifier_reserved_cc_keyword() {
        let ns = NamespaceQualifier::new(["foo", "reinterpret_cast", "bar"]);
        let actual_rs = ns.format_for_rs();
        assert_rs_matches!(actual_rs, quote! { foo :: reinterpret_cast :: bar :: });
        let cc_error = ns.format_for_cc().unwrap_err();
        let msg = cc_error.to_string();
        assert!(msg.contains("`reinterpret_cast`"));
        assert!(msg.contains("C++ reserved keyword"));
    }

    #[gtest]
    fn test_namespace_qualifier_format_with_cc_body_top_level_namespace() {
        let ns = NamespaceQualifier::new::<&str>([]);
        assert_cc_matches!(
            ns.format_with_cc_body(quote! { cc body goes here }, vec![]).unwrap(),
            quote! { cc body goes here },
        );
    }

    #[gtest]
    fn test_namespace_qualifier_format_with_cc_body_nested_namespace() {
        let ns = NamespaceQualifier::new(["foo", "bar", "baz"]);
        assert_cc_matches!(
            ns.format_with_cc_body(quote! { cc body goes here }, vec![]).unwrap(),
            quote! {
                namespace foo::bar::baz {
                    cc body goes here
                }  // namespace foo::bar::baz
            },
        );
    }

    #[gtest]
    fn test_format_cc_include_support_lib_header() {
        let tests = vec![
            (
                "\"crubit/support/path/for/test/{header}\"",
                "header.h",
                "\"crubit/support/path/for/test/header.h\"",
            ),
            (
                "\"crubit/support/path/for/test/{header}\"",
                "subdir/header.h",
                "\"crubit/support/path/for/test/subdir/header.h\"",
            ),
            (
                "<crubit/support/path/for/test/{header}>",
                "header.h",
                "<crubit/support/path/for/test/header.h>",
            ),
            ("\"{header}\"", "header.h", "\"header.h\""),
        ];
        for (support_path_format, header, expected_output) in tests.iter() {
            let header = CcInclude::support_lib_header(
                support_path_format.to_string().into(),
                header.to_string().into(),
            );
            let mut actual_tokens = TokenStream::default();
            header.to_tokens(&mut actual_tokens);
            let expected_output: TokenStream =
                expected_output.parse().expect("Failed to convert expected_output to TokenStream");
            assert_cc_matches!(
                actual_tokens,
                quote! {
                    __HASH_TOKEN__ include #expected_output
                }
            );
        }
    }

    #[gtest]
    fn test_escape_non_identifier_chars() {
        let tests = vec![
            ("", ""),
            ("foo", "foo"),
            ("0abc", "_x00000030abc"),
            ("abc$xyz", "abc_dxyz"),
            ("abc.xyz", "abc_pxyz"),
            ("abc_xyz", "abc_uxyz"),
            ("abc🦀xyz", "abc_x0001f980xyz"),
            // With an escaping scheme like `$` => "_d", `<utf8 dd80 char>` => "_dd80", the
            // following 2 tests would fail the injectivity requirement (they both would map to
            // "_dd80"):
            ("$d80", "_dd80"),
            ("\u{740}", "_x00000740"),
        ];
        for (input, expected_output) in tests.iter() {
            let actual_output = escape_non_identifier_chars(input);
            assert_eq!(&actual_output, expected_output);
        }
        // Asserting that each distinct, unique test input should result in a unique,
        // non-duplicated output. (This can be seen as a rather lightweight and
        // indirect verification of the injectivity requirement.)
        let duplicate_expectations =
            tests.iter().map(|(_, expected)| *expected).duplicates().collect_vec();
        let empty_vec: Vec<&'static str> = vec![];
        assert_eq!(empty_vec, duplicate_expectations);
    }
}