bazel syntax: fine-grained syntax locations
This change improves the precision with which the locations
of source tokens are recorded in the syntax tree. Prior to
this change, every Node held a single LexerLocation object
that recorded the start and end offsets of the node, plus
a reference to the shared LineNumberTable (LNT), which maps
these offsets to Locations. This had a cost of one reference
and one LexerLocation object per node.
This change causes every Node to record the offsets only of
its salient tokens, plus a reference to the LNT. For example,
in the expression "1 + 2", the only salient token is the plus
operator; the start and end offsets can be computed inductively
by delegating to x.getStartLocation and y.getEndLocation,
where x and y are the left and right operands.
Similarly, in f(x), the salient tokens are '(' and ')'.
This has a cost of 1 word plus approximately 1 int per Node.
Consequently, we can record the exact position of operators
that fail, and do so using less memory than before.
Now, when an expression such as 'f().g() + 1' fails,
the location in the error message will refer to the '+'
operator or one of the two '(' tokens. Before, all
three errors would be wrongly reported at the same place:
f, since it is the start of all three subexpressions.
Overview:
- Every Node has a reference to the LNT, set immediately
after construction. (Morally it is part of the constructor
but it's fussy to set it that way.)
- Every node defines getStartOffset and getEndOffset,
typically by delegating to its left and right subtrees.
- Node end offsets are exclusive again. CL 170723732 was a mistake:
half-open intervals are mathematically simpler.
A client that wants an inclusive end offset may subtract one,
but there are currently no such clients.
- Comprehension.{For,If} are now true Nodes.
- StarlarkFile's extent is now (correctly) the entire file,
not just the range from the first statement to the last.
- The parser provides offsets of salient tokens to the Node constructors.
- IntegerLiteral now retains the raw token text in addition to the value.
- Token is gone. Its four fields are now embedded in the Lexer.
- Eval uses the following token positions in run-time error messages:
    x+y   f(x)   x[i]   x.y   x[i:j]   k: v
     ^     ^      ^      ^     ^        ^
- Location is final. LexerLocation and LineAndColumn are gone.
- Misparsed source represented as an Identifier now has the text of the
source instead of "$error$". This is more faithful and causes
the offsets to be correct.
- The offsets of the orig Identifier in load("module", local="orig")
coincide with the text 'orig', sans quotation marks.
Benchmark: saves about 65MB (1% of live RAM) retained by the
Usual Benchmark, a deps query.
RELNOTES: N/A
PiperOrigin-RevId: 305803031
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/StarlarkFile.java b/src/main/java/com/google/devtools/build/lib/syntax/StarlarkFile.java
index b5d222d..237cd19 100644
--- a/src/main/java/com/google/devtools/build/lib/syntax/StarlarkFile.java
+++ b/src/main/java/com/google/devtools/build/lib/syntax/StarlarkFile.java
@@ -34,46 +34,59 @@
final List<SyntaxError> errors; // appended to by ValidationEnvironment
@Nullable private final String contentHashCode;
+ @Override
+ public int getStartOffset() {
+ return 0;
+ }
+
+ @Override
+ public int getEndOffset() {
+ return lnt.size();
+ }
+
private StarlarkFile(
ImmutableList<Statement> statements,
FileOptions options,
ImmutableList<Comment> comments,
List<SyntaxError> errors,
- String contentHashCode,
- Lexer.LexerLocation location) {
+ String contentHashCode) {
this.statements = statements;
this.options = options;
this.comments = comments;
this.errors = errors;
this.contentHashCode = contentHashCode;
- this.setLocation(location);
}
// Creates a StarlarkFile from the given effective list of statements,
// which may include the prelude.
private static StarlarkFile create(
+ LineNumberTable lnt,
ImmutableList<Statement> statements,
FileOptions options,
Parser.ParseResult result,
String contentHashCode) {
- return new StarlarkFile(
- statements,
- options,
- ImmutableList.copyOf(result.comments),
- result.errors,
- contentHashCode,
- result.location);
+ StarlarkFile file =
+ new StarlarkFile(
+ statements,
+ options,
+ ImmutableList.copyOf(result.comments),
+ result.errors,
+ contentHashCode);
+ file.lnt = lnt;
+ return file;
}
/** Extract a subtree containing only statements from i (included) to j (excluded). */
public StarlarkFile subTree(int i, int j) {
- return new StarlarkFile(
- this.statements.subList(i, j),
- this.options,
- /*comments=*/ ImmutableList.of(),
- errors,
- /*contentHashCode=*/ null,
- (Lexer.LexerLocation) this.statements.get(i).getStartLocation());
+ StarlarkFile file =
+ new StarlarkFile(
+ this.statements.subList(i, j),
+ this.options,
+ /*comments=*/ ImmutableList.of(),
+ errors,
+ /*contentHashCode=*/ null);
+ file.lnt = this.lnt;
+ return file;
}
/**
@@ -121,7 +134,7 @@
stmts.addAll(prelude);
stmts.addAll(result.statements);
- return create(stmts.build(), options, result, /*contentHashCode=*/ null);
+ return create(result.lnt, stmts.build(), options, result, /*contentHashCode=*/ null);
}
// TODO(adonovan): make the digest publicly settable, and delete this.
@@ -129,6 +142,7 @@
throws IOException {
Parser.ParseResult result = Parser.parseFile(input, options);
return create(
+ result.lnt,
ImmutableList.copyOf(result.statements),
options,
result,
@@ -152,7 +166,11 @@
public static StarlarkFile parse(ParserInput input, FileOptions options) {
Parser.ParseResult result = Parser.parseFile(input, options);
return create(
- ImmutableList.copyOf(result.statements), options, result, /*contentHashCode=*/ null);
+ result.lnt,
+ ImmutableList.copyOf(result.statements),
+ options,
+ result,
+ /*contentHashCode=*/ null);
}
/** Parse a Starlark file with default options. */