Stop allocating new tokens in the lexer
There's only one Token and it gets reused.
This reduces the memory usage of the lexer. Parsing time seems to be 5%-10%
faster with this change on a large file. This makes little difference on the
overall performance of Bazel though.
RELNOTES: None.
PiperOrigin-RevId: 199310860
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
index 91384f0..ed7bd4c 100644
--- a/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
+++ b/src/main/java/com/google/devtools/build/lib/syntax/Lexer.java
@@ -81,8 +81,12 @@
// bottom.
private final Stack<Integer> indentStack = new Stack<>();
- /** Token to return */
- private Token token;
+ /**
+ * Token to return. This token is mutated in-place. Its kind is set to
+ * null to indicate the intermediate state, where the new token has not
+ * been scanned yet.
+ */
+ private final Token token;
private final List<Comment> comments;
@@ -112,6 +116,7 @@
this.checkIndentation = true;
this.comments = new ArrayList<>();
this.dents = 0;
+ this.token = new Token(null, -1, -1);
indentStack.push(0);
}
@@ -146,9 +151,10 @@
* after EOF has been returned.
*/
public Token nextToken() {
- boolean afterNewline = token != null && token.kind == TokenKind.NEWLINE;
- token = null;
+ boolean afterNewline = token.kind == TokenKind.NEWLINE;
+ token.kind = null;
tokenize();
+ Preconditions.checkState(token.kind != null);
// Like Python, always end with a NEWLINE token, even if no '\n' in input:
if (token.kind == TokenKind.EOF && !afterNewline) {
@@ -226,9 +232,20 @@
}
/** invariant: symbol positions are half-open intervals. */
- private void setToken(Token s) {
- Preconditions.checkState(token == null);
- token = s;
+ private void setToken(TokenKind kind, int left, int right) {
+ Preconditions.checkState(token.kind == null);
+ token.kind = kind;
+ token.left = left;
+ token.right = right;
+ token.value = null;
+ }
+
+ private void setToken(TokenKind kind, int left, int right, Object value) {
+ Preconditions.checkState(token.kind == null);
+ token.kind = kind;
+ token.left = left;
+ token.right = right;
+ token.value = value;
}
/**
@@ -241,7 +258,7 @@
newlineInsideExpression(); // in an expression: ignore space
} else {
checkIndentation = true;
- setToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
+ setToken(TokenKind.NEWLINE, pos - 1, pos);
}
}
@@ -330,7 +347,7 @@
*
* @return the string-literal token.
*/
- private Token escapedStringLiteral(char quot, boolean isRaw) {
+ private void escapedStringLiteral(char quot, boolean isRaw) {
int literalStartPos = isRaw ? pos - 2 : pos - 1;
boolean inTriplequote = skipTripleQuote(quot);
// more expensive second choice that expands escaped into a buffer
@@ -345,12 +362,14 @@
break;
} else {
error("unterminated string literal at eol", literalStartPos, pos);
- return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ return;
}
case '\\':
if (pos == buffer.length) {
error("unterminated string literal at eof", literalStartPos, pos);
- return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ return;
}
if (isRaw) {
// Insert \ and the following character.
@@ -454,7 +473,8 @@
literal.append(c);
} else {
// Matching close-delimiter, all done.
- return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ return;
}
break;
default:
@@ -463,7 +483,7 @@
}
}
error("unterminated string literal at eof", literalStartPos, pos);
- return new Token(TokenKind.STRING, literalStartPos, pos, literal.toString());
+ setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
}
/**
@@ -477,14 +497,15 @@
* @param isRaw if true, do not escape the string.
* @return the string-literal token.
*/
- private Token stringLiteral(char quot, boolean isRaw) {
+ private void stringLiteral(char quot, boolean isRaw) {
int literalStartPos = isRaw ? pos - 2 : pos - 1;
int contentStartPos = pos;
// Don't even attempt to parse triple-quotes here.
if (skipTripleQuote(quot)) {
pos -= 2;
- return escapedStringLiteral(quot, isRaw);
+ escapedStringLiteral(quot, isRaw);
+ return;
}
// first quick optimistic scan for a simple non-escaped string
@@ -493,17 +514,16 @@
switch (c) {
case '\n':
error("unterminated string literal at eol", literalStartPos, pos);
- Token t =
- new Token(
- TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
- return t;
+ setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
+ return;
case '\\':
if (isRaw) {
if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
// There was a CRLF after the newline. No shortcut possible, since it needs to be
// transformed into a single LF.
pos = contentStartPos;
- return escapedStringLiteral(quot, true);
+ escapedStringLiteral(quot, true);
+ return;
} else {
pos++;
break;
@@ -511,13 +531,15 @@
}
// oops, hit an escape, need to start over & build a new string buffer
pos = contentStartPos;
- return escapedStringLiteral(quot, false);
+ escapedStringLiteral(quot, false);
+ return;
case '\'':
case '"':
if (c == quot) {
// close-quote, all done.
- return new Token(
+ setToken(
TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
+ return;
}
break;
default: // fall out
@@ -531,7 +553,7 @@
}
error("unterminated string literal at eof", literalStartPos, pos);
- return new Token(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos));
+ setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos));
}
private static final Map<String, TokenKind> keywordMap = new HashMap<>();
@@ -578,13 +600,15 @@
*
* @return the identifier or keyword token.
*/
- private Token identifierOrKeyword() {
+ private void identifierOrKeyword() {
int oldPos = pos - 1;
String id = scanIdentifier();
TokenKind kind = keywordMap.get(id);
- return (kind == null)
- ? new Token(TokenKind.IDENTIFIER, oldPos, pos, id)
- : new Token(kind, oldPos, pos, null);
+ if (kind == null) {
+ setToken(TokenKind.IDENTIFIER, oldPos, pos, id);
+ } else {
+ setToken(kind, oldPos, pos, null);
+ }
}
private String scanIdentifier() {
@@ -649,10 +673,8 @@
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal.
* ON EXIT: 'pos' is 1 + the index of the last char in the literal.
- *
- * @return the integer token.
*/
- private Token integer() {
+ private void integer() {
int oldPos = pos - 1;
String literal = scanInteger();
@@ -679,7 +701,7 @@
error("invalid base-" + radix + " integer constant: " + literal);
}
- return new Token(TokenKind.INT, oldPos, pos, value);
+ setToken(TokenKind.INT, oldPos, pos, value);
}
/**
@@ -701,7 +723,7 @@
if (tok == null) {
return false;
} else {
- setToken(new Token(tok, pos, pos + 2));
+ setToken(tok, pos, pos + 2);
return true;
}
}
@@ -712,8 +734,8 @@
}
/**
- * Performs tokenization of the character buffer of file contents provided to the constructor.
- * Advances pos and sets the token variable.
+ * Performs tokenization of the character buffer of file contents provided to the constructor. At
+ * least one token will be added to the tokens queue.
*/
private void tokenize() {
if (checkIndentation) {
@@ -725,10 +747,10 @@
if (dents != 0) {
if (dents < 0) {
dents++;
- setToken(new Token(TokenKind.OUTDENT, pos - 1, pos));
+ setToken(TokenKind.OUTDENT, pos - 1, pos);
} else {
dents--;
- setToken(new Token(TokenKind.INDENT, pos - 1, pos));
+ setToken(TokenKind.INDENT, pos - 1, pos);
}
return;
}
@@ -742,94 +764,94 @@
pos++;
switch (c) {
case '{': {
- setToken(new Token(TokenKind.LBRACE, pos - 1, pos));
+ setToken(TokenKind.LBRACE, pos - 1, pos);
openParenStackDepth++;
break;
}
case '}': {
- setToken(new Token(TokenKind.RBRACE, pos - 1, pos));
+ setToken(TokenKind.RBRACE, pos - 1, pos);
popParen();
break;
}
case '(': {
- setToken(new Token(TokenKind.LPAREN, pos - 1, pos));
+ setToken(TokenKind.LPAREN, pos - 1, pos);
openParenStackDepth++;
break;
}
case ')': {
- setToken(new Token(TokenKind.RPAREN, pos - 1, pos));
+ setToken(TokenKind.RPAREN, pos - 1, pos);
popParen();
break;
}
case '[': {
- setToken(new Token(TokenKind.LBRACKET, pos - 1, pos));
+ setToken(TokenKind.LBRACKET, pos - 1, pos);
openParenStackDepth++;
break;
}
case ']': {
- setToken(new Token(TokenKind.RBRACKET, pos - 1, pos));
+ setToken(TokenKind.RBRACKET, pos - 1, pos);
popParen();
break;
}
case '>': {
- setToken(new Token(TokenKind.GREATER, pos - 1, pos));
+ setToken(TokenKind.GREATER, pos - 1, pos);
break;
}
case '<': {
- setToken(new Token(TokenKind.LESS, pos - 1, pos));
+ setToken(TokenKind.LESS, pos - 1, pos);
break;
}
case ':': {
- setToken(new Token(TokenKind.COLON, pos - 1, pos));
+ setToken(TokenKind.COLON, pos - 1, pos);
break;
}
case ',': {
- setToken(new Token(TokenKind.COMMA, pos - 1, pos));
+ setToken(TokenKind.COMMA, pos - 1, pos);
break;
}
case '+': {
- setToken(new Token(TokenKind.PLUS, pos - 1, pos));
+ setToken(TokenKind.PLUS, pos - 1, pos);
break;
}
case '-': {
- setToken(new Token(TokenKind.MINUS, pos - 1, pos));
+ setToken(TokenKind.MINUS, pos - 1, pos);
break;
}
case '|': {
- setToken(new Token(TokenKind.PIPE, pos - 1, pos));
+ setToken(TokenKind.PIPE, pos - 1, pos);
break;
}
case '=': {
- setToken(new Token(TokenKind.EQUALS, pos - 1, pos));
+ setToken(TokenKind.EQUALS, pos - 1, pos);
break;
}
case '%': {
- setToken(new Token(TokenKind.PERCENT, pos - 1, pos));
+ setToken(TokenKind.PERCENT, pos - 1, pos);
break;
}
case '/': {
if (lookaheadIs(0, '/') && lookaheadIs(1, '=')) {
- setToken(new Token(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2));
+ setToken(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2);
pos += 2;
} else if (lookaheadIs(0, '/')) {
- setToken(new Token(TokenKind.SLASH_SLASH, pos - 1, pos + 1));
+ setToken(TokenKind.SLASH_SLASH, pos - 1, pos + 1);
pos += 1;
} else {
// /= is handled by tokenizeTwoChars.
- setToken(new Token(TokenKind.SLASH, pos - 1, pos));
+ setToken(TokenKind.SLASH, pos - 1, pos);
}
break;
}
case ';': {
- setToken(new Token(TokenKind.SEMI, pos - 1, pos));
+ setToken(TokenKind.SEMI, pos - 1, pos);
break;
}
case '.': {
- setToken(new Token(TokenKind.DOT, pos - 1, pos));
+ setToken(TokenKind.DOT, pos - 1, pos);
break;
}
case '*': {
- setToken(new Token(TokenKind.STAR, pos - 1, pos));
+ setToken(TokenKind.STAR, pos - 1, pos);
break;
}
case ' ':
@@ -845,7 +867,7 @@
} else if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
pos += 2; // skip the CRLF at the end of line
} else {
- setToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)));
+ setToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
}
break;
}
@@ -868,7 +890,7 @@
}
case '\'':
case '\"': {
- setToken(stringLiteral(c, false));
+ stringLiteral(c, false);
break;
}
default: {
@@ -877,27 +899,27 @@
&& (buffer[pos] == '\'' || buffer[pos] == '\"')) {
c = buffer[pos];
pos++;
- setToken(stringLiteral(c, true));
+ stringLiteral(c, true);
break;
}
if (c >= '0' && c <= '9') {
- setToken(integer());
+ integer();
} else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
- setToken(identifierOrKeyword());
+ identifierOrKeyword();
} else {
error("invalid character: '" + c + "'");
}
break;
} // default
} // switch
- if (token != null) { // stop here if we scanned a token
+ if (token.kind != null) { // stop here if we scanned a token
return;
}
} // while
if (indentStack.size() > 1) { // top of stack is always zero
- setToken(new Token(TokenKind.NEWLINE, pos - 1, pos));
+ setToken(TokenKind.NEWLINE, pos - 1, pos);
while (indentStack.size() > 1) {
indentStack.pop();
dents--;
@@ -905,7 +927,7 @@
return;
}
- setToken(new Token(TokenKind.EOF, pos, pos));
+ setToken(TokenKind.EOF, pos, pos);
}
/**
diff --git a/src/main/java/com/google/devtools/build/lib/syntax/Token.java b/src/main/java/com/google/devtools/build/lib/syntax/Token.java
index 3c99df5..733b538 100644
--- a/src/main/java/com/google/devtools/build/lib/syntax/Token.java
+++ b/src/main/java/com/google/devtools/build/lib/syntax/Token.java
@@ -22,14 +22,14 @@
class Token {
TokenKind kind;
- final int left;
- final int right;
+ int left;
+ int right;
/**
* value is an Integer if the kind is INT.
* It is a String if the kind is STRING, IDENTIFIER, or COMMENT.
* It is null otherwise.
*/
- @Nullable final Object value;
+ @Nullable Object value;
Token(TokenKind kind, int left, int right) {
this(kind, left, right, null);
@@ -42,6 +42,10 @@
this.value = value;
}
+ Token copy() {
+ return new Token(kind, left, right, value);
+ }
+
/**
* Constructs an easy-to-read string representation of token, suitable for use
* in user error messages.
diff --git a/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java b/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java
index ccb968b..379d955 100644
--- a/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java
+++ b/src/test/java/com/google/devtools/build/lib/syntax/LexerTest.java
@@ -63,7 +63,7 @@
Token tok;
do {
tok = lexer.nextToken();
- result.add(tok);
+ result.add(tok.copy());
} while (tok.kind != TokenKind.EOF);
return result;
}