blob: 49702815d91c22f881efa16d987143b6e1aee6e8 [file] [log] [blame]
/*
* Copyright 2016 The Bazel Authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.idea.blaze.base.lang.buildfile.lexer;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import javax.annotation.Nullable;
/**
* A tokenizer for the BUILD language.
*
* <p>Copied from blaze/bazel's lexer. The differences are: 1. Blaze's lexer isn't 'faithful', in
* that it reorders characters, skips characters, and adds ghost characters. We can't do that,
* because we need to match the editor's view of the document. 2. Blaze's lexer only lexes entire
* files (it can't incrementally lex part of a file, starting from a given indent stack depth).
*/
public class BuildLexerBase {
  /**
   * When tokenizing for the purposes of parsing, we handle indentation.<br>
   * For syntax highlighting, we need to tokenize every character, and don't care about indentation.
   */
  public enum LexerMode {
    Parsing,
    SyntaxHighlighting
  }

  // Characters that can come immediately prior to an '=' character to generate
  // a different token (e.g. '<' + '=' lexes as a single LESS_EQUALS token).
  private static final Map<Character, TokenKind> EQUAL_TOKENS =
      ImmutableMap.<Character, TokenKind>builder()
          .put('=', TokenKind.EQUALS_EQUALS)
          .put('!', TokenKind.NOT_EQUALS)
          .put('>', TokenKind.GREATER_EQUALS)
          .put('<', TokenKind.LESS_EQUALS)
          .put('+', TokenKind.PLUS_EQUALS)
          .put('-', TokenKind.MINUS_EQUALS)
          .put('*', TokenKind.STAR_EQUALS)
          .put('/', TokenKind.SLASH_EQUALS)
          .put('%', TokenKind.PERCENT_EQUALS)
          .build();

  // Controls whether indentation is tracked (Parsing) or ignored (SyntaxHighlighting).
  private final LexerMode mode;

  // Input buffer and position
  private final char[] buffer;
  private int pos;

  // Output token stream, in source order; populated once by tokenize().
  private final List<Token> tokens;

  // The number of unclosed open-parens ("(", '{', '[') at the current point in
  // the stream. Whitespace is handled differently when this is nonzero.
  private int openParenStackDepth = 0;

  // The stack of enclosing indentation levels; always contains '0' at the
  // bottom.
  private final Stack<Integer> indentStack = new Stack<>();

  // Set to true by error(); see containsErrors().
  private boolean containsErrors;
/**
* Constructs a lexer which tokenizes the contents of the specified InputBuffer. Any errors during
* lexing are reported on "handler".
*/
public BuildLexerBase(CharSequence input, int initialStackDepth, LexerMode mode) {
this.buffer = input.toString().toCharArray();
// Empirical measurements show roughly 1 token per 8 characters in buffer.
this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8);
this.pos = 0;
this.openParenStackDepth = initialStackDepth;
this.mode = mode;
indentStack.push(0);
tokenize();
}
/** The number of unclosed open-parens ("(", '{', '[') at the end of this string. */
public int getOpenParenStackDepth() {
return openParenStackDepth;
}
/**
* Returns true if there were errors during scanning of this input file or string. The
* BuildLexerBase may attempt to recover from errors, but clients should not rely on the results
* of scanning if this flag is set.
*/
public boolean containsErrors() {
return containsErrors;
}
/** Returns the (mutable) list of tokens generated by the BuildLexerBase. */
public List<Token> getTokens() {
return tokens;
}
private void popParen() {
if (openParenStackDepth == 0) {
error("indentation error");
} else {
openParenStackDepth--;
}
}
private void error(String message) {
error(message, pos - 1, pos - 1);
}
protected void error(String message, int start, int end) {
this.containsErrors = true;
}
/** invariant: symbol positions are half-open intervals. */
private void addToken(TokenKind kind, int left, int right) {
addToken(kind, left, right, null);
}
private void addToken(TokenKind kind, int left, int right, @Nullable Object value) {
tokens.add(new Token(kind, left, right, value));
}
/**
* Parses an end-of-line sequence, handling statement indentation correctly.
*
* <p>UNIX newlines are assumed (LF). Carriage returns are always ignored.
*
* <p>ON ENTRY: 'pos' is the index of the char after '\n'. ON EXIT: 'pos' is the index of the next
* non-space char after '\n'.
*/
protected void newline() {
if (mode == LexerMode.SyntaxHighlighting) {
addToken(TokenKind.NEWLINE, pos - 1, pos);
return;
}
if (openParenStackDepth > 0) {
newlineInsideExpression(); // in an expression: ignore space
} else {
newlineOutsideExpression(); // generate NEWLINE/INDENT/DEDENT tokens
}
}
private void newlineInsideExpression() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case ' ':
case '\t':
case '\r':
pos++;
break;
default:
// ignored by the parser
addToken(TokenKind.WHITESPACE, oldPos, pos);
return;
}
}
addToken(TokenKind.WHITESPACE, oldPos, pos);
}
  /**
   * Handle INDENT and DEDENT within statements.
   *
   * <p>Note these tokens have zero length -- this is because we can have an arbitrary number of
   * dedents squeezed into some number of characters, and the size of all the lexical elements must
   * match the number of characters in the file.
   */
  private void newlineOutsideExpression() {
    int oldPos = pos - 1;
    if (pos > 1) { // skip over newline at start of file
      addToken(TokenKind.NEWLINE, oldPos, pos);
      oldPos = pos;
    }
    // we're in a stmt: suck up space at beginning of next line
    int indentLen = 0;
    while (pos < buffer.length) {
      char c = buffer[pos];
      if (c == ' ') {
        indentLen++;
        pos++;
      } else if (c == '\t') {
        // Tabs advance to the next multiple-of-8 column, as in Python.
        indentLen += 8 - indentLen % 8;
        pos++;
      } else if (c == '\n') { // entirely blank line: ignore
        indentLen = 0;
        pos++;
      } else if (c == '#') { // line containing only indented comment
        // Flush any pending blanks before the comment as a WHITESPACE token.
        if (oldPos != pos) {
          addToken(TokenKind.WHITESPACE, oldPos, pos);
          oldPos = pos;
        }
        while (pos < buffer.length && c != '\n') {
          c = buffer[pos++];
        }
        // The 'pos - 1' excludes the terminating '\n' from the COMMENT token.
        // NOTE(review): if the comment runs to EOF without a '\n', this also clips the
        // comment's final character (it is later emitted as WHITESPACE) — confirm intended.
        addToken(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1));
        oldPos = pos - 1;
        indentLen = 0;
      } else { // printing character
        break;
      }
    }
    // Flush any remaining consumed blanks (e.g. the indent of the upcoming line).
    if (oldPos != pos) {
      addToken(TokenKind.WHITESPACE, oldPos, pos);
    }
    if (pos == buffer.length) {
      indentLen = 0;
    } // trailing space on last line
    // Compare the computed indent against the enclosing level and emit zero-length
    // INDENT/DEDENT tokens as needed.
    int peekedIndent = indentStack.peek();
    if (peekedIndent < indentLen) { // push a level
      indentStack.push(indentLen);
      addToken(TokenKind.INDENT, pos, pos);
    } else if (peekedIndent > indentLen) { // pop one or more levels
      while (peekedIndent > indentLen) {
        indentStack.pop();
        addToken(TokenKind.DEDENT, pos, pos);
        peekedIndent = indentStack.peek();
      }
      // The new indent must match some enclosing level exactly; otherwise it's an error.
      if (peekedIndent < indentLen) {
        error("indentation error");
      }
    }
  }
/** Collapse adjacent whitespace characters into a single token */
private void addWhitespace() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case ' ':
case '\t':
case '\r':
pos++;
break;
default:
addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
return;
}
}
addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
}
/**
* Returns true if current position is in the middle of a triple quote delimiter (3 x quot), and
* advances 'pos' by two if so.
*/
private boolean skipTripleQuote(char quot) {
if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) {
pos += 2;
return true;
} else {
return false;
}
}
  /**
   * Scans a string literal delimited by 'quot', containing escape sequences.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter ON EXIT: 'pos' is 1 + the index of
   * the last delimiter.
   */
  private void escapedStringLiteral(char quot, boolean isRaw) {
    // For raw strings the caller has already consumed the 'r' prefix as well as the quote.
    int oldPos = isRaw ? pos - 2 : pos - 1;
    boolean inTripleQuote = skipTripleQuote(quot);
    // more expensive second choice that expands escaped into a buffer
    StringBuilder literal = new StringBuilder();
    while (pos < buffer.length) {
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '\n':
          if (inTripleQuote) {
            // Literal newlines are part of a triple-quoted string's value.
            literal.append(c);
            break;
          } else {
            // Single-quoted strings may not span lines: emit what we have and recover.
            error("unterminated string literal at eol", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            newline();
            return;
          }
        case '\\':
          if (pos == buffer.length) {
            // Backslash is the last char of the buffer: nothing left to escape.
            error("unterminated string literal at eof", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            return;
          }
          if (isRaw) {
            // Insert \ and the following character.
            // As in Python, it means that a raw string can never end with a single \.
            literal.append('\\');
            literal.append(buffer[pos]);
            pos++;
            break;
          }
          c = buffer[pos];
          pos++;
          switch (c) {
            case '\n':
              // ignore end of line character
              break;
            case 'n':
              literal.append('\n');
              break;
            case 'r':
              literal.append('\r');
              break;
            case 't':
              literal.append('\t');
              break;
            case '\\':
              literal.append('\\');
              break;
            case '\'':
              literal.append('\'');
              break;
            case '"':
              literal.append('"');
              break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
              { // octal escape: up to three octal digits, truncated to one byte
                int octal = c - '0';
                if (pos < buffer.length) {
                  c = buffer[pos];
                  if (c >= '0' && c <= '7') {
                    pos++;
                    octal = (octal << 3) | (c - '0');
                    if (pos < buffer.length) {
                      c = buffer[pos];
                      if (c >= '0' && c <= '7') {
                        pos++;
                        octal = (octal << 3) | (c - '0');
                      }
                    }
                  }
                }
                literal.append((char) (octal & 0xff));
                break;
              }
            case 'a':
            case 'b':
            case 'f':
            case 'N':
            case 'u':
            case 'U':
            case 'v':
            case 'x':
              // exists in Python but not implemented in Blaze => error
              error("escape sequence not implemented: \\" + c, oldPos, pos);
              break;
            default:
              // unknown char escape => "\literal"
              literal.append('\\');
              literal.append(c);
              break;
          }
          break;
        case '\'':
        case '"':
          if (c != quot || (inTripleQuote && !skipTripleQuote(quot))) {
            // Non-matching quote, treat it like a regular char.
            literal.append(c);
          } else {
            // Matching close-delimiter, all done.
            addToken(TokenKind.STRING, oldPos, pos, literal.toString());
            return;
          }
          break;
        default:
          literal.append(c);
          break;
      }
    }
    // Ran off the end of the buffer without finding the close-delimiter.
    error("unterminated string literal at eof", oldPos, pos);
    addToken(TokenKind.STRING, oldPos, pos, literal.toString());
  }
/**
* Scans a string literal delimited by 'quot'.
*
* <ul>
* <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter
* <li> ON EXIT: 'pos' is 1 + the index of the last delimiter.
* </ul>
*
* @param isRaw if true, do not escape the string.
*/
private void addStringLiteral(char quot, boolean isRaw) {
int oldPos = isRaw ? pos - 2 : pos - 1;
int start = pos;
// Don't even attempt to parse triple-quotes here.
if (skipTripleQuote(quot)) {
pos -= 2;
escapedStringLiteral(quot, isRaw);
return;
}
// first quick optimistic scan for a simple non-escaped string
while (pos < buffer.length) {
char c = buffer[pos++];
switch (c) {
case '\n':
error("unterminated string literal at eol", oldPos, pos);
addToken(TokenKind.STRING, oldPos, pos - 1, bufferSlice(start, pos - 1));
newline();
return;
case '\\':
if (isRaw) {
// skip the next character
pos++;
break;
}
// oops, hit an escape, need to start over & build a new string buffer
pos = oldPos + 1;
escapedStringLiteral(quot, false);
return;
case '\'':
case '"':
if (c == quot) {
// close-quote, all done.
addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos - 1));
return;
}
}
}
error("unterminated string literal at eof", oldPos, pos);
addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos));
}
private static final ImmutableMap<String, TokenKind> KEYWORD_MAP = createKeywordMap();
private static ImmutableMap<String, TokenKind> createKeywordMap() {
ImmutableMap.Builder<String, TokenKind> builder = ImmutableMap.builder();
for (TokenKind kind : TokenKind.KEYWORDS) {
builder.put(kind.toString(), kind);
}
return builder.build();
}
private TokenKind getTokenKindForIdentfier(String id) {
TokenKind kind = KEYWORD_MAP.get(id);
return kind == null ? TokenKind.IDENTIFIER : kind;
}
private String scanIdentifier() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case '_':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
pos++;
break;
default:
return bufferSlice(oldPos, pos);
}
}
return bufferSlice(oldPos, pos);
}
/**
* Scans an identifier or keyword.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. ON EXIT: 'pos' is 1 +
* the index of the last char in the identifier.
*
* @return the identifier or keyword token.
*/
private void addIdentifierOrKeyword() {
int oldPos = pos - 1;
String id = scanIdentifier();
TokenKind kind = getTokenKindForIdentfier(id);
addToken(kind, oldPos, pos, (kind == TokenKind.IDENTIFIER) ? id : null);
}
private String scanInteger() {
int oldPos = pos - 1;
while (pos < buffer.length) {
char c = buffer[pos];
switch (c) {
case 'X':
case 'x':
case 'a':
case 'A':
case 'b':
case 'B':
case 'c':
case 'C':
case 'd':
case 'D':
case 'e':
case 'E':
case 'f':
case 'F':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
pos++;
break;
default:
return bufferSlice(oldPos, pos);
}
}
return bufferSlice(oldPos, pos);
}
/**
* Scans an addInteger literal.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. ON EXIT: 'pos' is 1 + the
* index of the last char in the literal.
*/
private void addInteger() {
int oldPos = pos - 1;
String literal = scanInteger();
final String substring;
final int radix;
if (literal.startsWith("0x") || literal.startsWith("0X")) {
radix = 16;
substring = literal.substring(2);
} else if (literal.startsWith("0") && literal.length() > 1) {
radix = 8;
substring = literal.substring(1);
} else {
radix = 10;
substring = literal;
}
int value = 0;
try {
value = Integer.parseInt(substring, radix);
} catch (NumberFormatException e) {
error("invalid base-" + radix + " integer constant: " + literal);
}
addToken(TokenKind.INT, oldPos, pos, value);
}
/**
* Tokenizes a two-char operator.
*
* @return true if it tokenized an operator
*/
private boolean tokenizeTwoChars() {
if (pos + 2 >= buffer.length) {
return false;
}
char c1 = buffer[pos];
char c2 = buffer[pos + 1];
TokenKind tok = null;
if (c2 == '=') {
tok = EQUAL_TOKENS.get(c1);
} else if (c2 == '*' && c1 == '*') {
tok = TokenKind.STAR_STAR;
}
if (tok == null) {
return false;
}
addToken(tok, pos, pos + 2);
return true;
}
/** Performs tokenization of the character buffer of file contents provided to the constructor. */
private void tokenize() {
while (pos < buffer.length) {
if (tokenizeTwoChars()) {
pos += 2;
continue;
}
char c = buffer[pos];
pos++;
switch (c) {
case '{':
{
addToken(TokenKind.LBRACE, pos - 1, pos);
openParenStackDepth++;
break;
}
case '}':
{
addToken(TokenKind.RBRACE, pos - 1, pos);
popParen();
break;
}
case '(':
{
addToken(TokenKind.LPAREN, pos - 1, pos);
openParenStackDepth++;
break;
}
case ')':
{
addToken(TokenKind.RPAREN, pos - 1, pos);
popParen();
break;
}
case '[':
{
addToken(TokenKind.LBRACKET, pos - 1, pos);
openParenStackDepth++;
break;
}
case ']':
{
addToken(TokenKind.RBRACKET, pos - 1, pos);
popParen();
break;
}
case '>':
{
addToken(TokenKind.GREATER, pos - 1, pos);
break;
}
case '<':
{
addToken(TokenKind.LESS, pos - 1, pos);
break;
}
case ':':
{
addToken(TokenKind.COLON, pos - 1, pos);
break;
}
case ',':
{
addToken(TokenKind.COMMA, pos - 1, pos);
break;
}
case '+':
{
addToken(TokenKind.PLUS, pos - 1, pos);
break;
}
case '-':
{
addToken(TokenKind.MINUS, pos - 1, pos);
break;
}
case '|':
{
addToken(TokenKind.PIPE, pos - 1, pos);
break;
}
case '=':
{
addToken(TokenKind.EQUALS, pos - 1, pos);
break;
}
case '%':
{
addToken(TokenKind.PERCENT, pos - 1, pos);
break;
}
case '/':
{
addToken(TokenKind.SLASH, pos - 1, pos);
break;
}
case ';':
{
addToken(TokenKind.SEMI, pos - 1, pos);
break;
}
case '.':
{
addToken(TokenKind.DOT, pos - 1, pos);
break;
}
case '*':
{
addToken(TokenKind.STAR, pos - 1, pos);
break;
}
case ' ':
case '\t':
case '\r':
{
addWhitespace();
break;
}
case '\\':
{
// Backslash character is valid only at the end of a line (or in a string)
if (pos + 1 < buffer.length && buffer[pos] == '\n') {
// treat end of line backslash and newline char as whitespace
// (they're ignored by the parser)
pos++;
addToken(TokenKind.WHITESPACE, pos - 2, pos, Character.toString(c));
} else {
addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
}
break;
}
case '\n':
{
newline();
break;
}
case '#':
{
int oldPos = pos - 1;
while (pos < buffer.length) {
c = buffer[pos];
if (c == '\n') {
break;
} else {
pos++;
}
}
addToken(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos));
break;
}
case '\'':
case '\"':
{
addStringLiteral(c, false);
break;
}
default:
{
// detect raw strings, e.g. r"str"
if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) {
c = buffer[pos];
pos++;
addStringLiteral(c, true);
break;
}
if (Character.isDigit(c)) {
addInteger();
} else if (Character.isJavaIdentifierStart(c) && c != '$') {
addIdentifierOrKeyword();
} else {
// Some characters in Python are not recognized in Blaze syntax (e.g. '!')
addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
error("invalid character: '" + c + "'");
}
break;
} // default
} // switch
} // while
}
/**
* Returns parts of the source buffer based on offsets
*
* @param start the beginning offset for the slice
* @param end the offset immediately following the slice
* @return the text at offset start with length end - start
*/
private String bufferSlice(int start, int end) {
return new String(this.buffer, start, end - start);
}
}