| /* |
| * Copyright 2016 The Bazel Authors. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.google.idea.blaze.base.lang.buildfile.lexer; |
| |
| import com.google.common.collect.ImmutableMap; |
| import com.google.common.collect.Lists; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Stack; |
| import javax.annotation.Nullable; |
| |
| /** |
| * A tokenizer for the BUILD language. |
| * |
| * <p>Copied from blaze/bazel's lexer. The differences are: 1. Blaze's lexer isn't 'faithful', in |
| * that it reorders characters, skips characters, and adds ghost characters. We can't do that, |
| * because we need to match the editor's view of the document. 2. Blaze's lexer only lexes entire |
| * files (it can't incrementally lex part of a file, starting from a given indent stack depth). |
| */ |
| public class BuildLexerBase { |
| |
  /**
   * When tokenizing for the purposes of parsing, we handle indentation.<br>
   * For syntax highlighting, we need to tokenize every character, and don't care about indentation.
   */
  public enum LexerMode {
    // Emits NEWLINE/INDENT/DEDENT bookkeeping tokens for the parser.
    Parsing,
    // Emits a token covering every character so the editor's view stays in sync.
    SyntaxHighlighting
  }
| |
  // Characters that can come immediately prior to an '=' character to generate
  // a different token (e.g. '!' + '=' lexes as a single NOT_EQUALS token).
  private static final Map<Character, TokenKind> EQUAL_TOKENS =
      ImmutableMap.<Character, TokenKind>builder()
          .put('=', TokenKind.EQUALS_EQUALS)
          .put('!', TokenKind.NOT_EQUALS)
          .put('>', TokenKind.GREATER_EQUALS)
          .put('<', TokenKind.LESS_EQUALS)
          .put('+', TokenKind.PLUS_EQUALS)
          .put('-', TokenKind.MINUS_EQUALS)
          .put('*', TokenKind.STAR_EQUALS)
          .put('/', TokenKind.SLASH_EQUALS)
          .put('%', TokenKind.PERCENT_EQUALS)
          .build();
| |
  // Whether we lex for the parser (indentation-aware) or for syntax highlighting.
  private final LexerMode mode;

  // Input buffer and position
  private final char[] buffer;
  private int pos; // index of the next unread char in 'buffer'

  // All tokens produced so far, in source order.
  private final List<Token> tokens;

  // The number of unclosed open-parens ("(", '{', '[') at the current point in
  // the stream. Whitespace is handled differently when this is nonzero.
  private int openParenStackDepth = 0;

  // The stack of enclosing indentation levels; always contains '0' at the
  // bottom.
  private final Stack<Integer> indentStack = new Stack<>();

  // Set when any lexing error is encountered; never cleared afterwards.
  private boolean containsErrors;
| |
  /**
   * Constructs a lexer which eagerly tokenizes the given input; the results are available via
   * {@link #getTokens()}. Errors are recorded via {@link #error(String, int, int)}.
   *
   * @param input the text to tokenize
   * @param initialStackDepth the number of already-open parens/brackets/braces at the start of
   *     'input', allowing a fragment of a larger file to be lexed incrementally
   * @param mode whether to tokenize for parsing or for syntax highlighting
   */
  public BuildLexerBase(CharSequence input, int initialStackDepth, LexerMode mode) {
    this.buffer = input.toString().toCharArray();
    // Empirical measurements show roughly 1 token per 8 characters in buffer.
    this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8);
    this.pos = 0;
    this.openParenStackDepth = initialStackDepth;
    this.mode = mode;

    // The bottom indentation level is always 0; tokenization happens here, in the constructor.
    indentStack.push(0);
    tokenize();
  }
| |
  /** The number of unclosed open-parens ("(", '{', '[') at the end of this string. */
  public int getOpenParenStackDepth() {
    return openParenStackDepth;
  }
| |
  /**
   * Returns true if there were errors during scanning of this input file or string. The
   * BuildLexerBase may attempt to recover from errors, but clients should not rely on the results
   * of scanning if this flag is set.
   */
  public boolean containsErrors() {
    return containsErrors;
  }
| |
  /**
   * Returns the (mutable) list of tokens generated by the BuildLexerBase.
   *
   * <p>Note this is the internal list, not a copy; callers that mutate it affect this lexer.
   */
  public List<Token> getTokens() {
    return tokens;
  }
| |
| private void popParen() { |
| if (openParenStackDepth == 0) { |
| error("indentation error"); |
| } else { |
| openParenStackDepth--; |
| } |
| } |
| |
  /** Reports an error at the position of the most recently consumed character. */
  private void error(String message) {
    error(message, pos - 1, pos - 1);
  }

  /**
   * Reports a lexing error over the (half-open) range [start, end).
   *
   * <p>Only the error flag is recorded here; the message and range parameters are unused.
   * NOTE(review): presumably subclasses override this to surface the message -- confirm against
   * subclasses (the method is 'protected').
   */
  protected void error(String message, int start, int end) {
    this.containsErrors = true;
  }
| |
  /** invariant: symbol positions are half-open intervals. */
  private void addToken(TokenKind kind, int left, int right) {
    addToken(kind, left, right, null);
  }

  /** Appends a token spanning [left, right) carrying an optional semantic value. */
  private void addToken(TokenKind kind, int left, int right, @Nullable Object value) {
    tokens.add(new Token(kind, left, right, value));
  }
| |
| /** |
| * Parses an end-of-line sequence, handling statement indentation correctly. |
| * |
| * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored. |
| * |
| * <p>ON ENTRY: 'pos' is the index of the char after '\n'. ON EXIT: 'pos' is the index of the next |
| * non-space char after '\n'. |
| */ |
| protected void newline() { |
| if (mode == LexerMode.SyntaxHighlighting) { |
| addToken(TokenKind.NEWLINE, pos - 1, pos); |
| return; |
| } |
| if (openParenStackDepth > 0) { |
| newlineInsideExpression(); // in an expression: ignore space |
| } else { |
| newlineOutsideExpression(); // generate NEWLINE/INDENT/DEDENT tokens |
| } |
| } |
| |
| private void newlineInsideExpression() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case ' ': |
| case '\t': |
| case '\r': |
| pos++; |
| break; |
| default: |
| // ignored by the parser |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| return; |
| } |
| } |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| } |
| |
  /**
   * Handle INDENT and DEDENT within statements.
   *
   * <p>Note these tokens have zero length -- this is because we can have an arbitrary number of
   * dedents squeezed into some number of characters, and the size of all the lexical elements must
   * match the number of characters in the file.
   */
  private void newlineOutsideExpression() {
    int oldPos = pos - 1;
    if (pos > 1) { // skip over newline at start of file
      addToken(TokenKind.NEWLINE, oldPos, pos);
      oldPos = pos;
    }

    // we're in a stmt: suck up space at beginning of next line
    int indentLen = 0;
    while (pos < buffer.length) {
      char c = buffer[pos];
      if (c == ' ') {
        indentLen++;
        pos++;
      } else if (c == '\t') {
        // A tab advances the indentation to the next multiple of 8.
        indentLen += 8 - indentLen % 8;
        pos++;
      } else if (c == '\n') { // entirely blank line: ignore
        indentLen = 0;
        pos++;
      } else if (c == '#') { // line containing only indented comment
        // Flush pending whitespace so the COMMENT token starts exactly at '#'.
        if (oldPos != pos) {
          addToken(TokenKind.WHITESPACE, oldPos, pos);
          oldPos = pos;
        }
        // Consume up to and including the terminating '\n' (or to EOF).
        while (pos < buffer.length && c != '\n') {
          c = buffer[pos++];
        }
        // The COMMENT token excludes the final consumed char (normally the '\n',
        // which is folded into the following WHITESPACE run instead).
        // NOTE(review): when the comment is terminated by EOF rather than '\n', this also
        // excludes the comment's last character, which then lands in the trailing
        // WHITESPACE token -- confirm whether that is intended.
        addToken(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1));
        oldPos = pos - 1;
        indentLen = 0;
      } else { // printing character
        break;
      }
    }

    if (oldPos != pos) {
      addToken(TokenKind.WHITESPACE, oldPos, pos);
    }
    if (pos == buffer.length) {
      indentLen = 0;
    } // trailing space on last line

    // Compare the new line's indentation to the enclosing levels and emit
    // zero-length INDENT/DEDENT tokens at the current position as needed.
    int peekedIndent = indentStack.peek();
    if (peekedIndent < indentLen) { // push a level
      indentStack.push(indentLen);
      addToken(TokenKind.INDENT, pos, pos);

    } else if (peekedIndent > indentLen) { // pop one or more levels
      while (peekedIndent > indentLen) {
        indentStack.pop();
        addToken(TokenKind.DEDENT, pos, pos);
        peekedIndent = indentStack.peek();
      }

      // The new indentation matched none of the enclosing levels.
      if (peekedIndent < indentLen) {
        error("indentation error");
      }
    }
  }
| |
| /** Collapse adjacent whitespace characters into a single token */ |
| private void addWhitespace() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case ' ': |
| case '\t': |
| case '\r': |
| pos++; |
| break; |
| default: |
| addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos)); |
| return; |
| } |
| } |
| addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos)); |
| } |
| |
| /** |
| * Returns true if current position is in the middle of a triple quote delimiter (3 x quot), and |
| * advances 'pos' by two if so. |
| */ |
| private boolean skipTripleQuote(char quot) { |
| if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) { |
| pos += 2; |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
  /**
   * Scans a string literal delimited by 'quot', containing escape sequences.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter ON EXIT: 'pos' is 1 + the index of
   * the last delimiter.
   *
   * <p>The STRING token's value is the unescaped text; its span covers the whole literal
   * including delimiters (and the leading 'r' for raw strings).
   */
  private void escapedStringLiteral(char quot, boolean isRaw) {
    // Token start: back up over the opening quote, plus the 'r' prefix if raw.
    int oldPos = isRaw ? pos - 2 : pos - 1;
    boolean inTripleQuote = skipTripleQuote(quot);

    // more expensive second choice that expands escaped into a buffer
    StringBuilder literal = new StringBuilder();
    while (pos < buffer.length) {
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '\n':
          if (inTripleQuote) {
            // Triple-quoted strings may span lines.
            literal.append(c);
            break;
          } else {
            // A bare newline ends a single-quoted literal with an error; the newline
            // itself is then handled by newline() so the token stream stays contiguous.
            error("unterminated string literal at eol", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            newline();
            return;
          }
        case '\\':
          if (pos == buffer.length) {
            // Trailing backslash at EOF: unterminated literal.
            error("unterminated string literal at eof", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            return;
          }
          if (isRaw) {
            // Insert \ and the following character.
            // As in Python, it means that a raw string can never end with a single \.
            literal.append('\\');
            literal.append(buffer[pos]);
            pos++;
            break;
          }
          c = buffer[pos];
          pos++;
          switch (c) {
            case '\n':
              // ignore end of line character
              break;
            case 'n':
              literal.append('\n');
              break;
            case 'r':
              literal.append('\r');
              break;
            case 't':
              literal.append('\t');
              break;
            case '\\':
              literal.append('\\');
              break;
            case '\'':
              literal.append('\'');
              break;
            case '"':
              literal.append('"');
              break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
              { // octal escape: up to three octal digits, value truncated to a byte
                int octal = c - '0';
                if (pos < buffer.length) {
                  c = buffer[pos];
                  if (c >= '0' && c <= '7') {
                    pos++;
                    octal = (octal << 3) | (c - '0');
                    if (pos < buffer.length) {
                      c = buffer[pos];
                      if (c >= '0' && c <= '7') {
                        pos++;
                        octal = (octal << 3) | (c - '0');
                      }
                    }
                  }
                }
                literal.append((char) (octal & 0xff));
                break;
              }
            case 'a':
            case 'b':
            case 'f':
            case 'N':
            case 'u':
            case 'U':
            case 'v':
            case 'x':
              // exists in Python but not implemented in Blaze => error
              error("escape sequence not implemented: \\" + c, oldPos, pos);
              break;
            default:
              // unknown char escape => "\literal"
              literal.append('\\');
              literal.append(c);
              break;
          }
          break;
        case '\'':
        case '"':
          if (c != quot || (inTripleQuote && !skipTripleQuote(quot))) {
            // Non-matching quote, treat it like a regular char.
            literal.append(c);
          } else {
            // Matching close-delimiter, all done.
            addToken(TokenKind.STRING, oldPos, pos, literal.toString());
            return;
          }
          break;
        default:
          literal.append(c);
          break;
      }
    }
    // Ran off the end of the buffer without seeing the close-delimiter.
    error("unterminated string literal at eof", oldPos, pos);
    addToken(TokenKind.STRING, oldPos, pos, literal.toString());
  }
| |
| /** |
| * Scans a string literal delimited by 'quot'. |
| * |
| * <ul> |
| * <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter |
| * <li> ON EXIT: 'pos' is 1 + the index of the last delimiter. |
| * </ul> |
| * |
| * @param isRaw if true, do not escape the string. |
| */ |
| private void addStringLiteral(char quot, boolean isRaw) { |
| int oldPos = isRaw ? pos - 2 : pos - 1; |
| int start = pos; |
| |
| // Don't even attempt to parse triple-quotes here. |
| if (skipTripleQuote(quot)) { |
| pos -= 2; |
| escapedStringLiteral(quot, isRaw); |
| return; |
| } |
| |
| // first quick optimistic scan for a simple non-escaped string |
| while (pos < buffer.length) { |
| char c = buffer[pos++]; |
| switch (c) { |
| case '\n': |
| error("unterminated string literal at eol", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos - 1, bufferSlice(start, pos - 1)); |
| newline(); |
| return; |
| case '\\': |
| if (isRaw) { |
| // skip the next character |
| pos++; |
| break; |
| } |
| // oops, hit an escape, need to start over & build a new string buffer |
| pos = oldPos + 1; |
| escapedStringLiteral(quot, false); |
| return; |
| case '\'': |
| case '"': |
| if (c == quot) { |
| // close-quote, all done. |
| addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos - 1)); |
| return; |
| } |
| } |
| } |
| |
| error("unterminated string literal at eof", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos)); |
| } |
| |
  // Maps each keyword's string form (e.g. "def") to its TokenKind.
  private static final ImmutableMap<String, TokenKind> KEYWORD_MAP = createKeywordMap();

  /** Builds the keyword lookup table from TokenKind.KEYWORDS. */
  private static ImmutableMap<String, TokenKind> createKeywordMap() {
    ImmutableMap.Builder<String, TokenKind> builder = ImmutableMap.builder();
    for (TokenKind kind : TokenKind.KEYWORDS) {
      builder.put(kind.toString(), kind);
    }
    return builder.build();
  }
| |
| private TokenKind getTokenKindForIdentfier(String id) { |
| TokenKind kind = KEYWORD_MAP.get(id); |
| return kind == null ? TokenKind.IDENTIFIER : kind; |
| } |
| |
| private String scanIdentifier() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case '_': |
| case 'a': |
| case 'b': |
| case 'c': |
| case 'd': |
| case 'e': |
| case 'f': |
| case 'g': |
| case 'h': |
| case 'i': |
| case 'j': |
| case 'k': |
| case 'l': |
| case 'm': |
| case 'n': |
| case 'o': |
| case 'p': |
| case 'q': |
| case 'r': |
| case 's': |
| case 't': |
| case 'u': |
| case 'v': |
| case 'w': |
| case 'x': |
| case 'y': |
| case 'z': |
| case 'A': |
| case 'B': |
| case 'C': |
| case 'D': |
| case 'E': |
| case 'F': |
| case 'G': |
| case 'H': |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'O': |
| case 'P': |
| case 'Q': |
| case 'R': |
| case 'S': |
| case 'T': |
| case 'U': |
| case 'V': |
| case 'W': |
| case 'X': |
| case 'Y': |
| case 'Z': |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| return bufferSlice(oldPos, pos); |
| } |
| |
| /** |
| * Scans an identifier or keyword. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. ON EXIT: 'pos' is 1 + |
| * the index of the last char in the identifier. |
| * |
| * @return the identifier or keyword token. |
| */ |
| private void addIdentifierOrKeyword() { |
| int oldPos = pos - 1; |
| String id = scanIdentifier(); |
| TokenKind kind = getTokenKindForIdentfier(id); |
| addToken(kind, oldPos, pos, (kind == TokenKind.IDENTIFIER) ? id : null); |
| } |
| |
| private String scanInteger() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| switch (c) { |
| case 'X': |
| case 'x': |
| case 'a': |
| case 'A': |
| case 'b': |
| case 'B': |
| case 'c': |
| case 'C': |
| case 'd': |
| case 'D': |
| case 'e': |
| case 'E': |
| case 'f': |
| case 'F': |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| return bufferSlice(oldPos, pos); |
| } |
| |
  /**
   * Scans an integer literal.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. ON EXIT: 'pos' is 1 + the
   * index of the last char in the literal.
   *
   * <p>Supports decimal, hexadecimal ("0x"/"0X") and octal (leading "0") notation. An unparseable
   * literal is reported as an error and tokenized with value 0.
   */
  private void addInteger() {
    int oldPos = pos - 1;
    String literal = scanInteger();

    final String substring;
    final int radix;
    if (literal.startsWith("0x") || literal.startsWith("0X")) {
      radix = 16;
      substring = literal.substring(2);
    } else if (literal.startsWith("0") && literal.length() > 1) {
      // A leading zero (e.g. "0755") selects octal, as in Python 2.
      radix = 8;
      substring = literal.substring(1);
    } else {
      radix = 10;
      substring = literal;
    }

    int value = 0;
    try {
      value = Integer.parseInt(substring, radix);
    } catch (NumberFormatException e) {
      // Keep value 0 and record the error; lexing continues.
      error("invalid base-" + radix + " integer constant: " + literal);
    }

    addToken(TokenKind.INT, oldPos, pos, value);
  }
| |
| /** |
| * Tokenizes a two-char operator. |
| * |
| * @return true if it tokenized an operator |
| */ |
| private boolean tokenizeTwoChars() { |
| if (pos + 2 >= buffer.length) { |
| return false; |
| } |
| char c1 = buffer[pos]; |
| char c2 = buffer[pos + 1]; |
| TokenKind tok = null; |
| if (c2 == '=') { |
| tok = EQUAL_TOKENS.get(c1); |
| } else if (c2 == '*' && c1 == '*') { |
| tok = TokenKind.STAR_STAR; |
| } |
| if (tok == null) { |
| return false; |
| } |
| addToken(tok, pos, pos + 2); |
| return true; |
| } |
| |
| /** Performs tokenization of the character buffer of file contents provided to the constructor. */ |
| private void tokenize() { |
| while (pos < buffer.length) { |
| if (tokenizeTwoChars()) { |
| pos += 2; |
| continue; |
| } |
| char c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '{': |
| { |
| addToken(TokenKind.LBRACE, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case '}': |
| { |
| addToken(TokenKind.RBRACE, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '(': |
| { |
| addToken(TokenKind.LPAREN, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case ')': |
| { |
| addToken(TokenKind.RPAREN, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '[': |
| { |
| addToken(TokenKind.LBRACKET, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case ']': |
| { |
| addToken(TokenKind.RBRACKET, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '>': |
| { |
| addToken(TokenKind.GREATER, pos - 1, pos); |
| break; |
| } |
| case '<': |
| { |
| addToken(TokenKind.LESS, pos - 1, pos); |
| break; |
| } |
| case ':': |
| { |
| addToken(TokenKind.COLON, pos - 1, pos); |
| break; |
| } |
| case ',': |
| { |
| addToken(TokenKind.COMMA, pos - 1, pos); |
| break; |
| } |
| case '+': |
| { |
| addToken(TokenKind.PLUS, pos - 1, pos); |
| break; |
| } |
| case '-': |
| { |
| addToken(TokenKind.MINUS, pos - 1, pos); |
| break; |
| } |
| case '|': |
| { |
| addToken(TokenKind.PIPE, pos - 1, pos); |
| break; |
| } |
| case '=': |
| { |
| addToken(TokenKind.EQUALS, pos - 1, pos); |
| break; |
| } |
| case '%': |
| { |
| addToken(TokenKind.PERCENT, pos - 1, pos); |
| break; |
| } |
| case '/': |
| { |
| addToken(TokenKind.SLASH, pos - 1, pos); |
| break; |
| } |
| case ';': |
| { |
| addToken(TokenKind.SEMI, pos - 1, pos); |
| break; |
| } |
| case '.': |
| { |
| addToken(TokenKind.DOT, pos - 1, pos); |
| break; |
| } |
| case '*': |
| { |
| addToken(TokenKind.STAR, pos - 1, pos); |
| break; |
| } |
| case ' ': |
| case '\t': |
| case '\r': |
| { |
| addWhitespace(); |
| break; |
| } |
| case '\\': |
| { |
| // Backslash character is valid only at the end of a line (or in a string) |
| if (pos + 1 < buffer.length && buffer[pos] == '\n') { |
| // treat end of line backslash and newline char as whitespace |
| // (they're ignored by the parser) |
| pos++; |
| addToken(TokenKind.WHITESPACE, pos - 2, pos, Character.toString(c)); |
| } else { |
| addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)); |
| } |
| break; |
| } |
| case '\n': |
| { |
| newline(); |
| break; |
| } |
| case '#': |
| { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c == '\n') { |
| break; |
| } else { |
| pos++; |
| } |
| } |
| addToken(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos)); |
| break; |
| } |
| case '\'': |
| case '\"': |
| { |
| addStringLiteral(c, false); |
| break; |
| } |
| default: |
| { |
| // detect raw strings, e.g. r"str" |
| if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) { |
| c = buffer[pos]; |
| pos++; |
| addStringLiteral(c, true); |
| break; |
| } |
| |
| if (Character.isDigit(c)) { |
| addInteger(); |
| } else if (Character.isJavaIdentifierStart(c) && c != '$') { |
| addIdentifierOrKeyword(); |
| } else { |
| // Some characters in Python are not recognized in Blaze syntax (e.g. '!') |
| addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)); |
| error("invalid character: '" + c + "'"); |
| } |
| break; |
| } // default |
| } // switch |
| } // while |
| } |
| |
| /** |
| * Returns parts of the source buffer based on offsets |
| * |
| * @param start the beginning offset for the slice |
| * @param end the offset immediately following the slice |
| * @return the text at offset start with length end - start |
| */ |
| private String bufferSlice(int start, int end) { |
| return new String(this.buffer, start, end - start); |
| } |
| } |