| /* |
| * Copyright 2016 The Bazel Authors. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.google.idea.blaze.base.lang.buildfile.lexer; |
| |
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import javax.annotation.Nullable;
| |
| /** |
| * A tokenizer for the BUILD language. |
| * |
| * Copied from blaze/bazel's lexer. The differences are: |
| * 1. Blaze's lexer isn't 'faithful', in that it reorders characters, skips characters, and adds |
| * ghost characters. We can't do that, because we need to match the editor's view of the document. |
| * 2. Blaze's lexer only lexes entire files (it can't incrementally lex part of a file, starting |
| * from a given indent stack depth). |
| */ |
| public class BuildLexerBase { |
| |
| /** |
| * When tokenizing for the purposes of parsing, we handle indentation.<br> |
| * For syntax highlighting, we need to tokenize every character, and don't care about indentation. |
| */ |
| public enum LexerMode { |
| Parsing, |
| SyntaxHighlighting |
| } |
| |
| // Characters that can come immediately prior to an '=' character to generate |
| // a different token |
| private static final Map<Character, TokenKind> EQUAL_TOKENS = |
| ImmutableMap.<Character, TokenKind>builder() |
| .put('=', TokenKind.EQUALS_EQUALS) |
| .put('!', TokenKind.NOT_EQUALS) |
| .put('>', TokenKind.GREATER_EQUALS) |
| .put('<', TokenKind.LESS_EQUALS) |
| .put('+', TokenKind.PLUS_EQUALS) |
| .put('-', TokenKind.MINUS_EQUALS) |
| .put('*', TokenKind.STAR_EQUALS) |
| .put('/', TokenKind.SLASH_EQUALS) |
| .put('%', TokenKind.PERCENT_EQUALS) |
| .build(); |
| |
| private final LexerMode mode; |
| |
| // Input buffer and position |
| private final char[] buffer; |
| private int pos; |
| |
| private final List<Token> tokens; |
| |
| // The number of unclosed open-parens ("(", '{', '[') at the current point in |
| // the stream. Whitespace is handled differently when this is nonzero. |
| private int openParenStackDepth = 0; |
| |
| // The stack of enclosing indentation levels; always contains '0' at the |
| // bottom. |
| private final Stack<Integer> indentStack = new Stack<>(); |
| |
| private boolean containsErrors; |
| |
| /** |
| * Constructs a lexer which tokenizes the contents of the specified |
| * InputBuffer. Any errors during lexing are reported on "handler". |
| */ |
| public BuildLexerBase(CharSequence input, int initialStackDepth, LexerMode mode) { |
| this.buffer = input.toString().toCharArray(); |
| // Empirical measurements show roughly 1 token per 8 characters in buffer. |
| this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8); |
| this.pos = 0; |
| this.openParenStackDepth = initialStackDepth; |
| this.mode = mode; |
| |
| indentStack.push(0); |
| tokenize(); |
| } |
| |
| /** |
| * The number of unclosed open-parens ("(", '{', '[') at the end of this string. |
| */ |
| public int getOpenParenStackDepth() { |
| return openParenStackDepth; |
| } |
| |
| /** |
| * Returns true if there were errors during scanning of this input file or |
| * string. The BuildLexerBase may attempt to recover from errors, but clients should |
| * not rely on the results of scanning if this flag is set. |
| */ |
| public boolean containsErrors() { |
| return containsErrors; |
| } |
| |
| /** |
| * Returns the (mutable) list of tokens generated by the BuildLexerBase. |
| */ |
| public List<Token> getTokens() { |
| return tokens; |
| } |
| |
| private void popParen() { |
| if (openParenStackDepth == 0) { |
| error("indentation error"); |
| } else { |
| openParenStackDepth--; |
| } |
| } |
| |
| private void error(String message) { |
| error(message, pos - 1, pos - 1); |
| } |
| |
| protected void error(String message, int start, int end) { |
| this.containsErrors = true; |
| } |
| |
| /** invariant: symbol positions are half-open intervals. */ |
| private void addToken(TokenKind kind, int left, int right) { |
| addToken(kind, left, right, null); |
| } |
| |
| private void addToken(TokenKind kind, int left, int right, @Nullable Object value) { |
| tokens.add(new Token(kind, left, right, value)); |
| } |
| |
| /** |
| * Parses an end-of-line sequence, handling statement indentation correctly. |
| * |
| * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored. |
| * |
| * <p>ON ENTRY: 'pos' is the index of the char after '\n'. |
| * ON EXIT: 'pos' is the index of the next non-space char after '\n'. |
| */ |
| protected void newline() { |
| if (mode == LexerMode.SyntaxHighlighting) { |
| addToken(TokenKind.NEWLINE, pos - 1, pos); |
| return; |
| } |
| if (openParenStackDepth > 0) { |
| newlineInsideExpression(); // in an expression: ignore space |
| } else { |
| newlineOutsideExpression(); // generate NEWLINE/INDENT/DEDENT tokens |
| } |
| } |
| |
| private void newlineInsideExpression() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case ' ': case '\t': case '\r': |
| pos++; |
| break; |
| default: |
| // ignored by the parser |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| return; |
| } |
| } |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| } |
| |
| /** |
| * Handle INDENT and DEDENT within statements.<p> |
| * Note these tokens have zero length -- this is because we can have an arbitrary number of |
| * dedents squeezed into some number of characters, and the size of all the lexical elements |
| * must match the number of characters in the file. |
| */ |
| private void newlineOutsideExpression() { |
| int oldPos = pos - 1; |
| if (pos > 1) { // skip over newline at start of file |
| addToken(TokenKind.NEWLINE, oldPos, pos); |
| oldPos = pos; |
| } |
| |
| // we're in a stmt: suck up space at beginning of next line |
| int indentLen = 0; |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| if (c == ' ') { |
| indentLen++; |
| pos++; |
| } else if (c == '\t') { |
| indentLen += 8 - indentLen % 8; |
| pos++; |
| } else if (c == '\n') { // entirely blank line: ignore |
| indentLen = 0; |
| pos++; |
| } else if (c == '#') { // line containing only indented comment |
| if (oldPos != pos) { |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| oldPos = pos; |
| } |
| while (pos < buffer.length && c != '\n') { |
| c = buffer[pos++]; |
| } |
| addToken(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1)); |
| oldPos = pos - 1; |
| indentLen = 0; |
| } else { // printing character |
| break; |
| } |
| } |
| |
| if (oldPos != pos) { |
| addToken(TokenKind.WHITESPACE, oldPos, pos); |
| } |
| if (pos == buffer.length) { |
| indentLen = 0; |
| } // trailing space on last line |
| |
| int peekedIndent = indentStack.peek(); |
| if (peekedIndent < indentLen) { // push a level |
| indentStack.push(indentLen); |
| addToken(TokenKind.INDENT, pos, pos); |
| |
| } else if (peekedIndent > indentLen) { // pop one or more levels |
| while (peekedIndent > indentLen) { |
| indentStack.pop(); |
| addToken(TokenKind.DEDENT, pos, pos); |
| peekedIndent = indentStack.peek(); |
| } |
| |
| if (peekedIndent < indentLen) { |
| error("indentation error"); |
| } |
| } |
| } |
| |
| /** |
| * Collapse adjacent whitespace characters into a single token |
| */ |
| private void addWhitespace() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case ' ': case '\t': case '\r': |
| pos++; |
| break; |
| default: |
| addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos)); |
| return; |
| } |
| } |
| addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos)); |
| } |
| |
| /** |
| * Returns true if current position is in the middle of a triple quote |
| * delimiter (3 x quot), and advances 'pos' by two if so. |
| */ |
| private boolean skipTripleQuote(char quot) { |
| if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) { |
| pos += 2; |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Scans a string literal delimited by 'quot', containing escape sequences. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter |
| * ON EXIT: 'pos' is 1 + the index of the last delimiter. |
| */ |
| private void escapedStringLiteral(char quot, boolean isRaw) { |
| int oldPos = isRaw ? pos - 2 : pos - 1; |
| boolean inTripleQuote = skipTripleQuote(quot); |
| |
| // more expensive second choice that expands escaped into a buffer |
| StringBuilder literal = new StringBuilder(); |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '\n': |
| if (inTripleQuote) { |
| literal.append(c); |
| break; |
| } else { |
| error("unterminated string literal at eol", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos, literal.toString()); |
| newline(); |
| return; |
| } |
| case '\\': |
| if (pos == buffer.length) { |
| error("unterminated string literal at eof", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos, literal.toString()); |
| return; |
| } |
| if (isRaw) { |
| // Insert \ and the following character. |
| // As in Python, it means that a raw string can never end with a single \. |
| literal.append('\\'); |
| literal.append(buffer[pos]); |
| pos++; |
| break; |
| } |
| c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '\n': |
| // ignore end of line character |
| break; |
| case 'n': |
| literal.append('\n'); |
| break; |
| case 'r': |
| literal.append('\r'); |
| break; |
| case 't': |
| literal.append('\t'); |
| break; |
| case '\\': |
| literal.append('\\'); |
| break; |
| case '\'': |
| literal.append('\''); |
| break; |
| case '"': |
| literal.append('"'); |
| break; |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': { // octal escape |
| int octal = c - '0'; |
| if (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c >= '0' && c <= '7') { |
| pos++; |
| octal = (octal << 3) | (c - '0'); |
| if (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c >= '0' && c <= '7') { |
| pos++; |
| octal = (octal << 3) | (c - '0'); |
| } |
| } |
| } |
| } |
| literal.append((char) (octal & 0xff)); |
| break; |
| } |
| case 'a': case 'b': case 'f': case 'N': case 'u': case 'U': case 'v': case 'x': |
| // exists in Python but not implemented in Blaze => error |
| error("escape sequence not implemented: \\" + c, oldPos, pos); |
| break; |
| default: |
| // unknown char escape => "\literal" |
| literal.append('\\'); |
| literal.append(c); |
| break; |
| } |
| break; |
| case '\'': |
| case '"': |
| if (c != quot || (inTripleQuote && !skipTripleQuote(quot))) { |
| // Non-matching quote, treat it like a regular char. |
| literal.append(c); |
| } else { |
| // Matching close-delimiter, all done. |
| addToken(TokenKind.STRING, oldPos, pos, literal.toString()); |
| return; |
| } |
| break; |
| default: |
| literal.append(c); |
| break; |
| } |
| } |
| error("unterminated string literal at eof", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos, literal.toString()); |
| } |
| |
| /** |
| * Scans a string literal delimited by 'quot'. |
| * |
| * <ul> |
| * <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter |
| * <li> ON EXIT: 'pos' is 1 + the index of the last delimiter. |
| * </ul> |
| * |
| * @param isRaw if true, do not escape the string. |
| */ |
| private void addStringLiteral(char quot, boolean isRaw) { |
| int oldPos = isRaw ? pos - 2 : pos - 1; |
| int start = pos; |
| |
| // Don't even attempt to parse triple-quotes here. |
| if (skipTripleQuote(quot)) { |
| pos -= 2; |
| escapedStringLiteral(quot, isRaw); |
| return; |
| } |
| |
| // first quick optimistic scan for a simple non-escaped string |
| while (pos < buffer.length) { |
| char c = buffer[pos++]; |
| switch (c) { |
| case '\n': |
| error("unterminated string literal at eol", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos - 1, bufferSlice(start, pos - 1)); |
| newline(); |
| return; |
| case '\\': |
| if (isRaw) { |
| // skip the next character |
| pos++; |
| break; |
| } |
| // oops, hit an escape, need to start over & build a new string buffer |
| pos = oldPos + 1; |
| escapedStringLiteral(quot, false); |
| return; |
| case '\'': |
| case '"': |
| if (c == quot) { |
| // close-quote, all done. |
| addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos - 1)); |
| return; |
| } |
| } |
| } |
| |
| error("unterminated string literal at eof", oldPos, pos); |
| addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos)); |
| } |
| |
| private static final ImmutableMap<String, TokenKind> KEYWORD_MAP = createKeywordMap(); |
| |
| private static ImmutableMap<String, TokenKind> createKeywordMap() { |
| ImmutableMap.Builder<String, TokenKind> builder = ImmutableMap.builder(); |
| for (TokenKind kind : TokenKind.KEYWORDS) { |
| builder.put(kind.toString(), kind); |
| } |
| return builder.build(); |
| } |
| |
| private TokenKind getTokenKindForIdentfier(String id) { |
| TokenKind kind = KEYWORD_MAP.get(id); |
| return kind == null ? TokenKind.IDENTIFIER : kind; |
| } |
| |
| private String scanIdentifier() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case '_': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| case '0': case '1': case '2': case '3': case '4': case '5': |
| case '6': case '7': case '8': case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| return bufferSlice(oldPos, pos); |
| } |
| |
| /** |
| * Scans an identifier or keyword. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. |
| * ON EXIT: 'pos' is 1 + the index of the last char in the identifier. |
| * |
| * @return the identifier or keyword token. |
| */ |
| private void addIdentifierOrKeyword() { |
| int oldPos = pos - 1; |
| String id = scanIdentifier(); |
| TokenKind kind = getTokenKindForIdentfier(id); |
| addToken(kind, oldPos, pos, |
| (kind == TokenKind.IDENTIFIER) ? id : null); |
| } |
| |
| private String scanInteger() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| switch (c) { |
| case 'X': case 'x': |
| case 'a': case 'A': |
| case 'b': case 'B': |
| case 'c': case 'C': |
| case 'd': case 'D': |
| case 'e': case 'E': |
| case 'f': case 'F': |
| case '0': case '1': |
| case '2': case '3': |
| case '4': case '5': |
| case '6': case '7': |
| case '8': case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| return bufferSlice(oldPos, pos); |
| } |
| |
| /** |
| * Scans an addInteger literal. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. |
| * ON EXIT: 'pos' is 1 + the index of the last char in the literal. |
| */ |
| private void addInteger() { |
| int oldPos = pos - 1; |
| String literal = scanInteger(); |
| |
| final String substring; |
| final int radix; |
| if (literal.startsWith("0x") || literal.startsWith("0X")) { |
| radix = 16; |
| substring = literal.substring(2); |
| } else if (literal.startsWith("0") && literal.length() > 1) { |
| radix = 8; |
| substring = literal.substring(1); |
| } else { |
| radix = 10; |
| substring = literal; |
| } |
| |
| int value = 0; |
| try { |
| value = Integer.parseInt(substring, radix); |
| } catch (NumberFormatException e) { |
| error("invalid base-" + radix + " integer constant: " + literal); |
| } |
| |
| addToken(TokenKind.INT, oldPos, pos, value); |
| } |
| |
| /** |
| * Tokenizes a two-char operator. |
| * @return true if it tokenized an operator |
| */ |
| private boolean tokenizeTwoChars() { |
| if (pos + 2 >= buffer.length) { |
| return false; |
| } |
| char c1 = buffer[pos]; |
| char c2 = buffer[pos + 1]; |
| TokenKind tok = null; |
| if (c2 == '=') { |
| tok = EQUAL_TOKENS.get(c1); |
| } else if (c2 == '*' && c1 == '*') { |
| tok = TokenKind.STAR_STAR; |
| } |
| if (tok == null) { |
| return false; |
| } |
| addToken(tok, pos, pos + 2); |
| return true; |
| } |
| |
| /** |
| * Performs tokenization of the character buffer of file contents provided to |
| * the constructor. |
| */ |
| private void tokenize() { |
| while (pos < buffer.length) { |
| if (tokenizeTwoChars()) { |
| pos += 2; |
| continue; |
| } |
| char c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '{': { |
| addToken(TokenKind.LBRACE, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case '}': { |
| addToken(TokenKind.RBRACE, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '(': { |
| addToken(TokenKind.LPAREN, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case ')': { |
| addToken(TokenKind.RPAREN, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '[': { |
| addToken(TokenKind.LBRACKET, pos - 1, pos); |
| openParenStackDepth++; |
| break; |
| } |
| case ']': { |
| addToken(TokenKind.RBRACKET, pos - 1, pos); |
| popParen(); |
| break; |
| } |
| case '>': { |
| addToken(TokenKind.GREATER, pos - 1, pos); |
| break; |
| } |
| case '<': { |
| addToken(TokenKind.LESS, pos - 1, pos); |
| break; |
| } |
| case ':': { |
| addToken(TokenKind.COLON, pos - 1, pos); |
| break; |
| } |
| case ',': { |
| addToken(TokenKind.COMMA, pos - 1, pos); |
| break; |
| } |
| case '+': { |
| addToken(TokenKind.PLUS, pos - 1, pos); |
| break; |
| } |
| case '-': { |
| addToken(TokenKind.MINUS, pos - 1, pos); |
| break; |
| } |
| case '|': { |
| addToken(TokenKind.PIPE, pos - 1, pos); |
| break; |
| } |
| case '=': { |
| addToken(TokenKind.EQUALS, pos - 1, pos); |
| break; |
| } |
| case '%': { |
| addToken(TokenKind.PERCENT, pos - 1, pos); |
| break; |
| } |
| case '/': { |
| addToken(TokenKind.SLASH, pos - 1, pos); |
| break; |
| } |
| case ';': { |
| addToken(TokenKind.SEMI, pos - 1, pos); |
| break; |
| } |
| case '.': { |
| addToken(TokenKind.DOT, pos - 1, pos); |
| break; |
| } |
| case '*': { |
| addToken(TokenKind.STAR, pos - 1, pos); |
| break; |
| } |
| case ' ': |
| case '\t': |
| case '\r': { |
| addWhitespace(); |
| break; |
| } |
| case '\\': { |
| // Backslash character is valid only at the end of a line (or in a string) |
| if (pos + 1 < buffer.length && buffer[pos] == '\n') { |
| // treat end of line backslash and newline char as whitespace (they're ignored by the parser) |
| pos++; |
| addToken(TokenKind.WHITESPACE, pos - 2, pos, Character.toString(c)); |
| } else { |
| addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)); |
| } |
| break; |
| } |
| case '\n': { |
| newline(); |
| break; |
| } |
| case '#': { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c == '\n') { |
| break; |
| } else { |
| pos++; |
| } |
| } |
| addToken(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos)); |
| break; |
| } |
| case '\'': |
| case '\"': { |
| addStringLiteral(c, false); |
| break; |
| } |
| default: { |
| // detect raw strings, e.g. r"str" |
| if (c == 'r' && pos < buffer.length |
| && (buffer[pos] == '\'' || buffer[pos] == '\"')) { |
| c = buffer[pos]; |
| pos++; |
| addStringLiteral(c, true); |
| break; |
| } |
| |
| if (Character.isDigit(c)) { |
| addInteger(); |
| } else if (Character.isJavaIdentifierStart(c) && c != '$') { |
| addIdentifierOrKeyword(); |
| } else { |
| // Some characters in Python are not recognized in Blaze syntax (e.g. '!') |
| addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c)); |
| error("invalid character: '" + c + "'"); |
| } |
| break; |
| } // default |
| } // switch |
| } // while |
| } |
| |
| /** |
| * Returns parts of the source buffer based on offsets |
| * |
| * @param start the beginning offset for the slice |
| * @param end the offset immediately following the slice |
| * @return the text at offset start with length end - start |
| */ |
| private String bufferSlice(int start, int end) { |
| return new String(this.buffer, start, end - start); |
| } |
| |
| } |