blob: 49702815d91c22f881efa16d987143b6e1aee6e8 [file] [log] [blame]
/*
* Copyright 2016 The Bazel Authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.idea.blaze.base.lang.buildfile.lexer;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import javax.annotation.Nullable;
/**
* A tokenizer for the BUILD language.
*
* <p>Copied from blaze/bazel's lexer. The differences are: 1. Blaze's lexer isn't 'faithful', in
* that it reorders characters, skips characters, and adds ghost characters. We can't do that,
* because we need to match the editor's view of the document. 2. Blaze's lexer only lexes entire
* files (it can't incrementally lex part of a file, starting from a given indent stack depth).
*/
public class BuildLexerBase {
  /**
   * When tokenizing for the purposes of parsing, we handle indentation.<br>
   * For syntax highlighting, we need to tokenize every character, and don't care about indentation.
   */
  public enum LexerMode {
    Parsing,
    SyntaxHighlighting
  }

  // Characters that can come immediately prior to an '=' character to generate
  // a different token (e.g. '<' + '=' lexes as a single LESS_EQUALS token).
  private static final Map<Character, TokenKind> EQUAL_TOKENS =
      ImmutableMap.<Character, TokenKind>builder()
          .put('=', TokenKind.EQUALS_EQUALS)
          .put('!', TokenKind.NOT_EQUALS)
          .put('>', TokenKind.GREATER_EQUALS)
          .put('<', TokenKind.LESS_EQUALS)
          .put('+', TokenKind.PLUS_EQUALS)
          .put('-', TokenKind.MINUS_EQUALS)
          .put('*', TokenKind.STAR_EQUALS)
          .put('/', TokenKind.SLASH_EQUALS)
          .put('%', TokenKind.PERCENT_EQUALS)
          .build();

  // Controls whether indentation is tracked (Parsing) or ignored (SyntaxHighlighting).
  private final LexerMode mode;

  // Input buffer and position
  private final char[] buffer;
  private int pos;

  // Output token stream, in source order; populated once by tokenize().
  private final List<Token> tokens;

  // The number of unclosed open-parens ("(", '{', '[') at the current point in
  // the stream. Whitespace is handled differently when this is nonzero.
  private int openParenStackDepth = 0;

  // The stack of enclosing indentation levels; always contains '0' at the
  // bottom.
  private final Stack<Integer> indentStack = new Stack<>();

  // Set to true by error(); see containsErrors().
  private boolean containsErrors;
/**
* Constructs a lexer which tokenizes the contents of the specified InputBuffer. Any errors during
* lexing are reported on "handler".
*/
public BuildLexerBase(CharSequence input, int initialStackDepth, LexerMode mode) {
this.buffer = input.toString().toCharArray();
// Empirical measurements show roughly 1 token per 8 characters in buffer.
this.tokens = Lists.newArrayListWithExpectedSize(buffer.length / 8);
this.pos = 0;
this.openParenStackDepth = initialStackDepth;
this.mode = mode;
indentStack.push(0);
tokenize();
}
/** The number of unclosed open-parens ("(", '{', '[') at the end of this string. */
public int getOpenParenStackDepth() {
return openParenStackDepth;
}
/**
* Returns true if there were errors during scanning of this input file or string. The
* BuildLexerBase may attempt to recover from errors, but clients should not rely on the results
* of scanning if this flag is set.
*/
public boolean containsErrors() {
return containsErrors;
}
/** Returns the (mutable) list of tokens generated by the BuildLexerBase. */
public List<Token> getTokens() {
return tokens;
}
private void popParen() {
if (openParenStackDepth == 0) {
error("indentation error");
} else {
openParenStackDepth--;
}
}
private void error(String message) {
error(message, pos - 1, pos - 1);
}
protected void error(String message, int start, int end) {
this.containsErrors = true;
}
/** invariant: symbol positions are half-open intervals. */
private void addToken(TokenKind kind, int left, int right) {
addToken(kind, left, right, null);
}
private void addToken(TokenKind kind, int left, int right, @Nullable Object value) {
tokens.add(new Token(kind, left, right, value));
}
/**
* Parses an end-of-line sequence, handling statement indentation correctly.
*
* <p>UNIX newlines are assumed (LF). Carriage returns are always ignored.
*
* <p>ON ENTRY: 'pos' is the index of the char after '\n'. ON EXIT: 'pos' is the index of the next
* non-space char after '\n'.
*/
protected void newline() {
if (mode == LexerMode.SyntaxHighlighting) {
addToken(TokenKind.NEWLINE, pos - 1, pos);
return;
}
if (openParenStackDepth > 0) {
newlineInsideExpression(); // in an expression: ignore space
} else {
newlineOutsideExpression(); // generate NEWLINE/INDENT/DEDENT tokens
}
}
private void newlineInsideExpression() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case ' ':
case '\t':
case '\r':
pos++;
break;
default:
// ignored by the parser
addToken(TokenKind.WHITESPACE, oldPos, pos);
return;
}
}
addToken(TokenKind.WHITESPACE, oldPos, pos);
}
  /**
   * Handle INDENT and DEDENT within statements.
   *
   * <p>Note these tokens have zero length -- this is because we can have an arbitrary number of
   * dedents squeezed into some number of characters, and the size of all the lexical elements must
   * match the number of characters in the file.
   */
  private void newlineOutsideExpression() {
    int oldPos = pos - 1;
    if (pos > 1) { // skip over newline at start of file
      addToken(TokenKind.NEWLINE, oldPos, pos);
      oldPos = pos;
    }
    // we're in a stmt: suck up space at beginning of next line
    int indentLen = 0;
    while (pos < buffer.length) {
      char c = buffer[pos];
      if (c == ' ') {
        indentLen++;
        pos++;
      } else if (c == '\t') {
        // Tabs advance to the next multiple-of-8 column, as in Python.
        indentLen += 8 - indentLen % 8;
        pos++;
      } else if (c == '\n') { // entirely blank line: ignore
        indentLen = 0;
        pos++;
      } else if (c == '#') { // line containing only indented comment
        // Flush any pending blanks before the comment as a WHITESPACE token.
        if (oldPos != pos) {
          addToken(TokenKind.WHITESPACE, oldPos, pos);
          oldPos = pos;
        }
        while (pos < buffer.length && c != '\n') {
          c = buffer[pos++];
        }
        // The 'pos - 1' excludes the terminating '\n' from the COMMENT token.
        // NOTE(review): if the comment runs to EOF without a '\n', this also clips the
        // comment's final character (it is later emitted as WHITESPACE) — confirm intended.
        addToken(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1));
        oldPos = pos - 1;
        indentLen = 0;
      } else { // printing character
        break;
      }
    }
    // Flush any remaining consumed blanks (e.g. the indent of the upcoming line).
    if (oldPos != pos) {
      addToken(TokenKind.WHITESPACE, oldPos, pos);
    }
    if (pos == buffer.length) {
      indentLen = 0;
    } // trailing space on last line
    // Compare the computed indent against the enclosing level and emit zero-length
    // INDENT/DEDENT tokens as needed.
    int peekedIndent = indentStack.peek();
    if (peekedIndent < indentLen) { // push a level
      indentStack.push(indentLen);
      addToken(TokenKind.INDENT, pos, pos);
    } else if (peekedIndent > indentLen) { // pop one or more levels
      while (peekedIndent > indentLen) {
        indentStack.pop();
        addToken(TokenKind.DEDENT, pos, pos);
        peekedIndent = indentStack.peek();
      }
      // The new indent must match some enclosing level exactly; otherwise it's an error.
      if (peekedIndent < indentLen) {
        error("indentation error");
      }
    }
  }
/** Collapse adjacent whitespace characters into a single token */
private void addWhitespace() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case ' ':
case '\t':
case '\r':
pos++;
break;
default:
addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
return;
}
}
addToken(TokenKind.WHITESPACE, oldPos, pos, bufferSlice(oldPos, pos));
}
/**
* Returns true if current position is in the middle of a triple quote delimiter (3 x quot), and
* advances 'pos' by two if so.
*/
private boolean skipTripleQuote(char quot) {
if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) {
pos += 2;
return true;
} else {
return false;
}
}
  /**
   * Scans a string literal delimited by 'quot', containing escape sequences.
   *
   * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter ON EXIT: 'pos' is 1 + the index of
   * the last delimiter.
   */
  private void escapedStringLiteral(char quot, boolean isRaw) {
    // For raw strings the caller has already consumed the 'r' prefix as well as the quote.
    int oldPos = isRaw ? pos - 2 : pos - 1;
    boolean inTripleQuote = skipTripleQuote(quot);
    // more expensive second choice that expands escaped into a buffer
    StringBuilder literal = new StringBuilder();
    while (pos < buffer.length) {
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '\n':
          if (inTripleQuote) {
            // Literal newlines are part of a triple-quoted string's value.
            literal.append(c);
            break;
          } else {
            // Single-quoted strings may not span lines: emit what we have and recover.
            error("unterminated string literal at eol", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            newline();
            return;
          }
        case '\\':
          if (pos == buffer.length) {
            // Backslash is the last char of the buffer: nothing left to escape.
            error("unterminated string literal at eof", oldPos, pos);
            addToken(TokenKind.STRING, oldPos, pos - 1, literal.toString());
            return;
          }
          if (isRaw) {
            // Insert \ and the following character.
            // As in Python, it means that a raw string can never end with a single \.
            literal.append('\\');
            literal.append(buffer[pos]);
            pos++;
            break;
          }
          c = buffer[pos];
          pos++;
          switch (c) {
            case '\n':
              // ignore end of line character
              break;
            case 'n':
              literal.append('\n');
              break;
            case 'r':
              literal.append('\r');
              break;
            case 't':
              literal.append('\t');
              break;
            case '\\':
              literal.append('\\');
              break;
            case '\'':
              literal.append('\'');
              break;
            case '"':
              literal.append('"');
              break;
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
              { // octal escape: up to three octal digits, truncated to one byte
                int octal = c - '0';
                if (pos < buffer.length) {
                  c = buffer[pos];
                  if (c >= '0' && c <= '7') {
                    pos++;
                    octal = (octal << 3) | (c - '0');
                    if (pos < buffer.length) {
                      c = buffer[pos];
                      if (c >= '0' && c <= '7') {
                        pos++;
                        octal = (octal << 3) | (c - '0');
                      }
                    }
                  }
                }
                literal.append((char) (octal & 0xff));
                break;
              }
            case 'a':
            case 'b':
            case 'f':
            case 'N':
            case 'u':
            case 'U':
            case 'v':
            case 'x':
              // exists in Python but not implemented in Blaze => error
              error("escape sequence not implemented: \\" + c, oldPos, pos);
              break;
            default:
              // unknown char escape => "\literal"
              literal.append('\\');
              literal.append(c);
              break;
          }
          break;
        case '\'':
        case '"':
          if (c != quot || (inTripleQuote && !skipTripleQuote(quot))) {
            // Non-matching quote, treat it like a regular char.
            literal.append(c);
          } else {
            // Matching close-delimiter, all done.
            addToken(TokenKind.STRING, oldPos, pos, literal.toString());
            return;
          }
          break;
        default:
          literal.append(c);
          break;
      }
    }
    // Ran off the end of the buffer without finding the close-delimiter.
    error("unterminated string literal at eof", oldPos, pos);
    addToken(TokenKind.STRING, oldPos, pos, literal.toString());
  }
/**
* Scans a string literal delimited by 'quot'.
*
* <ul>
* <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter
* <li> ON EXIT: 'pos' is 1 + the index of the last delimiter.
* </ul>
*
* @param isRaw if true, do not escape the string.
*/
private void addStringLiteral(char quot, boolean isRaw) {
int oldPos = isRaw ? pos - 2 : pos - 1;
int start = pos;
// Don't even attempt to parse triple-quotes here.
if (skipTripleQuote(quot)) {
pos -= 2;
escapedStringLiteral(quot, isRaw);
return;
}
// first quick optimistic scan for a simple non-escaped string
while (pos < buffer.length) {
char c = buffer[pos++];
switch (c) {
case '\n':
error("unterminated string literal at eol", oldPos, pos);
addToken(TokenKind.STRING, oldPos, pos - 1, bufferSlice(start, pos - 1));
newline();
return;
case '\\':
if (isRaw) {
// skip the next character
pos++;
break;
}
// oops, hit an escape, need to start over & build a new string buffer
pos = oldPos + 1;
escapedStringLiteral(quot, false);
return;
case '\'':
case '"':
if (c == quot) {
// close-quote, all done.
addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos - 1));
return;
}
}
}
error("unterminated string literal at eof", oldPos, pos);
addToken(TokenKind.STRING, oldPos, pos, bufferSlice(start, pos));
}
private static final ImmutableMap<String, TokenKind> KEYWORD_MAP = createKeywordMap();
private static ImmutableMap<String, TokenKind> createKeywordMap() {
ImmutableMap.Builder<String, TokenKind> builder = ImmutableMap.builder();
for (TokenKind kind : TokenKind.KEYWORDS) {
builder.put(kind.toString(), kind);
}
return builder.build();
}
private TokenKind getTokenKindForIdentfier(String id) {
TokenKind kind = KEYWORD_MAP.get(id);
return kind == null ? TokenKind.IDENTIFIER : kind;
}
private String scanIdentifier() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case '_':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
pos++;
break;
default:
return bufferSlice(oldPos, pos);
}
}
return bufferSlice(oldPos, pos);
}
/**
* Scans an identifier or keyword.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. ON EXIT: 'pos' is 1 +
* the index of the last char in the identifier.
*
* @return the identifier or keyword token.
*/
private void addIdentifierOrKeyword() {
int oldPos = pos - 1;
String id = scanIdentifier();
TokenKind kind = getTokenKindForIdentfier(id);
addToken(kind, oldPos, pos, (kind == TokenKind.IDENTIFIER) ? id : null);
}
private String scanInteger() {
int oldPos = pos - 1;
while (pos < buffer.length) {
char c = buffer[pos];
switch (c) {
case 'X':
case 'x':
case 'a':
case 'A':
case 'b':
case 'B':
case 'c':
case 'C':
case 'd':
case 'D':
case 'e':
case 'E':
case 'f':
case 'F':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
pos++;
break;
default:
return bufferSlice(oldPos, pos);
}
}
return bufferSlice(oldPos, pos);
}
/**
* Scans an addInteger literal.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. ON EXIT: 'pos' is 1 + the
* index of the last char in the literal.
*/
private void addInteger() {
int oldPos = pos - 1;
String literal = scanInteger();
final String substring;
final int radix;
if (literal.startsWith("0x") || literal.startsWith("0X")) {
radix = 16;
substring = literal.substring(2);
} else if (literal.startsWith("0") && literal.length() > 1) {
radix = 8;
substring = literal.substring(1);
} else {
radix = 10;
substring = literal;
}
int value = 0;
try {
value = Integer.parseInt(substring, radix);
} catch (NumberFormatException e) {
error("invalid base-" + radix + " integer constant: " + literal);
}
addToken(TokenKind.INT, oldPos, pos, value);
}
/**
* Tokenizes a two-char operator.
*
* @return true if it tokenized an operator
*/
private boolean tokenizeTwoChars() {
if (pos + 2 >= buffer.length) {
return false;
}
char c1 = buffer[pos];
char c2 = buffer[pos + 1];
TokenKind tok = null;
if (c2 == '=') {
tok = EQUAL_TOKENS.get(c1);
} else if (c2 == '*' && c1 == '*') {
tok = TokenKind.STAR_STAR;
}
if (tok == null) {
return false;
}
addToken(tok, pos, pos + 2);
return true;
}
/** Performs tokenization of the character buffer of file contents provided to the constructor. */
private void tokenize() {
while (pos < buffer.length) {
if (tokenizeTwoChars()) {
pos += 2;
continue;
}
char c = buffer[pos];
pos++;
switch (c) {
case '{':
{
addToken(TokenKind.LBRACE, pos - 1, pos);
openParenStackDepth++;
break;
}
case '}':
{
addToken(TokenKind.RBRACE, pos - 1, pos);
popParen();
break;
}
case '(':
{
addToken(TokenKind.LPAREN, pos - 1, pos);
openParenStackDepth++;
break;
}
case ')':
{
addToken(TokenKind.RPAREN, pos - 1, pos);
popParen();
break;
}
case '[':
{
addToken(TokenKind.LBRACKET, pos - 1, pos);
openParenStackDepth++;
break;
}
case ']':
{
addToken(TokenKind.RBRACKET, pos - 1, pos);
popParen();
break;
}
case '>':
{
addToken(TokenKind.GREATER, pos - 1, pos);
break;
}
case '<':
{
addToken(TokenKind.LESS, pos - 1, pos);
break;
}
case ':':
{
addToken(TokenKind.COLON, pos - 1, pos);
break;
}
case ',':
{
addToken(TokenKind.COMMA, pos - 1, pos);
break;
}
case '+':
{
addToken(TokenKind.PLUS, pos - 1, pos);
break;
}
case '-':
{
addToken(TokenKind.MINUS, pos - 1, pos);
break;
}
case '|':
{
addToken(TokenKind.PIPE, pos - 1, pos);
break;
}
case '=':
{
addToken(TokenKind.EQUALS, pos - 1, pos);
break;
}
case '%':
{
addToken(TokenKind.PERCENT, pos - 1, pos);
break;
}
case '/':
{
addToken(TokenKind.SLASH, pos - 1, pos);
break;
}
case ';':
{
addToken(TokenKind.SEMI, pos - 1, pos);
break;
}
case '.':
{
addToken(TokenKind.DOT, pos - 1, pos);
break;
}
case '*':
{
addToken(TokenKind.STAR, pos - 1, pos);
break;
}
case ' ':
case '\t':
case '\r':
{
addWhitespace();
break;
}
case '\\':
{
// Backslash character is valid only at the end of a line (or in a string)
if (pos + 1 < buffer.length && buffer[pos] == '\n') {
// treat end of line backslash and newline char as whitespace
// (they're ignored by the parser)
pos++;
addToken(TokenKind.WHITESPACE, pos - 2, pos, Character.toString(c));
} else {
addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
}
break;
}
case '\n':
{
newline();
break;
}
case '#':
{
int oldPos = pos - 1;
while (pos < buffer.length) {
c = buffer[pos];
if (c == '\n') {
break;
} else {
pos++;
}
}
addToken(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos));
break;
}
case '\'':
case '\"':
{
addStringLiteral(c, false);
break;
}
default:
{
// detect raw strings, e.g. r"str"
if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) {
c = buffer[pos];
pos++;
addStringLiteral(c, true);
break;
}
if (Character.isDigit(c)) {
addInteger();
} else if (Character.isJavaIdentifierStart(c) && c != '$') {
addIdentifierOrKeyword();
} else {
// Some characters in Python are not recognized in Blaze syntax (e.g. '!')
addToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
error("invalid character: '" + c + "'");
}
break;
} // default
} // switch
} // while
}
/**
* Returns parts of the source buffer based on offsets
*
* @param start the beginning offset for the slice
* @param end the offset immediately following the slice
* @return the text at offset start with length end - start
*/
private String bufferSlice(int start, int end) {
return new String(this.buffer, start, end - start);
}
}