// Copyright 2014 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.syntax;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.devtools.build.lib.concurrent.ThreadSafety.Immutable;
import com.google.devtools.build.lib.events.Event;
import com.google.devtools.build.lib.events.Location;
import com.google.devtools.build.lib.skyframe.serialization.autocodec.AutoCodec;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
/** A scanner for Starlark. */
final class Lexer {
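// Illustrative usage (a sketch based on the constructor and nextToken() below, not a
// verbatim excerpt from the parser): callers construct a Lexer over a ParserInput and
// pull tokens until EOF, e.g.
//
//   Lexer lexer = new Lexer(input, errors);
//   for (Token tok = lexer.nextToken(); tok.kind != TokenKind.EOF; tok = lexer.nextToken()) {
//     ...  // tok is mutated in place by the next call, so copy anything that must persist
//   }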
// Characters that can come immediately prior to an '=' character to generate
// a different token
private static final ImmutableMap<Character, TokenKind> EQUAL_TOKENS =
ImmutableMap.<Character, TokenKind>builder()
.put('=', TokenKind.EQUALS_EQUALS)
.put('!', TokenKind.NOT_EQUALS)
.put('>', TokenKind.GREATER_EQUALS)
.put('<', TokenKind.LESS_EQUALS)
.put('+', TokenKind.PLUS_EQUALS)
.put('-', TokenKind.MINUS_EQUALS)
.put('*', TokenKind.STAR_EQUALS)
.put('/', TokenKind.SLASH_EQUALS)
.put('%', TokenKind.PERCENT_EQUALS)
.put('^', TokenKind.CARET_EQUALS)
.put('&', TokenKind.AMPERSAND_EQUALS)
.put('|', TokenKind.PIPE_EQUALS)
.build();
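// Consulted by tokenizeTwoChars() below: when '=' follows one of these characters, the
// pair lexes as a single token, e.g. ">=" becomes GREATER_EQUALS and "+=" becomes
// PLUS_EQUALS.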
// Input buffer and position
private final char[] buffer;
private int pos;
private final LineNumberTable lnt; // maps offsets to Locations
// The stack of enclosing indentation levels; always contains '0' at the
// bottom.
private final Stack<Integer> indentStack = new Stack<>();
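// For example, while scanning a function body indented by four spaces the stack is
// [0, 4]; a nested block indented by eight pushes 8 to give [0, 4, 8], and dedenting
// pops one level (emitting one OUTDENT) per entry removed. See computeIndentation.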
/**
* Token to return. This token is mutated in-place. Its kind is set to
* null to indicate the intermediate state, where the new token has not
* been scanned yet.
*/
private final Token token;
private final List<Comment> comments;
// The number of unclosed open-parens ('(', '{', '[') at the current point in
// the stream. Whitespace is handled differently when this is nonzero.
private int openParenStackDepth = 0;
// List of errors appended to by Lexer and Parser.
private final List<Event> errors;
/**
* True after a NEWLINE token.
* In other words, we are outside an expression and we have to check the indentation.
*/
private boolean checkIndentation;
private int dents; // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
/**
* StringEscapeEvents contains the errors related to invalid escape sequences like "\a". This is
* not handled by the normal eventHandler. Instead, it is passed to the parser and then the AST.
* During the evaluation, we can decide to show the events based on a flag in StarlarkSemantics.
* This code is temporary, during the migration.
*/
private final List<Event> stringEscapeEvents = new ArrayList<>();
/** Constructs a lexer which tokenizes the parser input. Errors are appended to {@code errors}. */
Lexer(ParserInput input, List<Event> errors) {
this.lnt = LineNumberTable.create(input.getContent(), input.getFile());
this.buffer = input.getContent();
this.pos = 0;
this.errors = errors;
this.checkIndentation = true;
this.comments = new ArrayList<>();
this.dents = 0;
this.token = new Token(null, -1, -1);
indentStack.push(0);
}
List<Comment> getComments() {
return comments;
}
List<Event> getStringEscapeEvents() {
return stringEscapeEvents;
}
/** Returns the apparent name of the lexer's input file. */
String getFile() {
return lnt.getFile();
}
/**
* Returns the next token, or EOF if it is the end of the file. It is an error to call nextToken()
* after EOF has been returned.
*/
Token nextToken() {
boolean afterNewline = token.kind == TokenKind.NEWLINE;
token.kind = null;
tokenize();
Preconditions.checkState(token.kind != null);
// Like Python, always end with a NEWLINE token, even if no '\n' in input:
if (token.kind == TokenKind.EOF && !afterNewline) {
token.kind = TokenKind.NEWLINE;
}
return token;
}
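// Illustrative token stream (derived from the logic above, not from a test): for the
// input "x = 1\n", successive calls return IDENTIFIER("x"), EQUALS, INT(1), NEWLINE,
// and finally EOF; without the trailing '\n' a NEWLINE is still synthesized before EOF.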
private void popParen() {
if (openParenStackDepth == 0) {
error("indentation error");
} else {
openParenStackDepth--;
}
}
private void error(String message) {
error(message, pos - 1, pos - 1);
}
private void error(String message, int start, int end) {
errors.add(Event.error(createLocation(start, end), message));
}
LexerLocation createLocation(int start, int end) {
return new LexerLocation(lnt, start, end);
}
// A LexerLocation records the span (both start and end) of a token or grammar production.
// It implements Location by describing the start position,
// but it also exposes the end location through getEndLocation.
// This class will be merged with Location and eliminated when we make the Parser
// record token offsets in the syntax tree, and create Locations on demand.
@AutoCodec
@Immutable
static final class LexerLocation extends Location {
private final LineNumberTable lineNumberTable;
final int startOffset;
final int endOffset;
LexerLocation(LineNumberTable lineNumberTable, int startOffset, int endOffset) {
this.startOffset = startOffset;
this.endOffset = endOffset;
this.lineNumberTable = lineNumberTable;
}
@Override
public String file() {
return lineNumberTable.getFile();
}
@Override
public LineAndColumn getLineAndColumn() {
return lineNumberTable.getLineAndColumn(startOffset);
}
// For Node.getEndLocation. This is a temporary measure.
Location getEndLocation() {
// The end offset is the location *past* the actual end position --> subtract 1:
// TODO(adonovan): use half-open intervals again. CL 170723732 was a mistake.
int endOffset = this.endOffset - 1;
if (endOffset < 0) {
endOffset = 0;
}
LineAndColumn linecol = lineNumberTable.getLineAndColumn(endOffset);
return Location.fromFileLineColumn(file(), linecol.line, linecol.column);
}
}
/** invariant: symbol positions are half-open intervals. */
private void setToken(TokenKind kind, int left, int right) {
Preconditions.checkState(token.kind == null);
token.kind = kind;
token.left = left;
token.right = right;
token.value = null;
}
private void setToken(TokenKind kind, int left, int right, Object value) {
Preconditions.checkState(token.kind == null);
token.kind = kind;
token.left = left;
token.right = right;
token.value = value;
}
/**
* Parses an end-of-line sequence, handling statement indentation correctly.
*
* <p>UNIX newlines are assumed (LF). Carriage returns are always ignored.
*/
private void newline() {
if (openParenStackDepth > 0) {
newlineInsideExpression(); // in an expression: ignore space
} else {
checkIndentation = true;
setToken(TokenKind.NEWLINE, pos - 1, pos);
}
}
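// For example, in "f(1,\n    2)" the newline after the comma yields no NEWLINE token
// (only the following whitespace is skipped), whereas the same newline at top level
// becomes a NEWLINE token and arms the indentation check for the next line.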
private void newlineInsideExpression() {
while (pos < buffer.length) {
switch (buffer[pos]) {
case ' ': case '\t': case '\r':
pos++;
break;
default:
return;
}
}
}
/** Computes indentation (updates dents) and advances pos. */
private void computeIndentation() {
// we're in a stmt: suck up space at beginning of next line
int indentLen = 0;
while (pos < buffer.length) {
char c = buffer[pos];
if (c == ' ') {
indentLen++;
pos++;
} else if (c == '\r') {
pos++;
} else if (c == '\t') {
indentLen++;
pos++;
error("Tab characters are not allowed for indentation. Use spaces instead.");
} else if (c == '\n') { // entirely blank line: discard
indentLen = 0;
pos++;
} else if (c == '#') { // line containing only indented comment
int oldPos = pos;
while (pos < buffer.length && c != '\n') {
c = buffer[pos++];
}
makeComment(oldPos, pos - 1, bufferSlice(oldPos, pos - 1));
indentLen = 0;
} else { // printing character
break;
}
}
if (pos == buffer.length) {
indentLen = 0;
} // trailing space on last line
int peekedIndent = indentStack.peek();
if (peekedIndent < indentLen) { // push a level
indentStack.push(indentLen);
dents++;
} else if (peekedIndent > indentLen) { // pop one or more levels
while (peekedIndent > indentLen) {
indentStack.pop();
dents--;
peekedIndent = indentStack.peek();
}
if (peekedIndent < indentLen) {
error("indentation error");
}
}
}
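// Example: after the NEWLINE ending "def f():", the two spaces before "pass" make
// indentLen (2) exceed the top of the stack (0), so 2 is pushed and one INDENT is
// queued in dents; the matching OUTDENT is produced when the block (or the input) ends.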
/**
* Returns true if current position is in the middle of a triple quote
* delimiter (3 x quot), and advances 'pos' by two if so.
*/
private boolean skipTripleQuote(char quot) {
if (lookaheadIs(0, quot) && lookaheadIs(1, quot)) {
pos += 2;
return true;
} else {
return false;
}
}
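// E.g. having just consumed the first quote of """abc""", pos points at the second
// quote; this method sees two more matching quotes, steps past them, and returns true.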
/**
* Scans a string literal delimited by 'quot', containing escape sequences.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter
* ON EXIT: 'pos' is 1 + the index of the last delimiter.
*
* <p>The scanned string token is recorded by a call to setToken.
*/
private void escapedStringLiteral(char quot, boolean isRaw) {
int literalStartPos = isRaw ? pos - 2 : pos - 1;
boolean inTriplequote = skipTripleQuote(quot);
// More expensive second path: expands escape sequences into a StringBuilder.
StringBuilder literal = new StringBuilder();
while (pos < buffer.length) {
char c = buffer[pos];
pos++;
switch (c) {
case '\n':
if (inTriplequote) {
literal.append(c);
break;
} else {
error("unterminated string literal at eol", literalStartPos, pos);
setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
return;
}
case '\\':
if (pos == buffer.length) {
error("unterminated string literal at eof", literalStartPos, pos);
setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
return;
}
if (isRaw) {
// Insert \ and the following character.
// As in Python, it means that a raw string can never end with a single \.
literal.append('\\');
if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
literal.append("\n");
pos += 2;
} else if (buffer[pos] == '\r' || buffer[pos] == '\n') {
literal.append("\n");
pos += 1;
} else {
literal.append(buffer[pos]);
pos += 1;
}
break;
}
c = buffer[pos];
pos++;
switch (c) {
case '\r':
// An escaped CR or CRLF is a line continuation; skip the LF if present.
if (lookaheadIs(0, '\n')) {
pos += 1;
}
break;
case '\n':
// ignore end of line character
break;
case 'n':
literal.append('\n');
break;
case 'r':
literal.append('\r');
break;
case 't':
literal.append('\t');
break;
case '\\':
literal.append('\\');
break;
case '\'':
literal.append('\'');
break;
case '"':
literal.append('"');
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{ // octal escape
int octal = c - '0';
if (pos < buffer.length) {
c = buffer[pos];
if (c >= '0' && c <= '7') {
pos++;
octal = (octal << 3) | (c - '0');
if (pos < buffer.length) {
c = buffer[pos];
if (c >= '0' && c <= '7') {
pos++;
octal = (octal << 3) | (c - '0');
}
}
}
}
if (octal > 0xff) {
error("octal escape sequence out of range (maximum is \\377)");
}
literal.append((char) (octal & 0xff));
break;
}
case 'a':
case 'b':
case 'f':
case 'N':
case 'u':
case 'U':
case 'v':
case 'x':
// exists in Python but not implemented in Blaze => error
error("invalid escape sequence: \\" + c, literalStartPos, pos);
break;
default:
// unknown char escape => "\literal"
stringEscapeEvents.add(
Event.error(
createLocation(pos - 1, pos),
"invalid escape sequence: \\"
+ c
+ ". You can enable unknown escape sequences by passing the flag "
+ "--incompatible_restrict_string_escapes=false"));
literal.append('\\');
literal.append(c);
break;
}
break;
case '\'':
case '"':
if (c != quot || (inTriplequote && !skipTripleQuote(quot))) {
// Non-matching quote, treat it like a regular char.
literal.append(c);
} else {
// Matching close-delimiter, all done.
setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
return;
}
break;
default:
literal.append(c);
break;
}
}
error("unterminated string literal at eof", literalStartPos, pos);
setToken(TokenKind.STRING, literalStartPos, pos, literal.toString());
}
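// Illustrative results (derived from the cases above): the source text "a\nb" yields a
// STRING whose value contains a real newline; an unrecognized escape such as "\q" keeps
// both characters and records an event in stringEscapeEvents; the octal escape "\101"
// produces 'A'.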
/**
* Scans a string literal delimited by 'quot'.
*
* <ul>
* <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter
* <li> ON EXIT: 'pos' is 1 + the index of the last delimiter.
* </ul>
*
* <p>The scanned string token is recorded by a call to setToken.
*
* @param isRaw if true, escape sequences in the string are not interpreted.
*/
private void stringLiteral(char quot, boolean isRaw) {
int literalStartPos = isRaw ? pos - 2 : pos - 1;
int contentStartPos = pos;
// Don't even attempt to parse triple-quotes here.
if (skipTripleQuote(quot)) {
pos -= 2;
escapedStringLiteral(quot, isRaw);
return;
}
// first quick optimistic scan for a simple non-escaped string
while (pos < buffer.length) {
char c = buffer[pos++];
switch (c) {
case '\n':
error("unterminated string literal at eol", literalStartPos, pos);
setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
return;
case '\\':
if (isRaw) {
if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
// There was a CRLF after the backslash. No shortcut possible, since it needs to be
// transformed into a single LF.
pos = contentStartPos;
escapedStringLiteral(quot, true);
return;
} else {
pos++;
break;
}
}
// oops, hit an escape, need to start over & build a new string buffer
pos = contentStartPos;
escapedStringLiteral(quot, false);
return;
case '\'':
case '"':
if (c == quot) {
// close-quote, all done.
setToken(
TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos - 1));
return;
}
break;
default: // fall out
}
}
// If the current position is beyond the end of the file, need to move it backwards
// Possible if the file ends with `r"\` (unterminated raw string literal with a backslash)
if (pos > buffer.length) {
pos = buffer.length;
}
error("unterminated string literal at eof", literalStartPos, pos);
setToken(TokenKind.STRING, literalStartPos, pos, bufferSlice(contentStartPos, pos));
}
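// Fast path vs. fallback: a literal like "hello" contains no backslash, so its value is
// sliced directly from the buffer; "a\tb" hits the '\\' case above, rewinds to the start
// of the content, and is rescanned by escapedStringLiteral, which rebuilds the value in
// a StringBuilder.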
private static final Map<String, TokenKind> keywordMap = new HashMap<>();
static {
keywordMap.put("and", TokenKind.AND);
keywordMap.put("as", TokenKind.AS);
keywordMap.put("assert", TokenKind.ASSERT);
keywordMap.put("break", TokenKind.BREAK);
keywordMap.put("class", TokenKind.CLASS);
keywordMap.put("continue", TokenKind.CONTINUE);
keywordMap.put("def", TokenKind.DEF);
keywordMap.put("del", TokenKind.DEL);
keywordMap.put("elif", TokenKind.ELIF);
keywordMap.put("else", TokenKind.ELSE);
keywordMap.put("except", TokenKind.EXCEPT);
keywordMap.put("finally", TokenKind.FINALLY);
keywordMap.put("for", TokenKind.FOR);
keywordMap.put("from", TokenKind.FROM);
keywordMap.put("global", TokenKind.GLOBAL);
keywordMap.put("if", TokenKind.IF);
keywordMap.put("import", TokenKind.IMPORT);
keywordMap.put("in", TokenKind.IN);
keywordMap.put("is", TokenKind.IS);
keywordMap.put("lambda", TokenKind.LAMBDA);
keywordMap.put("load", TokenKind.LOAD);
keywordMap.put("nonlocal", TokenKind.NONLOCAL);
keywordMap.put("not", TokenKind.NOT);
keywordMap.put("or", TokenKind.OR);
keywordMap.put("pass", TokenKind.PASS);
keywordMap.put("raise", TokenKind.RAISE);
keywordMap.put("return", TokenKind.RETURN);
keywordMap.put("try", TokenKind.TRY);
keywordMap.put("while", TokenKind.WHILE);
keywordMap.put("with", TokenKind.WITH);
keywordMap.put("yield", TokenKind.YIELD);
}
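// Lookup is by whole identifier: "def" lexes as DEF, while "define" is absent from the
// map and falls through to a plain IDENTIFIER in identifierOrKeyword() below.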
/**
* Scans an identifier or keyword.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier.
* ON EXIT: 'pos' is 1 + the index of the last char in the identifier.
*
* <p>The scanned identifier or keyword token is recorded by a call to setToken.
*/
private void identifierOrKeyword() {
int oldPos = pos - 1;
String id = scanIdentifier();
TokenKind kind = keywordMap.get(id);
if (kind == null) {
setToken(TokenKind.IDENTIFIER, oldPos, pos, id);
} else {
setToken(kind, oldPos, pos, null);
}
}
private String scanIdentifier() {
int oldPos = pos - 1;
while (pos < buffer.length) {
switch (buffer[pos]) {
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
case 's': case 't': case 'u': case 'v': case 'w': case 'x':
case 'y': case 'z':
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
case '0': case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9':
pos++;
break;
default:
return bufferSlice(oldPos, pos);
}
}
return bufferSlice(oldPos, pos);
}
private String scanInteger() {
int oldPos = pos - 1;
loop:
while (pos < buffer.length) {
char c = buffer[pos];
switch (c) {
case 'X': case 'x': // for hexadecimal prefix
case 'O': case 'o': // for octal prefix
case 'a': case 'A':
case 'b': case 'B':
case 'c': case 'C':
case 'd': case 'D':
case 'e': case 'E':
case 'f': case 'F':
if (buffer[oldPos] != '0') {
// A number not starting with zero must be decimal and can only contain decimal digits.
break loop;
}
pos++;
break;
case '0': case '1':
case '2': case '3':
case '4': case '5':
case '6': case '7':
case '8': case '9':
pos++;
break;
default:
break loop;
}
}
// TODO(bazel-team): (2009) to do roundtripping when we evaluate the integer
// constants, we must save the actual text of the tokens, not just their
// integer value.
return bufferSlice(oldPos, pos);
}
/**
* Scans an integer literal.
*
* <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal.
* ON EXIT: 'pos' is 1 + the index of the last char in the literal.
*/
private void integer() {
int oldPos = pos - 1;
String literal = scanInteger();
final String substring;
final int radix;
if (literal.startsWith("0x") || literal.startsWith("0X")) {
radix = 16;
substring = literal.substring(2);
} else if (literal.startsWith("0o") || literal.startsWith("0O")) {
radix = 8;
substring = literal.substring(2);
} else if (literal.startsWith("0") && literal.length() > 1) {
radix = 8;
substring = literal.substring(1);
error("invalid octal value `" + literal + "`, should be: `0o" + substring + "`");
} else {
radix = 10;
substring = literal;
}
int value = 0;
try {
value = Integer.parseInt(substring, radix);
} catch (NumberFormatException e) {
error("invalid base-" + radix + " integer constant: " + literal);
}
setToken(TokenKind.INT, oldPos, pos, value);
}
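// Examples: "0x10" -> INT 16, "0o17" -> INT 15, "123" -> INT 123; the legacy octal form
// "017" still yields 15 but reports an error suggesting "0o17".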
/**
* Tokenizes a two-char operator.
* @return true if it tokenized an operator
*/
private boolean tokenizeTwoChars() {
if (pos + 2 >= buffer.length) {
return false;
}
char c1 = buffer[pos];
char c2 = buffer[pos + 1];
TokenKind tok = null;
if (c2 == '=') {
tok = EQUAL_TOKENS.get(c1);
} else if (c2 == '*' && c1 == '*') {
tok = TokenKind.STAR_STAR;
}
if (tok == null) {
return false;
} else {
setToken(tok, pos, pos + 2);
return true;
}
}
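// Examples: "!=" lexes as NOT_EQUALS and "**" as STAR_STAR; pairs such as "//" return
// false here and are handled, along with three-character operators like ">>=", directly
// in tokenize().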
/** Test if the character at pos+p is c. */
private boolean lookaheadIs(int p, char c) {
return pos + p < buffer.length && buffer[pos + p] == c;
}
/**
* Performs tokenization of the character buffer of file contents provided to the constructor. At
* least one token will be added to the tokens queue.
*/
private void tokenize() {
if (checkIndentation) {
checkIndentation = false;
computeIndentation();
}
// Return saved indentation tokens.
if (dents != 0) {
if (dents < 0) {
dents++;
setToken(TokenKind.OUTDENT, pos - 1, pos);
} else {
dents--;
setToken(TokenKind.INDENT, pos - 1, pos);
}
return;
}
while (pos < buffer.length) {
if (tokenizeTwoChars()) {
pos += 2;
return;
}
char c = buffer[pos];
pos++;
switch (c) {
case '{':
setToken(TokenKind.LBRACE, pos - 1, pos);
openParenStackDepth++;
break;
case '}':
setToken(TokenKind.RBRACE, pos - 1, pos);
popParen();
break;
case '(':
setToken(TokenKind.LPAREN, pos - 1, pos);
openParenStackDepth++;
break;
case ')':
setToken(TokenKind.RPAREN, pos - 1, pos);
popParen();
break;
case '[':
setToken(TokenKind.LBRACKET, pos - 1, pos);
openParenStackDepth++;
break;
case ']':
setToken(TokenKind.RBRACKET, pos - 1, pos);
popParen();
break;
case '>':
if (lookaheadIs(0, '>') && lookaheadIs(1, '=')) {
setToken(TokenKind.GREATER_GREATER_EQUALS, pos - 1, pos + 2);
pos += 2;
} else if (lookaheadIs(0, '>')) {
setToken(TokenKind.GREATER_GREATER, pos - 1, pos + 1);
pos += 1;
} else {
setToken(TokenKind.GREATER, pos - 1, pos);
}
break;
case '<':
if (lookaheadIs(0, '<') && lookaheadIs(1, '=')) {
setToken(TokenKind.LESS_LESS_EQUALS, pos - 1, pos + 2);
pos += 2;
} else if (lookaheadIs(0, '<')) {
setToken(TokenKind.LESS_LESS, pos - 1, pos + 1);
pos += 1;
} else {
setToken(TokenKind.LESS, pos - 1, pos);
}
break;
case ':':
setToken(TokenKind.COLON, pos - 1, pos);
break;
case ',':
setToken(TokenKind.COMMA, pos - 1, pos);
break;
case '+':
setToken(TokenKind.PLUS, pos - 1, pos);
break;
case '-':
setToken(TokenKind.MINUS, pos - 1, pos);
break;
case '|':
setToken(TokenKind.PIPE, pos - 1, pos);
break;
case '=':
setToken(TokenKind.EQUALS, pos - 1, pos);
break;
case '%':
setToken(TokenKind.PERCENT, pos - 1, pos);
break;
case '~':
setToken(TokenKind.TILDE, pos - 1, pos);
break;
case '&':
setToken(TokenKind.AMPERSAND, pos - 1, pos);
break;
case '^':
setToken(TokenKind.CARET, pos - 1, pos);
break;
case '/':
if (lookaheadIs(0, '/') && lookaheadIs(1, '=')) {
setToken(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2);
pos += 2;
} else if (lookaheadIs(0, '/')) {
setToken(TokenKind.SLASH_SLASH, pos - 1, pos + 1);
pos += 1;
} else {
// /= is handled by tokenizeTwoChars.
setToken(TokenKind.SLASH, pos - 1, pos);
}
break;
case ';':
setToken(TokenKind.SEMI, pos - 1, pos);
break;
case '.':
setToken(TokenKind.DOT, pos - 1, pos);
break;
case '*':
setToken(TokenKind.STAR, pos - 1, pos);
break;
case ' ':
case '\t':
case '\r':
/* ignore */
break;
case '\\':
// Backslash character is valid only at the end of a line (or in a string)
if (lookaheadIs(0, '\n')) {
pos += 1; // skip the end of line character
} else if (lookaheadIs(0, '\r') && lookaheadIs(1, '\n')) {
pos += 2; // skip the CRLF at the end of line
} else {
setToken(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c));
}
break;
case '\n':
newline();
break;
case '#':
int oldPos = pos - 1;
while (pos < buffer.length) {
c = buffer[pos];
if (c == '\n') {
break;
} else {
pos++;
}
}
makeComment(oldPos, pos, bufferSlice(oldPos, pos));
break;
case '\'':
case '\"':
stringLiteral(c, false);
break;
default:
// detect raw strings, e.g. r"str"
if (c == 'r' && pos < buffer.length && (buffer[pos] == '\'' || buffer[pos] == '\"')) {
c = buffer[pos];
pos++;
stringLiteral(c, true);
break;
}
if (c >= '0' && c <= '9') {
integer();
} else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
identifierOrKeyword();
} else {
error("invalid character: '" + c + "'");
}
break;
} // switch
if (token.kind != null) { // stop here if we scanned a token
return;
}
} // while
if (indentStack.size() > 1) { // top of stack is always zero
setToken(TokenKind.NEWLINE, pos - 1, pos);
while (indentStack.size() > 1) {
indentStack.pop();
dents--;
}
return;
}
setToken(TokenKind.EOF, pos, pos);
}
/**
* Returns the part of the source buffer between the given offsets.
*
* @param start the beginning offset for the slice
* @param end the offset immediately following the slice
* @return the text at offset start with length end - start
*/
private String bufferSlice(int start, int end) {
return new String(this.buffer, start, end - start);
}
private void makeComment(int start, int end, String content) {
comments.add(Node.setLocation(createLocation(start, end), new Comment(content)));
}
}