| // Copyright 2014 Google Inc. All rights reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package com.google.devtools.build.lib.syntax; |
| |
| import com.google.common.collect.ImmutableMap; |
| import com.google.devtools.build.lib.concurrent.ThreadSafety.Immutable; |
| import com.google.devtools.build.lib.events.Event; |
| import com.google.devtools.build.lib.events.EventHandler; |
| import com.google.devtools.build.lib.events.Location; |
| import com.google.devtools.build.lib.profiler.Profiler; |
| import com.google.devtools.build.lib.profiler.ProfilerTask; |
| import com.google.devtools.build.lib.util.Pair; |
| import com.google.devtools.build.lib.vfs.PathFragment; |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Objects; |
| import java.util.Stack; |
| |
/**
 * A tokenizer for the BUILD language.
 *
 * <p>See the
 * <a href="https://docs.python.org/2/reference/lexical_analysis.html">Python
 * lexical analysis reference</a> for some details.
 *
 * <p>Since BUILD files are small, we just tokenize the entire file a priori
 * instead of interleaving scanning with parsing.
 */
| public final class Lexer { |
| |
| private static final Map<Character, TokenKind> EQUAL_TOKENS = |
| ImmutableMap.<Character, TokenKind>of( |
| '=', TokenKind.EQUALS_EQUALS, |
| '!', TokenKind.NOT_EQUALS, |
| '>', TokenKind.GREATER_EQUALS, |
| '<', TokenKind.LESS_EQUALS, |
| '+', TokenKind.PLUS_EQUALS); |
| |
| private final EventHandler eventHandler; |
| |
| // Input buffer and position |
| private char[] buffer; |
| private int pos; |
| |
| /** |
| * The part of the location information that is common to all LexerLocation |
| * instances created by this Lexer. Factored into a separate object so that |
| * many Locations instances can share the same information as compactly as |
| * possible, without closing over a Lexer instance. |
| */ |
| private static class LocationInfo { |
| final LineNumberTable lineNumberTable; |
| final PathFragment filename; |
| LocationInfo(PathFragment filename, LineNumberTable lineNumberTable) { |
| this.filename = filename; |
| this.lineNumberTable = lineNumberTable; |
| } |
| } |
| |
| private final LocationInfo locationInfo; |
| |
| // The stack of enclosing indentation levels; always contains '0' at the |
| // bottom. |
| private final Stack<Integer> indentStack = new Stack<>(); |
| |
| private final List<Token> tokens = new ArrayList<>(); |
| |
| // The number of unclosed open-parens ("(", '{', '[') at the current point in |
| // the stream. Whitespace is handled differently when this is nonzero. |
| private int openParenStackDepth = 0; |
| |
| private boolean containsErrors; |
| |
| private boolean parsePython; |
| |
| /** |
| * Constructs a lexer which tokenizes the contents of the specified |
| * InputBuffer. Any errors during lexing are reported on "handler". |
| */ |
| public Lexer(ParserInputSource input, EventHandler eventHandler, boolean parsePython, |
| LineNumberTable lineNumberTable) { |
| this.buffer = input.getContent(); |
| this.pos = 0; |
| this.parsePython = parsePython; |
| this.eventHandler = eventHandler; |
| this.locationInfo = new LocationInfo(input.getPath(), lineNumberTable); |
| |
| indentStack.push(0); |
| long startTime = Profiler.nanoTimeMaybe(); |
| tokenize(); |
| Profiler.instance().logSimpleTask(startTime, ProfilerTask.SKYLARK_LEXER, getFilename()); |
| } |
| |
/**
 * Creates a lexer with {@code parsePython} disabled and a line number table
 * built from the input's own content and path.
 */
public Lexer(ParserInputSource input, EventHandler eventHandler) {
  this(input, eventHandler, /*parsePython=*/false);
}

/**
 * Creates a lexer with a line number table built from the input's own content
 * and path.
 */
public Lexer(ParserInputSource input, EventHandler eventHandler, boolean parsePython) {
  this(
      input,
      eventHandler,
      parsePython,
      LineNumberTable.create(input.getContent(), input.getPath()));
}
| |
| /** |
| * Returns the filename from which the lexer's input came. Returns a dummy |
| * value if the input came from a string. |
| */ |
| public PathFragment getFilename() { |
| return locationInfo.filename; |
| } |
| |
| /** |
| * Returns true if there were errors during scanning of this input file or |
| * string. The Lexer may attempt to recover from errors, but clients should |
| * not rely on the results of scanning if this flag is set. |
| */ |
| public boolean containsErrors() { |
| return containsErrors; |
| } |
| |
| /** |
| * Returns the (mutable) list of tokens generated by the Lexer. |
| */ |
| public List<Token> getTokens() { |
| return tokens; |
| } |
| |
| private void popParen() { |
| if (openParenStackDepth == 0) { |
| error("indentation error"); |
| } else { |
| openParenStackDepth--; |
| } |
| } |
| |
| private void error(String message) { |
| error(message, pos - 1, pos - 1); |
| } |
| |
| private void error(String message, int start, int end) { |
| this.containsErrors = true; |
| eventHandler.handle(Event.error(createLocation(start, end), message)); |
| } |
| |
| Location createLocation(int start, int end) { |
| return new LexerLocation(locationInfo, start, end); |
| } |
| |
| // Don't use an inner class as we don't want to close over the Lexer, only |
| // the LocationInfo. |
| @Immutable |
| private static final class LexerLocation extends Location { |
| |
| private final LineNumberTable lineNumberTable; |
| |
| LexerLocation(LocationInfo locationInfo, int start, int end) { |
| super(start, end); |
| this.lineNumberTable = locationInfo.lineNumberTable; |
| } |
| |
| @Override |
| public PathFragment getPath() { |
| PathFragment path = lineNumberTable.getPath(getStartOffset()); |
| return path; |
| } |
| |
| @Override |
| public LineAndColumn getStartLineAndColumn() { |
| return lineNumberTable.getLineAndColumn(getStartOffset()); |
| } |
| |
| @Override |
| public LineAndColumn getEndLineAndColumn() { |
| return lineNumberTable.getLineAndColumn(getEndOffset()); |
| } |
| |
| |
| @Override |
| public int hashCode() { |
| return Objects.hash(lineNumberTable, internalHashCode()); |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| if (other == null || !other.getClass().equals(getClass())) { |
| return false; |
| } |
| LexerLocation that = (LexerLocation) other; |
| return internalEquals(that) && Objects.equals(this.lineNumberTable, that.lineNumberTable); |
| } |
| } |
| |
| /** invariant: symbol positions are half-open intervals. */ |
| private void addToken(Token s) { |
| tokens.add(s); |
| } |
| |
| /** |
| * Parses an end-of-line sequence, handling statement indentation correctly. |
| * |
| * <p>UNIX newlines are assumed (LF). Carriage returns are always ignored. |
| * |
| * <p>ON ENTRY: 'pos' is the index of the char after '\n'. |
| * ON EXIT: 'pos' is the index of the next non-space char after '\n'. |
| */ |
| private void newline() { |
| if (openParenStackDepth > 0) { |
| newlineInsideExpression(); // in an expression: ignore space |
| } else { |
| newlineOutsideExpression(); // generate NEWLINE/INDENT/OUTDENT tokens |
| } |
| } |
| |
| private void newlineInsideExpression() { |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case ' ': case '\t': case '\r': |
| pos++; |
| break; |
| default: |
| return; |
| } |
| } |
| } |
| |
| private void newlineOutsideExpression() { |
| if (pos > 1) { // skip over newline at start of file |
| addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); |
| } |
| |
| // we're in a stmt: suck up space at beginning of next line |
| int indentLen = 0; |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| if (c == ' ') { |
| indentLen++; |
| pos++; |
| } else if (c == '\t') { |
| indentLen += 8 - indentLen % 8; |
| pos++; |
| } else if (c == '\n') { // entirely blank line: discard |
| indentLen = 0; |
| pos++; |
| } else if (c == '#') { // line containing only indented comment |
| int oldPos = pos; |
| while (pos < buffer.length && c != '\n') { |
| c = buffer[pos++]; |
| } |
| addToken(new Token(TokenKind.COMMENT, oldPos, pos - 1, bufferSlice(oldPos, pos - 1))); |
| indentLen = 0; |
| } else { // printing character |
| break; |
| } |
| } |
| |
| if (pos == buffer.length) { |
| indentLen = 0; |
| } // trailing space on last line |
| |
| int peekedIndent = indentStack.peek(); |
| if (peekedIndent < indentLen) { // push a level |
| indentStack.push(indentLen); |
| addToken(new Token(TokenKind.INDENT, pos - 1, pos)); |
| |
| } else if (peekedIndent > indentLen) { // pop one or more levels |
| while (peekedIndent > indentLen) { |
| indentStack.pop(); |
| addToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); |
| peekedIndent = indentStack.peek(); |
| } |
| |
| if (peekedIndent < indentLen) { |
| error("indentation error"); |
| } |
| } |
| } |
| |
| /** |
| * Returns true if current position is in the middle of a triple quote |
| * delimiter (3 x quot), and advances 'pos' by two if so. |
| */ |
| private boolean skipTripleQuote(char quot) { |
| if (pos + 1 < buffer.length && buffer[pos] == quot && buffer[pos + 1] == quot) { |
| pos += 2; |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Scans a string literal delimited by 'quot', containing escape sequences. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first delimiter |
| * ON EXIT: 'pos' is 1 + the index of the last delimiter. |
| * |
| * @return the string-literal token. |
| */ |
| private Token escapedStringLiteral(char quot) { |
| boolean inTriplequote = skipTripleQuote(quot); |
| |
| int oldPos = pos - 1; |
| // more expensive second choice that expands escaped into a buffer |
| StringBuilder literal = new StringBuilder(); |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '\n': |
| if (inTriplequote) { |
| literal.append(c); |
| break; |
| } else { |
| error("unterminated string literal at eol", oldPos, pos); |
| newline(); |
| return new Token(TokenKind.STRING, oldPos, pos, literal.toString()); |
| } |
| case '\\': |
| if (pos == buffer.length) { |
| error("unterminated string literal at eof", oldPos, pos); |
| return new Token(TokenKind.STRING, oldPos, pos, literal.toString()); |
| } |
| c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '\n': |
| // ignore end of line character |
| break; |
| case 'n': |
| literal.append('\n'); |
| break; |
| case 'r': |
| literal.append('\r'); |
| break; |
| case 't': |
| literal.append('\t'); |
| break; |
| case '\\': |
| literal.append('\\'); |
| break; |
| case '\'': |
| literal.append('\''); |
| break; |
| case '"': |
| literal.append('"'); |
| break; |
| case '0': case '1': case '2': case '3': |
| case '4': case '5': case '6': case '7': { // octal escape |
| int octal = c - '0'; |
| if (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c >= '0' && c <= '7') { |
| pos++; |
| octal = (octal << 3) | (c - '0'); |
| if (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c >= '0' && c <= '7') { |
| pos++; |
| octal = (octal << 3) | (c - '0'); |
| } |
| } |
| } |
| } |
| literal.append((char) (octal & 0xff)); |
| break; |
| } |
| case 'a': case 'b': case 'f': case 'N': case 'u': case 'U': case 'v': case 'x': |
| // exists in Python but not implemented in Blaze => error |
| error("escape sequence not implemented: \\" + c, oldPos, pos); |
| break; |
| default: |
| // unknown char escape => "\literal" |
| literal.append('\\'); |
| literal.append(c); |
| break; |
| } |
| break; |
| case '\'': |
| case '"': |
| if (c != quot |
| || (inTriplequote && !skipTripleQuote(quot))) { |
| // Non-matching quote, treat it like a regular char. |
| literal.append(c); |
| } else { |
| // Matching close-delimiter, all done. |
| return new Token(TokenKind.STRING, oldPos, pos, literal.toString()); |
| } |
| break; |
| default: |
| literal.append(c); |
| break; |
| } |
| } |
| error("unterminated string literal at eof", oldPos, pos); |
| return new Token(TokenKind.STRING, oldPos, pos, literal.toString()); |
| } |
| |
| /** |
| * Scans a string literal delimited by 'quot'. |
| * |
| * <ul> |
| * <li> ON ENTRY: 'pos' is 1 + the index of the first delimiter |
| * <li> ON EXIT: 'pos' is 1 + the index of the last delimiter. |
| * </ul> |
| * |
| * @param isRaw if true, do not escape the string. |
| * @return the string-literal token. |
| */ |
| private Token stringLiteral(char quot, boolean isRaw) { |
| int oldPos = pos - 1; |
| |
| // Don't even attempt to parse triple-quotes here. |
| if (skipTripleQuote(quot)) { |
| pos -= 2; |
| return escapedStringLiteral(quot); |
| } |
| |
| // first quick optimistic scan for a simple non-escaped string |
| while (pos < buffer.length) { |
| char c = buffer[pos++]; |
| switch (c) { |
| case '\n': |
| error("unterminated string literal at eol", oldPos, pos); |
| Token t = new Token(TokenKind.STRING, oldPos, pos, |
| bufferSlice(oldPos + 1, pos - 1)); |
| newline(); |
| return t; |
| case '\\': |
| if (isRaw) { |
| // skip the next character |
| pos++; |
| break; |
| } else { |
| // oops, hit an escape, need to start over & build a new string buffer |
| pos = oldPos + 1; |
| return escapedStringLiteral(quot); |
| } |
| case '\'': |
| case '"': |
| if (c == quot) { |
| // close-quote, all done. |
| return new Token(TokenKind.STRING, oldPos, pos, |
| bufferSlice(oldPos + 1, pos - 1)); |
| } |
| } |
| } |
| |
| error("unterminated string literal at eof", oldPos, pos); |
| return new Token(TokenKind.STRING, oldPos, pos, |
| bufferSlice(oldPos + 1, pos)); |
| } |
| |
| private static final Map<String, TokenKind> keywordMap = new HashMap<>(); |
| |
| static { |
| keywordMap.put("and", TokenKind.AND); |
| keywordMap.put("as", TokenKind.AS); |
| keywordMap.put("assert", TokenKind.ASSERT); |
| keywordMap.put("break", TokenKind.BREAK); |
| keywordMap.put("class", TokenKind.CLASS); |
| keywordMap.put("continue", TokenKind.CONTINUE); |
| keywordMap.put("def", TokenKind.DEF); |
| keywordMap.put("del", TokenKind.DEL); |
| keywordMap.put("elif", TokenKind.ELIF); |
| keywordMap.put("else", TokenKind.ELSE); |
| keywordMap.put("except", TokenKind.EXCEPT); |
| keywordMap.put("finally", TokenKind.FINALLY); |
| keywordMap.put("for", TokenKind.FOR); |
| keywordMap.put("from", TokenKind.FROM); |
| keywordMap.put("global", TokenKind.GLOBAL); |
| keywordMap.put("if", TokenKind.IF); |
| keywordMap.put("import", TokenKind.IMPORT); |
| keywordMap.put("in", TokenKind.IN); |
| keywordMap.put("is", TokenKind.IS); |
| keywordMap.put("lambda", TokenKind.LAMBDA); |
| keywordMap.put("nonlocal", TokenKind.NONLOCAL); |
| keywordMap.put("not", TokenKind.NOT); |
| keywordMap.put("or", TokenKind.OR); |
| keywordMap.put("pass", TokenKind.PASS); |
| keywordMap.put("raise", TokenKind.RAISE); |
| keywordMap.put("return", TokenKind.RETURN); |
| keywordMap.put("try", TokenKind.TRY); |
| keywordMap.put("while", TokenKind.WHILE); |
| keywordMap.put("with", TokenKind.WITH); |
| keywordMap.put("yield", TokenKind.YIELD); |
| } |
| |
| private TokenKind getTokenKindForIdentfier(String id) { |
| TokenKind kind = keywordMap.get(id); |
| return kind == null ? TokenKind.IDENTIFIER : kind; |
| } |
| |
| private String scanIdentifier() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| switch (buffer[pos]) { |
| case '_': |
| case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': |
| case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': |
| case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': |
| case 's': case 't': case 'u': case 'v': case 'w': case 'x': |
| case 'y': case 'z': |
| case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': |
| case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': |
| case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': |
| case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': |
| case 'Y': case 'Z': |
| case '0': case '1': case '2': case '3': case '4': case '5': |
| case '6': case '7': case '8': case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| return bufferSlice(oldPos, pos); |
| } |
| |
| /** |
| * Scans an identifier or keyword. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the identifier. |
| * ON EXIT: 'pos' is 1 + the index of the last char in the identifier. |
| * |
| * @return the identifier or keyword token. |
| */ |
| private Token identifierOrKeyword() { |
| int oldPos = pos - 1; |
| String id = scanIdentifier(); |
| TokenKind kind = getTokenKindForIdentfier(id); |
| return new Token(kind, oldPos, pos, |
| (kind == TokenKind.IDENTIFIER) ? id : null); |
| } |
| |
| private String scanInteger() { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| char c = buffer[pos]; |
| switch (c) { |
| case 'X': case 'x': |
| case 'a': case 'A': |
| case 'b': case 'B': |
| case 'c': case 'C': |
| case 'd': case 'D': |
| case 'e': case 'E': |
| case 'f': case 'F': |
| case '0': case '1': |
| case '2': case '3': |
| case '4': case '5': |
| case '6': case '7': |
| case '8': case '9': |
| pos++; |
| break; |
| default: |
| return bufferSlice(oldPos, pos); |
| } |
| } |
| // TODO(bazel-team): (2009) to do roundtripping when we evaluate the integer |
| // constants, we must save the actual text of the tokens, not just their |
| // integer value. |
| |
| return bufferSlice(oldPos, pos); |
| } |
| |
| /** |
| * Scans an integer literal. |
| * |
| * <p>ON ENTRY: 'pos' is 1 + the index of the first char in the literal. |
| * ON EXIT: 'pos' is 1 + the index of the last char in the literal. |
| * |
| * @return the integer token. |
| */ |
| private Token integer() { |
| int oldPos = pos - 1; |
| String literal = scanInteger(); |
| |
| final String substring; |
| final int radix; |
| if (literal.startsWith("0x") || literal.startsWith("0X")) { |
| radix = 16; |
| substring = literal.substring(2); |
| } else if (literal.startsWith("0") && literal.length() > 1) { |
| radix = 8; |
| substring = literal.substring(1); |
| } else { |
| radix = 10; |
| substring = literal; |
| } |
| |
| int value = 0; |
| try { |
| value = Integer.parseInt(substring, radix); |
| } catch (NumberFormatException e) { |
| error("invalid base-" + radix + " integer constant: " + literal); |
| } |
| |
| return new Token(TokenKind.INT, oldPos, pos, value); |
| } |
| |
| /** |
| * Tokenizes a two-char operator. |
| * @return true if it tokenized an operator |
| */ |
| private boolean tokenizeTwoChars() { |
| if (pos + 2 >= buffer.length) { |
| return false; |
| } |
| char c1 = buffer[pos]; |
| char c2 = buffer[pos + 1]; |
| TokenKind tok = null; |
| if (c2 == '=') { |
| tok = EQUAL_TOKENS.get(c1); |
| } else if (c2 == '*' && c1 == '*') { |
| tok = TokenKind.STAR_STAR; |
| } |
| if (tok == null) { |
| return false; |
| } else { |
| addToken(new Token(tok, pos, pos + 2)); |
| return true; |
| } |
| } |
| |
| /** |
| * Performs tokenization of the character buffer of file contents provided to |
| * the constructor. |
| */ |
| private void tokenize() { |
| while (pos < buffer.length) { |
| if (tokenizeTwoChars()) { |
| pos += 2; |
| continue; |
| } |
| char c = buffer[pos]; |
| pos++; |
| switch (c) { |
| case '{': { |
| addToken(new Token(TokenKind.LBRACE, pos - 1, pos)); |
| openParenStackDepth++; |
| break; |
| } |
| case '}': { |
| addToken(new Token(TokenKind.RBRACE, pos - 1, pos)); |
| popParen(); |
| break; |
| } |
| case '(': { |
| addToken(new Token(TokenKind.LPAREN, pos - 1, pos)); |
| openParenStackDepth++; |
| break; |
| } |
| case ')': { |
| addToken(new Token(TokenKind.RPAREN, pos - 1, pos)); |
| popParen(); |
| break; |
| } |
| case '[': { |
| addToken(new Token(TokenKind.LBRACKET, pos - 1, pos)); |
| openParenStackDepth++; |
| break; |
| } |
| case ']': { |
| addToken(new Token(TokenKind.RBRACKET, pos - 1, pos)); |
| popParen(); |
| break; |
| } |
| case '>': { |
| addToken(new Token(TokenKind.GREATER, pos - 1, pos)); |
| break; |
| } |
| case '<': { |
| addToken(new Token(TokenKind.LESS, pos - 1, pos)); |
| break; |
| } |
| case ':': { |
| addToken(new Token(TokenKind.COLON, pos - 1, pos)); |
| break; |
| } |
| case ',': { |
| addToken(new Token(TokenKind.COMMA, pos - 1, pos)); |
| break; |
| } |
| case '+': { |
| addToken(new Token(TokenKind.PLUS, pos - 1, pos)); |
| break; |
| } |
| case '-': { |
| addToken(new Token(TokenKind.MINUS, pos - 1, pos)); |
| break; |
| } |
| case '|': { |
| addToken(new Token(TokenKind.PIPE, pos - 1, pos)); |
| break; |
| } |
| case '=': { |
| addToken(new Token(TokenKind.EQUALS, pos - 1, pos)); |
| break; |
| } |
| case '%': { |
| addToken(new Token(TokenKind.PERCENT, pos - 1, pos)); |
| break; |
| } |
| case '/': { |
| addToken(new Token(TokenKind.SLASH, pos - 1, pos)); |
| break; |
| } |
| case ';': { |
| addToken(new Token(TokenKind.SEMI, pos - 1, pos)); |
| break; |
| } |
| case '.': { |
| addToken(new Token(TokenKind.DOT, pos - 1, pos)); |
| break; |
| } |
| case '*': { |
| addToken(new Token(TokenKind.STAR, pos - 1, pos)); |
| break; |
| } |
| case ' ': |
| case '\t': |
| case '\r': { |
| /* ignore */ |
| break; |
| } |
| case '\\': { |
| // Backslash character is valid only at the end of a line (or in a string) |
| if (pos + 1 < buffer.length && buffer[pos] == '\n') { |
| pos++; // skip the end of line character |
| } else { |
| addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); |
| } |
| break; |
| } |
| case '\n': { |
| newline(); |
| break; |
| } |
| case '#': { |
| int oldPos = pos - 1; |
| while (pos < buffer.length) { |
| c = buffer[pos]; |
| if (c == '\n') { |
| break; |
| } else { |
| pos++; |
| } |
| } |
| addToken(new Token(TokenKind.COMMENT, oldPos, pos, bufferSlice(oldPos, pos))); |
| break; |
| } |
| case '\'': |
| case '\"': { |
| addToken(stringLiteral(c, false)); |
| break; |
| } |
| default: { |
| // detect raw strings, e.g. r"str" |
| if (c == 'r' && pos < buffer.length |
| && (buffer[pos] == '\'' || buffer[pos] == '\"')) { |
| c = buffer[pos]; |
| pos++; |
| addToken(stringLiteral(c, true)); |
| break; |
| } |
| |
| if (Character.isDigit(c)) { |
| addToken(integer()); |
| } else if (Character.isJavaIdentifierStart(c) && c != '$') { |
| addToken(identifierOrKeyword()); |
| } else { |
| // Some characters in Python are not recognized in Blaze syntax (e.g. '!') |
| if (parsePython) { |
| addToken(new Token(TokenKind.ILLEGAL, pos - 1, pos, Character.toString(c))); |
| } else { |
| error("invalid character: '" + c + "'"); |
| } |
| } |
| break; |
| } // default |
| } // switch |
| } // while |
| |
| if (indentStack.size() > 1) { // top of stack is always zero |
| addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); |
| while (indentStack.size() > 1) { |
| indentStack.pop(); |
| addToken(new Token(TokenKind.OUTDENT, pos - 1, pos)); |
| } |
| } |
| |
| // Like Python, always end with a NEWLINE token, even if no '\n' in input: |
| if (tokens.isEmpty() || tokens.get(tokens.size() - 1).kind != TokenKind.NEWLINE) { |
| addToken(new Token(TokenKind.NEWLINE, pos - 1, pos)); |
| } |
| |
| addToken(new Token(TokenKind.EOF, pos, pos)); |
| } |
| |
| /** |
| * Returns the character in the input buffer at the given position. |
| * |
| * @param at the position to get the character at. |
| * @return the character at the given position. |
| */ |
| public char charAt(int at) { |
| return buffer[at]; |
| } |
| |
| /** |
| * Returns the string at the current line, minus the new line. |
| * |
| * @param line the line from which to retrieve the String, 1-based |
| * @return the text of the line |
| */ |
| public String stringAtLine(int line) { |
| Pair<Integer, Integer> offsets = locationInfo.lineNumberTable.getOffsetsForLine(line); |
| return bufferSlice(offsets.first, offsets.second); |
| } |
| |
| /** |
| * Returns parts of the source buffer based on offsets |
| * |
| * @param start the beginning offset for the slice |
| * @param end the offset immediately following the slice |
| * @return the text at offset start with length end - start |
| */ |
| private String bufferSlice(int start, int end) { |
| return new String(this.buffer, start, end - start); |
| } |
| |
| } |