blob: de1e07ffd37c01d38f717a36261e1adf54309e97 [file] [log] [blame]
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview
* some functions for browser-side pretty printing of code contained in html.
*
* The lexer should work on a number of languages including C and friends,
* Java, Python, Bash, SQL, HTML, XML, CSS, Javascript, and Makefiles.
* It works passably on Ruby, PHP and Awk and a decent subset of Perl, but,
* because of commenting conventions, doesn't work on Smalltalk, Lisp-like, or
* CAML-like languages.
*
* If there's a language not mentioned here, then I don't know it, and don't
* know whether it works. If it has a C-like, Bash-like, or XML-like syntax
* then it should work passably.
*
* Usage:
* 1) include this source file in an html page via
* <script type="text/javascript" src="/path/to/prettify.js"></script>
* 2) define style rules. See the example page for examples.
* 3) mark the <pre> and <code> tags in your source with class=prettyprint.
* You can also use the (html deprecated) <xmp> tag, but the pretty printer
* needs to do more substantial DOM manipulations to support that, so some
* css styles may not be preserved.
* That's it. I wanted to keep the API as simple as possible, so there's no
* need to specify which language the code is in.
*
* Change log:
* cbeust, 2006/08/22
* Java annotations (start with "@") are now captured as literals ("lit")
*/
var PR_keywords = {};
/** initialize the keyword list for our target languages. */
(function () {
var CPP_KEYWORDS = "abstract bool break case catch char class const " +
"const_cast continue default delete deprecated dllexport dllimport do " +
"double dynamic_cast else enum explicit extern false float for friend " +
"goto if inline int long mutable naked namespace new noinline noreturn " +
"nothrow novtable operator private property protected public register " +
"reinterpret_cast return selectany short signed sizeof static " +
"static_cast struct switch template this thread throw true try typedef " +
"typeid typename union unsigned using declaration, directive uuid " +
"virtual void volatile while typeof";
var CSHARP_KEYWORDS = "as base by byte checked decimal delegate descending " +
"event finally fixed foreach from group implicit in interface internal " +
"into is lock null object out override orderby params readonly ref sbyte " +
"sealed stackalloc string select uint ulong unchecked unsafe ushort var";
var JAVA_KEYWORDS = "package synchronized boolean implements import throws " +
"instanceof transient extends final strictfp native super";
var JSCRIPT_KEYWORDS = "debugger export function with NaN Infinity";
var PERL_KEYWORDS = "require sub unless until use elsif BEGIN END";
var PYTHON_KEYWORDS = "and assert def del elif except exec global lambda " +
"not or pass print raise yield False True None";
var RUBY_KEYWORDS = "then end begin rescue ensure module when undef next " +
"redo retry alias defined";
var SH_KEYWORDS = "done fi";
var KEYWORDS = [CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS,
JSCRIPT_KEYWORDS, PERL_KEYWORDS, PYTHON_KEYWORDS,
RUBY_KEYWORDS, SH_KEYWORDS];
for ( var k = 0; k < KEYWORDS.length; k++ ) {
var kw = KEYWORDS[k].split(' ');
for ( var i = 0; i < kw.length; i++ ) {
if ( kw[i] ) {
PR_keywords[kw[i]] = true;
}
}
}
}).call(this);
// token style names. correspond to css classes
/** token style for a string literal */
var PR_STRING = 'str';
/** token style for a keyword */
var PR_KEYWORD = 'kwd';
/** token style for a comment */
var PR_COMMENT = 'com';
/** token style for a type */
var PR_TYPE = 'typ';
/** token style for a literal value. e.g. 1, null, true. */
var PR_LITERAL = 'lit';
/** token style for a punctuation string. */
var PR_PUNCTUATION = 'pun';
/** token style for a punctuation string. */
var PR_PLAIN = 'pln';
/** token style for an sgml tag. */
var PR_TAG = 'tag';
/** token style for a markup declaration such as a DOCTYPE. */
var PR_DECLARATION = 'dec';
/** token style for embedded source. */
var PR_SOURCE = 'src';
/** token style for an sgml attribute name. */
var PR_ATTRIB_NAME = 'atn';
/** token style for an sgml attribute value. */
var PR_ATTRIB_VALUE = 'atv';
/** the number of characters between tab columns */
var PR_TAB_WIDTH = 8;
/** the position of the end of a token during. A division of a string into
* n tokens can be represented as a series n - 1 token ends, as long as
* runs of whitespace warrant their own token.
* @private
*/
function PR_TokenEnd( end, style ) {
if ( undefined === style ) {
throw new Error('BAD');
}
if ( 'number' != typeof(end) ) {
throw new Error('BAD');
}
this.end = end;
this.style = style;
}
PR_TokenEnd.prototype.toString = function () {
return '[PR_TokenEnd ' + this.end +
(this.style ? ':' + this.style : '') + ']';
};
/** a chunk of text with a style. These are used to represent both the output
* from the lexing functions as well as intermediate results.
* @constructor
* @param token the token text
* @param style one of the token styles defined in designdoc-template, or null
* for a styleless token, such as an embedded html tag.
* @private
*/
function PR_Token( token, style ) {
if ( undefined === style ) {
throw new Error('BAD');
}
this.token = token;
this.style = style;
}
PR_Token.prototype.toString = function () {
return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
};
/** a helper class that decodes common html entities used to escape special
* characters in source code.
* @constructor
* @private
*/
function PR_DecodeHelper() {
this.next = 0;
this.ch = '\0';
}
var PR_NAMED_ENTITIES = {
'lt': '<',
'gt': '>',
'quot': '"',
'apos': "'",
'amp': '&' // reencoding requires that & always be decoded properly
};
PR_DecodeHelper.prototype.decode = function ( s, i ) {
var next = i + 1;
var ch = s.charAt(i);
if ( '&' === ch ) {
var semi = s.indexOf(';', next);
if ( semi >= 0 && semi < next + 4 ) {
var entityName = s.substring(next, semi);
var decoded = null;
if ( entityName.charAt(0) === '#' ) { // check for numeric entity
var ch1 = entityName.charAt(1);
var charCode;
if ( ch1 === 'x' || ch1 === 'X' ) { // like &#xA0;
charCode = parseInt(entityName.substring(2), 16);
}
else { // like &#160;
charCode = parseInt(entityName.substring(1), 10);
}
if ( !isNaN(charCode) ) {
decoded = String.fromCharCode(charCode);
}
}
if ( !decoded ) {
decoded = PR_NAMED_ENTITIES[entityName.toLowerCase()];
}
if ( decoded ) {
ch = decoded;
next = semi + 1;
}
else { // skip over unrecognized entity
next = i + 1;
ch = '\0';
}
}
}
this.next = next;
this.ch = ch;
return this.ch;
};
// some string utilities
function PR_isWordChar( ch ) {
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
function PR_isIdentifierStart( ch ) {
return PR_isWordChar(ch) || ch == '_' || ch == '$' || ch == '@';
}
function PR_isIdentifierPart( ch ) {
return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
}
function PR_isSpaceChar( ch ) {
return "\t \r\n".indexOf(ch) >= 0;
}
function PR_isDigitChar( ch ) {
return ch >= '0' && ch <= '9';
}
function PR_trim( s ) {
var i = 0, j = s.length - 1;
while ( i <= j && PR_isSpaceChar(s.charAt(i)) ) {
++i;
}
while ( j > i && PR_isSpaceChar(s.charAt(j)) ) {
--j;
}
return s.substring(i, j + 1);
}
function PR_startsWith( s, prefix ) {
return s.length >= prefix.length && prefix == s.substring(0, prefix.length);
}
function PR_endsWith( s, suffix ) {
return s.length >= suffix.length &&
suffix == s.substring(s.length - suffix.length, s.length);
}
/** true iff prefix matches the first prefix characters in chars[0:len].
* @private
*/
function PR_prefixMatch( chars, len, prefix ) {
if ( len < prefix.length ) {
return false;
}
for ( var i = 0, n = prefix.length; i < n; ++i ) {
if ( prefix.charAt(i) != chars[i] ) {
return false;
}
}
return true;
}
/** like textToHtml but escapes double quotes to be attribute safe. */
function PR_attribToHtml( str ) {
return str.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/\"/g, '&quot;')
.replace(/\xa0/, '&nbsp;');
}
/** escapest html special characters to html. */
function PR_textToHtml( str ) {
return str.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/\xa0/g, '&nbsp;');
}
/** is the given node's innerHTML normally unescaped? */
function PR_isRawContent( node ) {
return 'XMP' == node.tagName;
}
var PR_innerHtmlWorks = null;
function PR_getInnerHtml( node ) {
// inner html is hopelessly broken in Safari 2.0.4 when the content is
// an html description of well formed XML and the containing tag is a PRE
// tag, so we detect that case and emulate innerHTML.
if ( null == PR_innerHtmlWorks ) {
var testNode = document.createElement('PRE');
testNode.appendChild(
document.createTextNode('<!DOCTYPE foo PUBLIC "foo bar">\n<foo />'));
PR_innerHtmlWorks = !/</.test(testNode.innerHTML);
}
if ( PR_innerHtmlWorks ) {
var content = node.innerHTML;
// XMP tags contain unescaped entities so require special handling.
if ( PR_isRawContent(node) ) {
content = PR_textToHtml(content);
}
return content;
}
var out = [];
for ( var child = node.firstChild; child; child = child.nextSibling ) {
PR_normalizedHtml(child, out);
}
return out.join('');
}
/**
* walks the DOM returning a properly escaped version of innerHTML.
*/
function PR_normalizedHtml( node, out ) {
switch ( node.nodeType ) {
case 1: // an element
var name = node.tagName.toLowerCase();
out.push('\074', name);
for ( var i = 0; i < node.attributes.length; ++i ) {
var attr = node.attributes[i];
if ( !attr.specified ) {
continue;
}
out.push(' ');
PR_normalizedHtml(attr, out);
}
out.push('>');
for ( var child = node.firstChild; child; child = child.nextSibling ) {
PR_normalizedHtml(child, out);
}
if ( node.firstChild || !/^(?:br|link|img)$/.test(name) ) {
out.push('<\/', name, '>');
}
break;
case 2: // an attribute
out.push(node.name.toLowerCase(), '="', PR_attribToHtml(node.value), '"');
break;
case 3: case 4: // text
out.push(PR_textToHtml(node.nodeValue));
break;
}
}
/** expand tabs to spaces
* @param {Array} chunks PR_Tokens possibly containing tabs
* @param {Number} tabWidth number of spaces between tab columns
* @return {Array} chunks with tabs replaced with spaces
*/
function PR_expandTabs( chunks, tabWidth ) {
var SPACES = ' ';
var charInLine = 0;
var decodeHelper = new PR_DecodeHelper();
var chunksOut = []
for ( var chunkIndex = 0; chunkIndex < chunks.length; ++chunkIndex ) {
var chunk = chunks[chunkIndex];
if ( chunk.style == null ) {
chunksOut.push(chunk);
continue;
}
var s = chunk.token;
var pos = 0;
// index of last character output
var out = [];
// walk over each character looking for tabs and newlines.
// On tabs, expand them. On newlines, reset charInLine.
// Otherwise increment charInLine
for ( var charIndex = 0, n = s.length; charIndex < n;
charIndex = decodeHelper.next ) {
decodeHelper.decode(s, charIndex);
var ch = decodeHelper.ch;
switch ( ch ) {
case '\t':
out.push(s.substring(pos, charIndex));
// calculate how much space we need in front of this part
// nSpaces is the amount of padding -- the number of spaces needed to
// move us to the next column, where columns occur at factors of
// tabWidth.
var nSpaces = tabWidth - (charInLine % tabWidth);
charInLine += nSpaces;
for ( ; nSpaces >= 0; nSpaces -= SPACES.length ) {
out.push(SPACES.substring(0, nSpaces));
}
pos = decodeHelper.next;
break;
case '\n': case '\r':
charInLine = 0;
break;
default:
++charInLine;
}
}
out.push(s.substring(pos));
chunksOut.push(new PR_Token(out.join(''), chunk.style));
}
return chunksOut
}
/** split markup into chunks of html tags (style null) and
* plain text (style {@link #PR_PLAIN}).
*
* @param {String} s html.
* @return {Array} of PR_Tokens of style PR_PLAIN, and null.
* @private
*/
function PR_chunkify( s ) {
// The below pattern matches one of the following
// (1) /[^<]+/ : A run of characters other than '<'
// (2) /<\/?[a-zA-Z][^>]*>/ : A probably tag that should not be highlighted
// (3) /</ : A '<' that does not begin a larger chunk. Treated as 1
var chunkPattern = /(?:[^<]+|<\/?[a-zA-Z][^>]*>|<)/g;
// since the pattern has the 'g' modifier and defines no capturing groups,
// this will return a list of all chunks which we then classify and wrap as
// PR_Tokens
var matches = s.match(chunkPattern);
var chunks = [];
if ( matches ) {
var lastChunk = null;
for ( var i = 0, n = matches.length; i < n; ++i ) {
var chunkText = matches[i];
var style;
if ( chunkText.length < 2 || chunkText.charAt(0) !== '<' ) {
if ( lastChunk && lastChunk.style === PR_PLAIN ) {
lastChunk.token += chunkText;
continue;
}
style = PR_PLAIN;
}
else { // a tag
style = null;
}
lastChunk = new PR_Token(chunkText, style);
chunks.push(lastChunk);
}
}
return chunks;
}
/** walk the tokenEnds list and the chunk list in parallel to generate a list
* of split tokens.
* @private
*/
function PR_splitChunks( chunks, tokenEnds ) {
var tokens = [];
// the output
var ci = 0;
// index into chunks
// position of beginning of amount written so far in absolute space.
var posAbs = 0;
// position of amount written so far in chunk space
var posChunk = 0;
// current chunk
var chunk = new PR_Token('', null);
for ( var ei = 0, ne = tokenEnds.length, lastEnd = 0; ei < ne; ++ei ) {
var tokenEnd = tokenEnds[ei];
var end = tokenEnd.end;
if ( end === lastEnd ) {
continue;
} // skip empty regions
var tokLen = end - posAbs;
var remainingInChunk = chunk.token.length - posChunk;
while ( remainingInChunk <= tokLen ) {
if ( remainingInChunk > 0 ) {
tokens.push(
new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
null == chunk.style ? null : tokenEnd.style));
}
posAbs += remainingInChunk;
posChunk = 0;
if ( ci < chunks.length ) {
chunk = chunks[ci++];
}
tokLen = end - posAbs;
remainingInChunk = chunk.token.length - posChunk;
}
if ( tokLen ) {
tokens.push(
new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
tokenEnd.style));
posAbs += tokLen;
posChunk += tokLen;
}
}
return tokens;
}
/** splits markup tokens into declarations, tags, and source chunks.
* @private
*/
function PR_splitMarkup( chunks ) {
// A state machine to split out declarations, tags, etc.
// This state machine deals with absolute space in the text, indexed by k,
// and position in the current chunk, indexed by pos and tokenStart to
// generate a list of the ends of tokens.
// Absolute space is calculated by considering the chunks as appended into
// one big string, as they were before being split.
// Known failure cases
// Server side scripting sections such as <?...?> in attributes.
// i.e. <span class="<? foo ?>">
// Handling this would require a stack, and we don't use PHP.
// The output: a list of pairs of PR_TokenEnd instances
var tokenEnds = [];
var state = 0;
// FSM state variable
var k = 0;
// position in absolute space of the start of the current chunk
var tokenStart = -1;
// the start of the current token
// Try to find a closing tag for any open <style> or <script> tags
// We can't do this at a later stage because then the following case
// would fail:
// <script>document.writeln('<!--');</script>
// We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
// can check whether to enter into a no scripting section when the tag ends.
var tokenChars = new Array(12);
var tokenCharsI = 0;
// if non null, the tag prefix that we need to see to break out.
var endScriptTag = null;
var decodeHelper = new PR_DecodeHelper();
for ( var ci = 0, nc = chunks.length; ci < nc; ++ci ) {
var chunk = chunks[ci];
if ( PR_PLAIN != chunk.style ) {
k += chunk.token.length;
continue;
}
var s = chunk.token;
var pos = 0;
// the position past the last character processed so far in s
for ( var i = 0, n = s.length; i < n; /* i = next at bottom */ ) {
decodeHelper.decode(s, i);
var ch = decodeHelper.ch;
var next = decodeHelper.next;
var tokenStyle = null;
switch ( state ) {
case 0:
if ( '<' == ch ) {
state = 1;
}
break;
case 1:
tokenCharsI = 0;
if ( '/' == ch ) { // only consider close tags if we're in script/style
state = 7;
}
else if ( null == endScriptTag ) {
if ( '!' == ch ) {
state = 2;
}
else if ( PR_isWordChar(ch) ) {
state = 8;
}
else if ( '?' == ch ) {
state = 9;
}
else if ( '%' == ch ) {
state = 11;
}
else if ( '<' != ch ) {
state = 0;
}
}
else if ( '<' != ch ) {
state = 0;
}
break;
case 2:
if ( '-' == ch ) {
state = 4;
}
else if ( PR_isWordChar(ch) ) {
state = 3;
}
else if ( '<' == ch ) {
state = 1;
}
else {
state = 0;
}
break;
case 3:
if ( '>' == ch ) {
state = 0;
tokenStyle = PR_DECLARATION;
}
break;
case 4:
if ( '-' == ch ) {
state = 5;
}
break;
case 5:
if ( '-' == ch ) {
state = 6;
}
break;
case 6:
if ( '>' == ch ) {
state = 0;
tokenStyle = PR_COMMENT;
}
else if ( '-' == ch ) {
state = 6;
}
else {
state = 4;
}
break;
case 7:
if ( PR_isWordChar(ch) ) {
state = 8;
}
else if ( '<' == ch ) {
state = 1;
}
else {
state = 0;
}
break;
case 8:
if ( '>' == ch ) {
state = 0;
tokenStyle = PR_TAG;
}
break;
case 9:
if ( '?' == ch ) {
state = 10;
}
break;
case 10:
if ( '>' == ch ) {
state = 0;
tokenStyle = PR_SOURCE;
}
else if ( '?' != ch ) {
state = 9;
}
break;
case 11:
if ( '%' == ch ) {
state = 12;
}
break;
case 12:
if ( '>' == ch ) {
state = 0;
tokenStyle = PR_SOURCE;
}
else if ( '%' != ch ) {
state = 11;
}
break;
}
if ( tokenCharsI < tokenChars.length ) {
tokenChars[tokenCharsI++] = ch.toLowerCase();
}
if ( 1 == state ) {
tokenStart = k + i;
}
i = next;
if ( tokenStyle != null ) {
if ( null != tokenStyle ) {
if ( endScriptTag ) {
if ( PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag) ) {
endScriptTag = null;
}
}
else {
if ( PR_prefixMatch(tokenChars, tokenCharsI, 'script') ) {
endScriptTag = '/script';
}
else if ( PR_prefixMatch(tokenChars, tokenCharsI, 'style') ) {
endScriptTag = '/style';
}
else if ( PR_prefixMatch(tokenChars, tokenCharsI, 'xmp') ) {
endScriptTag = '/xmp';
}
}
// disallow the tag if endScriptTag is set and this was not an open
// tag.
if ( endScriptTag && tokenCharsI && '/' == tokenChars[0] ) {
tokenStyle = null;
}
}
if ( null != tokenStyle ) {
tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
}
}
}
k += chunk.token.length;
}
tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));
return tokenEnds;
}
/** splits the given string into comment, string, and "other" tokens.
* @return {Array} of PR_Tokens with style in
* (PR_STRING, PR_COMMENT, PR_PLAIN, null)
* The result array may contain spurious zero length tokens. Ignore them.
*
* @private
*/
function PR_splitStringAndCommentTokens( chunks ) {
// a state machine to split out comments, strings, and other stuff
var tokenEnds = [];
// positions of ends of tokens in absolute space
var state = 0;
// FSM state variable
var delim = -1;
// string delimiter
var k = 0;
// absolute position of beginning of current chunk
for ( var ci = 0, nc = chunks.length; ci < nc; ++ci ) {
var chunk = chunks[ci];
var s = chunk.token;
if ( PR_PLAIN == chunk.style ) {
var decodeHelper = new PR_DecodeHelper();
var last = -1;
var next;
for ( var i = 0, n = s.length; i < n; last = i,i = next ) {
decodeHelper.decode(s, i);
var ch = decodeHelper.ch;
next = decodeHelper.next;
if ( 0 == state ) {
if ( ch == '"' || ch == '\'' || ch == '`' ) {
tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
state = 1;
delim = ch;
}
else if ( ch == '/' ) {
state = 3;
}
else if ( ch == '#' ) {
tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
state = 4;
}
}
else if ( 1 == state ) {
if ( ch == delim ) {
state = 0;
tokenEnds.push(new PR_TokenEnd(k + next, PR_STRING));
}
else if ( ch == '\\' ) {
state = 2;
}
}
else if ( 2 == state ) {
state = 1;
}
else if ( 3 == state ) {
if ( ch == '/' ) {
state = 4;
tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
}
else if ( ch == '*' ) {
state = 5;
tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
}
else {
state = 0;
// next loop will reenter state 0 without same value of i, so
// ch will be reconsidered as start of new token.
next = i;
}
}
else if ( 4 == state ) {
if ( ch == '\r' || ch == '\n' ) {
state = 0;
tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
}
}
else if ( 5 == state ) {
if ( ch == '*' ) {
state = 6;
}
}
else if ( 6 == state ) {
if ( ch == '/' ) {
state = 0;
tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT));
}
else if ( ch != '*' ) {
state = 5;
}
}
}
}
k += s.length;
}
var endTokenType;
switch ( state ) {
case 1: case 2:
endTokenType = PR_STRING;
break;
case 4: case 5: case 6:
endTokenType = PR_COMMENT;
break;
default:
endTokenType = PR_PLAIN;
break;
}
// handle unclosed token which can legally happen for line comments (state 4)
tokenEnds.push(new PR_TokenEnd(k, endTokenType));
// a token ends at the end
return PR_splitChunks(chunks, tokenEnds);
}
/** used by lexSource to split a non string, non comment token.
* @private
*/
function PR_splitNonStringNonCommentToken( s, outlist ) {
var pos = 0;
var state = 0;
var decodeHelper = new PR_DecodeHelper();
var next;
for ( var i = 0; i <= s.length; i = next ) {
if ( i == s.length ) {
// nstate will not be equal to state, so it will append the token
nstate = -2;
next = i + 1;
}
else {
decodeHelper.decode(s, i);
next = decodeHelper.next;
var ch = decodeHelper.ch;
// the next state.
// if set to -1 then it will cause a reentry to state 0 without consuming
// another character.
var nstate = state;
switch ( state ) {
case 0: // whitespace state
if ( PR_isIdentifierStart(ch) ) {
nstate = 1;
}
else if ( PR_isDigitChar(ch) ) {
nstate = 2;
}
else if ( !PR_isSpaceChar(ch) ) {
nstate = 3;
}
if ( nstate && pos < i ) {
var t = s.substring(pos, i);
outlist.push(new PR_Token(t, PR_PLAIN));
pos = i;
}
break;
case 1: // identifier state
if ( !PR_isIdentifierPart(ch) ) {
nstate = -1;
}
break;
case 2: // number literal state
// handle numeric literals like
// 0x7f 300UL 100_000
// this does not treat floating point values as a single literal
// 0.1 and 3e-6
// are each split into multiple tokens
if ( !(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_') ) {
nstate = -1;
}
break;
case 3: // punctuation state
if ( PR_isIdentifierStart(ch) || PR_isDigitChar(ch) ||
PR_isSpaceChar(ch) ) {
nstate = -1;
}
break;
}
}
if ( nstate != state ) {
if ( nstate < 0 ) {
if ( i > pos ) {
var t = s.substring(pos, i);
var wordDecodeHelper = new PR_DecodeHelper();
wordDecodeHelper.decode(t, 0);
var ch0 = wordDecodeHelper.ch;
var isSingleCharacter = wordDecodeHelper.next == t.length;
var style;
if ( PR_isIdentifierStart(ch0) ) {
if ( PR_keywords[t] ) {
style = PR_KEYWORD;
}
else if ( ch0 === '@' ) {
style = PR_LITERAL;
}
else {
// Treat any word that starts with an uppercase character and
// contains at least one lowercase character as a type, or
// ends with _t.
// This works perfectly for Java, pretty well for C++, and
// passably for Python. The _t catches C structs.
var isType = false;
if ( ch0 >= 'A' && ch0 <= 'Z' ) {
for ( var j = wordDecodeHelper.next;
j < t.length; j = wordDecodeHelper.next ) {
wordDecodeHelper.decode(t, j);
var ch1 = wordDecodeHelper.ch;
if ( ch1 >= 'a' && ch1 <= 'z' ) {
isType = true;
break;
}
}
if ( !isType && !isSingleCharacter &&
t.substring(t.length - 2) == '_t' ) {
isType = true;
}
}
style = isType ? PR_TYPE : PR_PLAIN;
}
}
else if ( PR_isDigitChar(ch0) ) {
style = PR_LITERAL;
}
else if ( !PR_isSpaceChar(ch0) ) {
style = PR_PUNCTUATION;
}
else {
style = PR_PLAIN;
}
pos = i;
outlist.push(new PR_Token(t, style));
}
state = 0;
if ( nstate == -1 ) {
// don't increment. This allows us to use state 0 to redispatch based
// on the current character.
next = i;
continue;
}
}
state = nstate;
}
}
}
/** split a group of chunks of markup.
* @private
*/
function PR_tokenizeMarkup( chunks ) {
if ( !(chunks && chunks.length) ) {
return chunks;
}
var tokenEnds = PR_splitMarkup(chunks);
return PR_splitChunks(chunks, tokenEnds);
}
/** split tags attributes and their values out from the tag name, and
* recursively lex source chunks.
* @private
*/
function PR_splitTagAttributes( tokens ) {
var tokensOut = [];
var state = 0;
var stateStyle = PR_TAG;
var delim = null;
// attribute delimiter for quoted value state.
var decodeHelper = new PR_DecodeHelper();
for ( var ci = 0; ci < tokens.length; ++ci ) {
var tok = tokens[ci];
if ( PR_TAG == tok.style ) {
var s = tok.token;
var start = 0;
for ( var i = 0; i < s.length; /* i = next at bottom */ ) {
decodeHelper.decode(s, i);
var ch = decodeHelper.ch;
var next = decodeHelper.next;
var emitEnd = null;
// null or position of end of chunk to emit.
var nextStyle = null;
// null or next value of stateStyle
if ( ch == '>' ) {
if ( PR_TAG != stateStyle ) {
emitEnd = i;
nextStyle = PR_TAG;
}
}
else {
switch ( state ) {
case 0:
if ( '<' == ch ) {
state = 1;
}
break;
case 1:
if ( PR_isSpaceChar(ch) ) {
state = 2;
}
break;
case 2:
if ( !PR_isSpaceChar(ch) ) {
nextStyle = PR_ATTRIB_NAME;
emitEnd = i;
state = 3;
}
break;
case 3:
if ( '=' == ch ) {
emitEnd = i;
nextStyle = PR_TAG;
state = 5;
}
else if ( PR_isSpaceChar(ch) ) {
emitEnd = i;
nextStyle = PR_TAG;
state = 4;
}
break;
case 4:
if ( '=' == ch ) {
state = 5;
}
else if ( !PR_isSpaceChar(ch) ) {
emitEnd = i;
nextStyle = PR_ATTRIB_NAME;
state = 3;
}
break;
case 5:
if ( '"' == ch || '\'' == ch ) {
emitEnd = i;
nextStyle = PR_ATTRIB_VALUE;
state = 6;
delim = ch;
}
else if ( !PR_isSpaceChar(ch) ) {
emitEnd = i;
nextStyle = PR_ATTRIB_VALUE;
state = 7;
}
break;
case 6:
if ( ch == delim ) {
emitEnd = next;
nextStyle = PR_TAG;
state = 2;
}
break;
case 7:
if ( PR_isSpaceChar(ch) ) {
emitEnd = i;
nextStyle = PR_TAG;
state = 2;
}
break;
}
}
if ( emitEnd ) {
if ( emitEnd > start ) {
tokensOut.push(
new PR_Token(s.substring(start, emitEnd), stateStyle));
start = emitEnd;
}
stateStyle = nextStyle;
}
i = next;
}
if ( s.length > start ) {
tokensOut.push(new PR_Token(s.substring(start, s.length), stateStyle));
}
}
else {
if ( tok.style ) {
state = 0;
stateStyle = PR_TAG;
}
tokensOut.push(tok);
}
}
return tokensOut;
}
/** identify regions of markup that are really source code, and recursivley
* lex them.
* @private
*/
function PR_splitSourceNodes( tokens ) {
var tokensOut = [];
// when we see a <script> tag, store '/' here so that we know to end the
// source processing
var endScriptTag = null;
var decodeHelper = new PR_DecodeHelper();
var sourceChunks = null;
for ( var ci = 0, nc = tokens.length; /* break below */; ++ci ) {
var tok;
if ( ci < nc ) {
tok = tokens[ci];
if ( null == tok.style ) {
tokens.push(tok);
continue;
}
}
else if ( !endScriptTag ) {
break;
}
else {
// else pretend there's an end tag so we can gracefully handle
// unclosed source blocks
tok = new PR_Token('', null);
}
var s = tok.token;
if ( null == endScriptTag ) {
if ( PR_SOURCE == tok.style ) {
// split off any starting and trailing <?, <%
if ( '<' == decodeHelper.decode(s, 0) ) {
decodeHelper.decode(s, decodeHelper.next);
if ( '%' == decodeHelper.ch || '?' == decodeHelper.ch ) {
endScriptTag = decodeHelper.ch;
tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
PR_TAG));
s = s.substring(decodeHelper.next, s.length);
}
}
}
else if ( PR_TAG == tok.style ) {
if ( '<' == decodeHelper.decode(s, 0) &&
'/' != s.charAt(decodeHelper.next) ) {
var tagContent = s.substring(decodeHelper.next).toLowerCase();
// FIXME(msamuel): this does not mirror exactly the code in
// in PR_splitMarkup that defers splitting tags inside script and
// style blocks.
if ( PR_startsWith(tagContent, 'script') ||
PR_startsWith(tagContent, 'style') ||
PR_startsWith(tagContent, 'xmp') ) {
endScriptTag = '/';
}
}
}
}
if ( null != endScriptTag ) {
var endTok = null;
if ( PR_SOURCE == tok.style ) {
if ( endScriptTag == '%' || endScriptTag == '?' ) {
var pos = s.lastIndexOf(endScriptTag);
if ( pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
s.length == decodeHelper.next ) {
endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
s = s.substring(0, pos);
}
}
if ( null == sourceChunks ) {
sourceChunks = [];
}
sourceChunks.push(new PR_Token(s, PR_PLAIN));
}
else if ( PR_PLAIN == tok.style ) {
if ( null == sourceChunks ) {
sourceChunks = [];
}
sourceChunks.push(tok);
}
else if ( PR_TAG == tok.style ) {
// if it starts with </ then it must be the end tag.
if ( '<' == decodeHelper.decode(tok.token, 0) &&
tok.token.length > decodeHelper.next &&
'/' == decodeHelper.decode(tok.token, decodeHelper.next) ) {
endTok = tok;
}
else {
tokensOut.push(tok);
}
}
else if ( ci >= nc ) {
// force the token to close
endTok = tok;
}
else {
if ( sourceChunks ) {
sourceChunks.push(tok);
}
else {
// push remaining tag and attribute tokens from the opening tag
tokensOut.push(tok);
}
}
if ( endTok ) {
if ( sourceChunks ) {
var sourceTokens = PR_lexSource(sourceChunks);
tokensOut.push(new PR_Token('<span class=embsrc>', null));
for ( var si = 0, ns = sourceTokens.length; si < ns; ++si ) {
tokensOut.push(sourceTokens[si]);
}
tokensOut.push(new PR_Token('</span>', null));
sourceChunks = null;
}
if ( endTok.token ) {
tokensOut.push(endTok);
}
endScriptTag = null;
}
}
else {
tokensOut.push(tok);
}
}
return tokensOut;
}
/** splits the quotes from an attribute value.
* ['"foo"'] -> ['"', 'foo', '"']
* @private
*/
function PR_splitAttributeQuotes( tokens ) {
var firstPlain = null, lastPlain = null;
for ( var i = 0; i < tokens.length; ++i ) {
if ( PR_PLAIN == tokens[i].style ) {
firstPlain = i;
break;
}
}
for ( var i = tokens.length; --i >= 0; ) {
if ( PR_PLAIN == tokens[i].style ) {
lastPlain = i;
break;
}
}
if ( null == firstPlain ) {
return tokens;
}
var decodeHelper = new PR_DecodeHelper();
var fs = tokens[firstPlain].token;
var fc = decodeHelper.decode(fs, 0);
if ( '"' != fc && '\'' != fc ) {
return tokens;
}
var fpos = decodeHelper.next;
var ls = tokens[lastPlain].token;
var lpos = ls.lastIndexOf('&');
if ( lpos < 0 ) {
lpos = ls.length - 1;
}
var lc = decodeHelper.decode(ls, lpos);
if ( lc != fc || decodeHelper.next != ls.length ) {
lc = null;
lpos = ls.length;
}
var tokensOut = [];
for ( var i = 0; i < firstPlain; ++i ) {
tokensOut.push(tokens[i]);
}
tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
if ( lastPlain == firstPlain ) {
tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
}
else {
tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
for ( var i = firstPlain + 1; i < lastPlain; ++i ) {
tokensOut.push(tokens[i]);
}
if ( lc ) {
tokens.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
}
else {
tokens.push(tokens[lastPlain]);
}
}
if ( lc ) {
tokensOut.push(new PR_Token(ls.substring(lpos, ls.length), PR_PLAIN));
}
for ( var i = lastPlain + 1; i < tokens.length; ++i ) {
tokensOut.push(tokens[i]);
}
return tokensOut;
}
/** identify attribute values that really contain source code and recursively
* lex them.
* @private
*/
function PR_splitSourceAttributes( tokens ) {
var tokensOut = [];
var sourceChunks = null;
var inSource = false;
var name = '';
for ( var ci = 0, nc = tokens.length; ci < nc; ++ci ) {
var tok = tokens[ci];
var outList = tokensOut;
if ( PR_TAG == tok.style ) {
if ( inSource ) {
inSource = false;
name = '';
if ( sourceChunks ) {
tokensOut.push(new PR_Token('<span class=embsrc>', null));
var sourceTokens =
PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
for ( var si = 0, ns = sourceTokens.length; si < ns; ++si ) {
tokensOut.push(sourceTokens[si]);
}
tokensOut.push(new PR_Token('</span>', null));
sourceChunks = null;
}
}
else if ( name && tok.token.indexOf('=') >= 0 ) {
var nameLower = name.toLowerCase();
if ( PR_startsWith(nameLower, 'on') || 'style' == nameLower ) {
inSource = true;
}
}
else {
name = '';
}
}
else if ( PR_ATTRIB_NAME == tok.style ) {
name += tok.token;
}
else if ( PR_ATTRIB_VALUE == tok.style ) {
if ( inSource ) {
if ( null == sourceChunks ) {
sourceChunks = [];
}
outList = sourceChunks;
tok = new PR_Token(tok.token, PR_PLAIN);
}
}
else {
if ( sourceChunks ) {
outList = sourceChunks;
}
}
outList.push(tok);
}
return tokensOut;
}
/** returns a list of PR_Token objects given chunks of source code.
*
* This code treats ", ', and ` as string delimiters, and \ as a string escape.
* It does not recognize perl's qq() style strings. It has no special handling
* for double delimiter escapes as in basic, or tje tripled delimiters used in
* python, but should work on those regardless although in those cases a single
* string literal may be broken up into multiple adjacent string literals.
*
* It recognizes C, C++, and shell style comments.
*
* @param chunks PR_Tokens with style in (null, PR_PLAIN)
*/
function PR_lexSource( chunks ) {
// split into strings, comments, and other.
// We do this because strings and comments are easily recognizable and can
// contain stuff that looks like other tokens, so we want to mark those early
// so we don't recurse into them.
var tokens = PR_splitStringAndCommentTokens(chunks);
// split non comment|string tokens on whitespace and word boundaries
var tokensOut = [];
for ( var i = 0; i < tokens.length; ++i ) {
var tok = tokens[i];
if ( PR_PLAIN === tok.style ) {
PR_splitNonStringNonCommentToken(tok.token, tokensOut);
continue;
}
tokensOut.push(tok);
}
return tokensOut;
}
/** returns a list of PR_Token objects given a string of markup.
*
* This code assumes that < tokens are html escaped, but " are not.
* It will do a resonable job with <, but will not recognize an &quot;
* as starting a string.
*
* This code recognizes a number of constructs.
* <!-- ... --> comment
* <!\w ... > declaration
* <\w ... > tag
* </\w ... > tag
* <?...?> embedded source
* &[#\w]...; entity
*
* It does not recognizes %foo; entities.
*
* It will recurse into any <style>, <script>, and on* attributes using
* PR_lexSource.
*/
function PR_lexMarkup( chunks ) {
// This function works as follows:
// 1) Start by splitting the markup into text and tag chunks
// Input: String s
// Output: List<PR_Token> where style in (PR_PLAIN, null)
// 2) Then split the text chunks further into comments, declarations,
// tags, etc.
// After each split, consider whether the token is the start of an
// embedded source section, i.e. is an open <script> tag. If it is,
// find the corresponding close token, and don't bother to lex in between.
// Input: List<String>
// Output: List<PR_Token> with style in (PR_TAG, PR_PLAIN, PR_SOURCE, null)
// 3) Finally go over each tag token and split out attribute names and values.
// Input: List<PR_Token>
// Output: List<PR_Token> where style in
// (PR_TAG, PR_PLAIN, PR_SOURCE, NAME, VALUE, null)
var tokensOut = PR_tokenizeMarkup(chunks);
tokensOut = PR_splitTagAttributes(tokensOut);
tokensOut = PR_splitSourceNodes(tokensOut);
tokensOut = PR_splitSourceAttributes(tokensOut);
return tokensOut;
}
/**
* classify the string as either source or markup and lex appropriately.
* @param {String} html
*/
function PR_lexOne( html ) {
var chunks = PR_expandTabs(PR_chunkify(html), PR_TAB_WIDTH);
// treat it as markup if the first non whitespace character is a < and the
// last non-whitespace character is a >
var isMarkup = false;
for ( var i = 0; i < chunks.length; ++i ) {
if ( PR_PLAIN == chunks[i].style ) {
if ( PR_startsWith(PR_trim(chunks[i].token), '&lt;') ) {
for ( var j = chunks.length; --j >= 0; ) {
if ( PR_PLAIN == chunks[j].style ) {
isMarkup = PR_endsWith(PR_trim(chunks[j].token), '&gt;');
break;
}
}
}
break;
}
}
return isMarkup ? PR_lexMarkup(chunks) : PR_lexSource(chunks);
}
/** pretty print a chunk of code.
*
* @param s code as html
* @return code as html, but prettier
*/
function prettyPrintOne( s ) {
try {
var tokens = PR_lexOne(s);
var out = [];
var lastStyle = null;
for ( var i = 0; i < tokens.length; i++ ) {
var t = tokens[i];
if ( t.style != lastStyle ) {
if ( lastStyle != null ) {
out.push('</span>');
}
if ( t.style != null ) {
out.push('<span class=', t.style, '>');
}
lastStyle = t.style;
}
var html = t.token;
if ( null != t.style ) {
// This interacts badly with some wikis which introduces paragraph tags
// into pre blocks for some strange reason.
// It's necessary for IE though which seems to lose the preformattedness
// of <pre> tags when their innerHTML is assigned.
// http://stud3.tuwien.ac.at/~e0226430/innerHtmlQuirk.html
html = html
.replace(/(\r\n?|\n| ) /g, '$1&nbsp;')
.replace(/\r\n?|\n/g, '<br>');
}
out.push(html);
}
if ( lastStyle != null ) {
out.push('</span>');
}
return out.join('');
}
catch ( e ) {
if ( 'console' in window ) {
console.log(e);
console.trace();
}
return s;
}
}
/** find all the < pre > and < code > tags in the DOM with class=prettyprint and
* prettify them.
*/
function prettyPrint() {
// fetch a list of nodes to rewrite
var codeSegments = [
document.getElementsByTagName('pre'),
document.getElementsByTagName('code'),
document.getElementsByTagName('xmp') ];
var elements = [];
for ( var i = 0; i < codeSegments.length; ++i ) {
for ( var j = 0; j < codeSegments[i].length; ++j ) {
elements.push(codeSegments[i][j]);
}
}
codeSegments = null;
// the loop is broken into a series of continuations to make sure that we
// don't make the browser unresponsive when rewriting a large page.
var k = 0;
function doWork() {
var endTime = new Date().getTime() + 250;
for ( ; k < elements.length && new Date().getTime() < endTime; k++ ) {
var cs = elements[k];
if ( cs.className && cs.className.indexOf('prettyprint') >= 0 ) {
// make sure this is not nested in an already prettified element
var nested = false;
for ( var p = cs.parentNode; p != null; p = p.parentNode ) {
if ( (p.tagName == 'pre' || p.tagName == 'code' ||
p.tagName == 'xmp') &&
p.className && p.className.indexOf('prettyprint') >= 0 ) {
nested = true;
break;
}
}
if ( !nested ) {
// fetch the content as a snippet of properly escaped HTML.
// Firefox adds newlines at the end.
var content = PR_getInnerHtml(cs);
content = content.replace(/(?:\r\n?|\n)$/, '');
// do the pretty printing
var newContent = prettyPrintOne(content);
// push the prettified html back into the tag.
if ( !PR_isRawContent(cs) ) {
// just replace the old html with the new
cs.innerHTML = newContent;
}
else {
// we need to change the tag to a <pre> since <xmp>s do not allow
// embedded tags such as the span tags used to attach styles to
// sections of source code.
var pre = document.createElement('PRE');
for ( var i = 0; i < cs.attributes.length; ++i ) {
var a = cs.attributes[i];
if ( a.specified ) {
pre.setAttribute(a.name, a.value);
}
}
pre.innerHTML = newContent;
// remove the old
cs.parentNode.replaceChild(pre, cs);
}
}
}
}
if ( k < elements.length ) {
// finish up in a continuation
setTimeout(doWork, 250);
}
}
doWork();
}