// Copyright (C) 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * Implements RFC 3986 for parsing/formatting URIs.
 *
 * @author mikesamuel@gmail.com
 * \@provides URI
 * \@overrides window
 */

var URI = (function () {

  /**
   * creates a uri from the string form.  The parser is relaxed, so special
   * characters that aren't escaped but don't cause ambiguities will not cause
   * parse failures.
   *
   * @param uriStr {*} coerced to a string before matching.
   * @return {URI|null} null when the string does not match URI_RE_ (for
   *     example when it contains a line break, which '.' does not match).
   */
  function parse(uriStr) {
    var m = ('' + uriStr).match(URI_RE_);
    if (!m) { return null; }
    return new URI(
        nullIfAbsent(m[1]),
        nullIfAbsent(m[2]),
        nullIfAbsent(m[3]),
        nullIfAbsent(m[4]),
        nullIfAbsent(m[5]),
        nullIfAbsent(m[6]),
        nullIfAbsent(m[7]));
  }

  /**
   * creates a uri from the given parts.
   *
   * @param scheme {string} an unencoded scheme such as "http" or null
   * @param credentials {string} unencoded user credentials or null
   * @param domain {string} an unencoded domain name or null
   * @param port {number} a port number in [1, 65535].
   *    -1 indicates no port, as does null.
   * @param path {string} an unencoded path
   * @param query {Array.<string>|string|null} a list of unencoded cgi
   *   parameters where even values are keys and odds the corresponding values
   *   or an unencoded query.
   * @param fragment {string} an unencoded fragment without the "#" or null.
   * @return {URI}
   */
  function create(scheme, credentials, domain, port, path, query, fragment) {
    var uri = new URI(
        encodeIfExists2(scheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_),
        encodeIfExists2(
            credentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_),
        encodeIfExists(domain),
        port > 0 ? port.toString() : null,
        encodeIfExists2(path, URI_DISALLOWED_IN_PATH_),
        null,
        encodeIfExists(fragment));
    if (query) {
      if ('string' === typeof query) {
        uri.setRawQuery(query.replace(/[^?&=0-9A-Za-z_\-~.%]/g, encodeOne));
      } else {
        uri.setAllParameters(query);
      }
    }
    return uri;
  }

  /**
   * @param unescapedPart {string|null}
   * @return {string|null} the encodeURIComponent form of a string input,
   *     or null for any non-string input.
   */
  function encodeIfExists(unescapedPart) {
    if ('string' == typeof unescapedPart) {
      return encodeURIComponent(unescapedPart);
    }
    return null;
  }

  /**
   * if unescapedPart is non null, then escapes any characters in it that aren't
   * valid characters in a url and also escapes any special characters that
   * appear in extra.
   *
   * @param unescapedPart {string}
   * @param extra {RegExp} a character set of characters in [\01-\177].
   * @return {string|null} null iff unescapedPart == null.
   */
  function encodeIfExists2(unescapedPart, extra) {
    if ('string' == typeof unescapedPart) {
      return encodeURI(unescapedPart).replace(extra, encodeOne);
    }
    return null;
  }

  /** converts a character in [\01-\177] to its url encoded equivalent. */
  function encodeOne(ch) {
    var n = ch.charCodeAt(0);
    return '%' + '0123456789ABCDEF'.charAt((n >> 4) & 0xf) +
        '0123456789ABCDEF'.charAt(n & 0xf);
  }

  /**
   * Drops "." path elements and collapses runs of slashes.
   * {@updoc
   *  $ normPath('foo/./bar')
   *  # 'foo/bar'
   *  $ normPath('./foo')
   *  # 'foo'
   *  $ normPath('foo/.')
   *  # 'foo'
   *  $ normPath('foo//bar')
   *  # 'foo/bar'
   * }
   */
  function normPath(path) {
    return path.replace(/(^|\/)\.(?:\/|$)/g, '$1').replace(/\/{2,}/g, '/');
  }

  var PARENT_DIRECTORY_HANDLER = new RegExp(
      ''
      // A path break
      + '(/|^)'
      // followed by a non .. path element
      // (cannot be . because normPath is used prior to this RegExp)
      + '(?:[^./][^/]*|\\.{2,}(?:[^./][^/]*)|\\.{3,}[^/]*)'
      // followed by .. followed by a path break.
      + '/\\.\\.(?:/|$)');

  var PARENT_DIRECTORY_HANDLER_RE = new RegExp(PARENT_DIRECTORY_HANDLER);

  // Matches the run of leading "../" elements (and a lone trailing "..")
  // that collapse_dots leaves in place.
  var EXTRA_PARENT_PATHS_RE = /^(?:\.\.\/)*(?:\.\.$)?/;

  /**
   * Normalizes its input path and collapses all . and .. sequences except for
   * .. sequences that would take it above the root of the current parent
   * directory.
   * {@updoc
   *  $ collapse_dots('foo/../bar')
   *  # 'bar'
   *  $ collapse_dots('foo/./bar')
   *  # 'foo/bar'
   *  $ collapse_dots('foo/../bar/./../../baz')
   *  # '../baz'
   *  $ collapse_dots('../foo')
   *  # '../foo'
   *  $ collapse_dots('../foo').replace(EXTRA_PARENT_PATHS_RE, '')
   *  # 'foo'
   * }
   *
   * @param path {string|null}
   * @return {string|null} null iff path is null.
   */
  function collapse_dots(path) {
    if (path === null) { return null; }
    var p = normPath(path);
    // Only /../ left to flatten
    var r = PARENT_DIRECTORY_HANDLER_RE;
    // We replace with $1 which matches a / before the .. because this
    // guarantees that:
    // (1) we have at most 1 / between the adjacent place,
    // (2) always have a slash if there is a preceding path section, and
    // (3) we never turn a relative path into an absolute path.
    for (var q; (q = p.replace(r, '$1')) != p; p = q) {}
    return p;
  }

  /**
   * resolves a relative url string to a base uri.
   *
   * @param baseUri {URI} left unmodified; a clone of it is returned.
   * @param relativeUri {URI}
   * @return {URI}
   */
  function resolve(baseUri, relativeUri) {
    // there are several kinds of relative urls:
    // 1. //foo - replaces everything from the domain on.  foo is a domain name
    // 2. foo - replaces the last part of the path, the whole query and fragment
    // 3. /foo - replaces the the path, the query and fragment
    // 4. ?foo - replace the query and fragment
    // 5. #foo - replace the fragment only

    var absoluteUri = baseUri.clone();
    // we satisfy these conditions by looking for the first part of relativeUri
    // that is not blank and applying defaults to the rest

    var overridden = relativeUri.hasScheme();

    if (overridden) {
      absoluteUri.setRawScheme(relativeUri.getRawScheme());
    } else {
      overridden = relativeUri.hasCredentials();
    }

    if (overridden) {
      absoluteUri.setRawCredentials(relativeUri.getRawCredentials());
    } else {
      overridden = relativeUri.hasDomain();
    }

    if (overridden) {
      absoluteUri.setRawDomain(relativeUri.getRawDomain());
    } else {
      overridden = relativeUri.hasPort();
    }

    var rawPath = relativeUri.getRawPath();
    var simplifiedPath = collapse_dots(rawPath);
    if (overridden) {
      absoluteUri.setPort(relativeUri.getPort());
      simplifiedPath = simplifiedPath
          && simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
    } else {
      overridden = !!rawPath;
      if (overridden) {
        // resolve path properly
        if (simplifiedPath.charCodeAt(0) !== 0x2f /* / */) {  // path is relative
          var absRawPath = collapse_dots(absoluteUri.getRawPath() || '')
              .replace(EXTRA_PARENT_PATHS_RE, '');
          var slash = absRawPath.lastIndexOf('/') + 1;
          simplifiedPath = collapse_dots(
              (slash ? absRawPath.substring(0, slash) : '')
              + collapse_dots(rawPath))
              .replace(EXTRA_PARENT_PATHS_RE, '');
        }
      } else {
        simplifiedPath = simplifiedPath
            && simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
        if (simplifiedPath !== rawPath) {
          absoluteUri.setRawPath(simplifiedPath);
        }
      }
    }

    if (overridden) {
      absoluteUri.setRawPath(simplifiedPath);
    } else {
      overridden = relativeUri.hasQuery();
    }

    if (overridden) {
      absoluteUri.setRawQuery(relativeUri.getRawQuery());
    } else {
      overridden = relativeUri.hasFragment();
    }

    if (overridden) {
      absoluteUri.setRawFragment(relativeUri.getRawFragment());
    }

    return absoluteUri;
  }

  /**
   * Decodes one piece of a query string.  '+' is turned into a space BEFORE
   * percent-decoding so that an escaped plus ("%2B") survives as a literal
   * '+' instead of being turned into a space.
   *
   * @param rawPart {string} a raw (still escaped) piece of a query.
   * @return {string}
   */
  function decodeQueryPart(rawPart) {
    return decodeURIComponent(rawPart.replace(/\+/g, ' '));
  }

  /**
   * a mutable URI.
   *
   * This class contains setters and getters for the parts of the URI.
   * The <tt>getXYZ</tt>/<tt>setXYZ</tt> methods return the decoded part -- so
   * <code>uri.parse('/foo%20bar').getPath()</code> will return the decoded path,
   * <tt>/foo bar</tt>.
   *
   * <p>The raw versions of fields are available too.
   * <code>uri.parse('/foo%20bar').getRawPath()</code> will return the raw path,
   * <tt>/foo%20bar</tt>.  Use the raw setters with care, since
   * <code>URI::toString</code> is not guaranteed to return a valid url if a
   * raw setter was used.
   *
   * <p>All setters return <tt>this</tt> and so may be chained, a la
   * <code>uri.parse('/foo').setFragment('part').toString()</code>.
   *
   * <p>You should not use this constructor directly -- please prefer the factory
   * functions {@link uri.parse}, {@link uri.create}, {@link uri.resolve}
   * instead.</p>
   *
   * <p>The parameters are all raw (assumed to be properly escaped) parts, and
   * any (but not all) may be null.  Undefined is not allowed.</p>
   *
   * @constructor
   */
  function URI(
      rawScheme,
      rawCredentials, rawDomain, port,
      rawPath, rawQuery, rawFragment) {
    this.scheme_ = rawScheme;
    this.credentials_ = rawCredentials;
    this.domain_ = rawDomain;
    this.port_ = port;
    this.path_ = rawPath;
    this.query_ = rawQuery;
    this.fragment_ = rawFragment;
    /**
     * @type {Array|null}
     */
    this.paramCache_ = null;
  }

  /** returns the string form of the url. */
  URI.prototype.toString = function () {
    var out = [];
    if (null !== this.scheme_) { out.push(this.scheme_, ':'); }
    if (null !== this.domain_) {
      out.push('//');
      if (null !== this.credentials_) { out.push(this.credentials_, '@'); }
      out.push(this.domain_);
      if (null !== this.port_) { out.push(':', this.port_.toString()); }
    }
    if (null !== this.path_) { out.push(this.path_); }
    if (null !== this.query_) { out.push('?', this.query_); }
    if (null !== this.fragment_) { out.push('#', this.fragment_); }
    return out.join('');
  };

  /** @return {URI} a copy that shares no mutable state with this one. */
  URI.prototype.clone = function () {
    return new URI(this.scheme_, this.credentials_, this.domain_, this.port_,
                   this.path_, this.query_, this.fragment_);
  };

  URI.prototype.getScheme = function () {
    // HTML5 spec does not require the scheme to be lowercased but
    // all common browsers except Safari lowercase the scheme.
    return this.scheme_ && decodeURIComponent(this.scheme_).toLowerCase();
  };
  URI.prototype.getRawScheme = function () {
    return this.scheme_;
  };
  URI.prototype.setScheme = function (newScheme) {
    this.scheme_ = encodeIfExists2(
        newScheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
    return this;
  };
  URI.prototype.setRawScheme = function (newScheme) {
    this.scheme_ = newScheme ? newScheme : null;
    return this;
  };
  URI.prototype.hasScheme = function () {
    return null !== this.scheme_;
  };


  URI.prototype.getCredentials = function () {
    return this.credentials_ && decodeURIComponent(this.credentials_);
  };
  URI.prototype.getRawCredentials = function () {
    return this.credentials_;
  };
  URI.prototype.setCredentials = function (newCredentials) {
    this.credentials_ = encodeIfExists2(
        newCredentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
    return this;
  };
  URI.prototype.setRawCredentials = function (newCredentials) {
    this.credentials_ = newCredentials ? newCredentials : null;
    return this;
  };
  URI.prototype.hasCredentials = function () {
    return null !== this.credentials_;
  };


  URI.prototype.getDomain = function () {
    return this.domain_ && decodeURIComponent(this.domain_);
  };
  URI.prototype.getRawDomain = function () {
    return this.domain_;
  };
  URI.prototype.setDomain = function (newDomain) {
    return this.setRawDomain(newDomain && encodeURIComponent(newDomain));
  };
  URI.prototype.setRawDomain = function (newDomain) {
    this.domain_ = newDomain ? newDomain : null;
    // Maintain the invariant that paths must start with a slash when the URI
    // is not path-relative.
    return this.setRawPath(this.path_);
  };
  URI.prototype.hasDomain = function () {
    return null !== this.domain_;
  };


  URI.prototype.getPort = function () {
    return this.port_ && decodeURIComponent(this.port_);
  };
  /**
   * @param newPort {number|string|null} falsy values clear the port.
   * @throws {Error} if the port is not an integer in [0, 65535].
   */
  URI.prototype.setPort = function (newPort) {
    if (newPort) {
      newPort = Number(newPort);
      if (newPort !== (newPort & 0xffff)) {
        throw new Error('Bad port number ' + newPort);
      }
      this.port_ = '' + newPort;
    } else {
      this.port_ = null;
    }
    return this;
  };
  URI.prototype.hasPort = function () {
    return null !== this.port_;
  };


  URI.prototype.getPath = function () {
    return this.path_ && decodeURIComponent(this.path_);
  };
  URI.prototype.getRawPath = function () {
    return this.path_;
  };
  URI.prototype.setPath = function (newPath) {
    return this.setRawPath(encodeIfExists2(newPath, URI_DISALLOWED_IN_PATH_));
  };
  URI.prototype.setRawPath = function (newPath) {
    if (newPath) {
      newPath = String(newPath);
      this.path_ =
        // Paths must start with '/' unless this is a path-relative URL.
        (!this.domain_ || /^\//.test(newPath)) ? newPath : '/' + newPath;
    } else {
      this.path_ = null;
    }
    return this;
  };
  URI.prototype.hasPath = function () {
    return null !== this.path_;
  };


  URI.prototype.getQuery = function () {
    // From http://www.w3.org/Addressing/URL/4_URI_Recommentations.html
    // Within the query string, the plus sign is reserved as shorthand notation
    // for a space.
    return this.query_ && decodeQueryPart(this.query_);
  };
  URI.prototype.getRawQuery = function () {
    return this.query_;
  };
  URI.prototype.setQuery = function (newQuery) {
    this.paramCache_ = null;
    this.query_ = encodeIfExists(newQuery);
    return this;
  };
  URI.prototype.setRawQuery = function (newQuery) {
    this.paramCache_ = null;
    this.query_ = newQuery ? newQuery : null;
    return this;
  };
  URI.prototype.hasQuery = function () {
    return null !== this.query_;
  };

  /**
   * sets the query given a list of strings of the form
   * [ key0, value0, key1, value1, ... ].
   *
   * <p><code>uri.setAllParameters(['a', 'b', 'c', 'd']).getQuery()</code>
   * will yield <code>'a=b&c=d'</code>.
   *
   * <p>A non-array object is treated as a map; only its string-valued
   * properties are used.  A falsy value emits the key without an '='.
   */
  URI.prototype.setAllParameters = function (params) {
    if (typeof params === 'object') {
      if (!(params instanceof Array)
          && (params instanceof Object
              || Object.prototype.toString.call(params) !== '[object Array]')) {
        var newParams = [];
        var i = -1;
        for (var name in params) {
          var mapValue = params[name];
          if ('string' === typeof mapValue) {
            newParams[++i] = name;
            newParams[++i] = mapValue;
          }
        }
        params = newParams;
      }
    }
    this.paramCache_ = null;
    var queryBuf = [];
    var separator = '';
    for (var j = 0; j < params.length;) {
      var k = params[j++];
      var v = params[j++];
      queryBuf.push(separator, encodeURIComponent(k.toString()));
      separator = '&';
      if (v) {
        queryBuf.push('=', encodeURIComponent(v.toString()));
      }
    }
    this.query_ = queryBuf.join('');
    return this;
  };

  /**
   * Lazily fills paramCache_ with the decoded [key0, value0, key1, ...] list
   * parsed from the raw query.
   */
  URI.prototype.checkParameterCache_ = function () {
    if (!this.paramCache_) {
      var q = this.query_;
      if (!q) {
        this.paramCache_ = [];
      } else {
        var cgiParams = q.split(/[&\?]/);
        var out = [];
        var k = -1;
        for (var i = 0; i < cgiParams.length; ++i) {
          var m = cgiParams[i].match(/^([^=]*)(?:=(.*))?$/);
          // From http://www.w3.org/Addressing/URL/4_URI_Recommentations.html
          // Within the query string, the plus sign is reserved as shorthand
          // notation for a space.
          out[++k] = decodeQueryPart(m[1]);
          out[++k] = decodeQueryPart(m[2] || '');
        }
        this.paramCache_ = out;
      }
    }
  };

  /**
   * sets the values of the named cgi parameters.
   *
   * <p>So, <code>uri.parse('foo?a=b&c=d&e=f').setParameterValues('c', ['new'])
   * </code> yields <tt>foo?a=b&c=new&e=f</tt>.</p>
   *
   * @param key {string}
   * @param values {Array.<string>} the new values.  If values is a single string
   *   then it will be treated as the sole value.
   */
  URI.prototype.setParameterValues = function (key, values) {
    // be nice and avoid subtle bugs where [] operator on string performs charAt
    // on some browsers and crashes on IE
    if (typeof values === 'string') {
      values = [ values ];
    }

    this.checkParameterCache_();
    var newValueIndex = 0;
    var pc = this.paramCache_;
    var params = [];
    for (var i = 0; i < pc.length; i += 2) {
      if (key === pc[i]) {
        if (newValueIndex < values.length) {
          params.push(key, values[newValueIndex++]);
        }
      } else {
        params.push(pc[i], pc[i + 1]);
      }
    }
    while (newValueIndex < values.length) {
      params.push(key, values[newValueIndex++]);
    }
    this.setAllParameters(params);
    return this;
  };

  URI.prototype.removeParameter = function (key) {
    return this.setParameterValues(key, []);
  };

  /**
   * returns the parameters specified in the query part of the uri as a list of
   * keys and values like [ key0, value0, key1, value1, ... ].
   *
   * @return {Array.<string>}
   */
  URI.prototype.getAllParameters = function () {
    this.checkParameterCache_();
    return this.paramCache_.slice(0, this.paramCache_.length);
  };

  /**
   * returns the value<b>s</b> for a given cgi parameter as a list of decoded
   * query parameter values.
   * @return {Array.<string>}
   */
  URI.prototype.getParameterValues = function (paramNameUnescaped) {
    this.checkParameterCache_();
    var values = [];
    for (var i = 0; i < this.paramCache_.length; i += 2) {
      if (paramNameUnescaped === this.paramCache_[i]) {
        values.push(this.paramCache_[i + 1]);
      }
    }
    return values;
  };

  /**
   * returns a map of cgi parameter names to (non-empty) lists of values.
   *
   * @param paramNameUnescaped unused; kept so the signature is unchanged.
   * @return {Object.<string,Array.<string>>}
   */
  URI.prototype.getParameterMap = function (paramNameUnescaped) {
    this.checkParameterCache_();
    var pc = this.paramCache_;
    var paramMap = {};
    // The cache is a flat [key, value, key, value, ...] list, so step by two
    // and index relative to i rather than advancing i inside the body too.
    for (var i = 0; i < pc.length; i += 2) {
      var key = pc[i];
      var value = pc[i + 1];
      // Own-property check: "key in paramMap" is also true for names that
      // are inherited from Object.prototype, such as "toString".
      if (Object.prototype.hasOwnProperty.call(paramMap, key)) {
        paramMap[key].push(value);
      } else {
        paramMap[key] = [value];
      }
    }
    return paramMap;
  };

  /**
   * returns the first value for a given cgi parameter or null if the given
   * parameter name does not appear in the query string.
   * If the given parameter name does appear, but has no '<tt>=</tt>' following
   * it, then the empty string will be returned.
   * @return {string|null}
   */
  URI.prototype.getParameterValue = function (paramNameUnescaped) {
    this.checkParameterCache_();
    for (var i = 0; i < this.paramCache_.length; i += 2) {
      if (paramNameUnescaped === this.paramCache_[i]) {
        return this.paramCache_[i + 1];
      }
    }
    return null;
  };

  URI.prototype.getFragment = function () {
    return this.fragment_ && decodeURIComponent(this.fragment_);
  };
  URI.prototype.getRawFragment = function () {
    return this.fragment_;
  };
  URI.prototype.setFragment = function (newFragment) {
    this.fragment_ = newFragment ? encodeURIComponent(newFragment) : null;
    return this;
  };
  URI.prototype.setRawFragment = function (newFragment) {
    this.fragment_ = newFragment ? newFragment : null;
    return this;
  };
  URI.prototype.hasFragment = function () {
    return null !== this.fragment_;
  };

  /** maps unmatched or empty regex groups to null. */
  function nullIfAbsent(matchPart) {
    return ('string' == typeof matchPart) && (matchPart.length > 0)
           ? matchPart
           : null;
  }




  /**
   * a regular expression for breaking a URI into its component parts.
   *
   * <p>http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#RFC2234 says
   * As the "first-match-wins" algorithm is identical to the "greedy"
   * disambiguation method used by POSIX regular expressions, it is natural and
   * commonplace to use a regular expression for parsing the potential five
   * components of a URI reference.
   *
   * <p>The following line is the regular expression for breaking-down a
   * well-formed URI reference into its components.
   *
   * <pre>
   * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   *  12            3  4          5       6  7        8 9
   * </pre>
   *
   * <p>The numbers in the second line above are only to assist readability; they
   * indicate the reference points for each subexpression (i.e., each paired
   * parenthesis). We refer to the value matched for subexpression <n> as $<n>.
   * For example, matching the above expression to
   * <pre>
   *     http://www.ics.uci.edu/pub/ietf/uri/#Related
   * </pre>
   * results in the following subexpression matches:
   * <pre>
   *    $1 = http:
   *    $2 = http
   *    $3 = //www.ics.uci.edu
   *    $4 = www.ics.uci.edu
   *    $5 = /pub/ietf/uri/
   *    $6 = <undefined>
   *    $7 = <undefined>
   *    $8 = #Related
   *    $9 = Related
   * </pre>
   * where <undefined> indicates that the component is not present, as is the
   * case for the query component in the above example. Therefore, we can
   * determine the value of the five components as
   * <pre>
   *    scheme    = $2
   *    authority = $4
   *    path      = $5
   *    query     = $7
   *    fragment  = $9
   * </pre>
   *
   * <p>msamuel: I have modified the regular expression slightly to expose the
   * credentials, domain, and port separately from the authority.
   * The modified version yields
   * <pre>
   *    $1 = http              scheme
   *    $2 = <undefined>       credentials -\
   *    $3 = www.ics.uci.edu   domain       | authority
   *    $4 = <undefined>       port        -/
   *    $5 = /pub/ietf/uri/    path
   *    $6 = <undefined>       query without ?
   *    $7 = Related           fragment without #
   * </pre>
   */
  var URI_RE_ = new RegExp(
        "^" +
        "(?:" +
          "([^:/?#]+)" +         // scheme
        ":)?" +
        "(?://" +
          "(?:([^/?#]*)@)?" +    // credentials
          "([^/?#:@]*)" +        // domain
          "(?::([0-9]+))?" +     // port
        ")?" +
        "([^?#]+)?" +            // path
        "(?:\\?([^#]*))?" +      // query
        "(?:#(.*))?" +           // fragment
        "$"
        );

  var URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_ = /[#\/\?@]/g;
  var URI_DISALLOWED_IN_PATH_ = /[\#\?]/g;

  URI.parse = parse;
  URI.create = create;
  URI.resolve = resolve;
  URI.collapse_dots = collapse_dots;  // Visible for testing.

  // lightweight string-based api for loadModuleMaker
  URI.utils = {
    /**
     * @param uri {string}
     * @return {string} 'text/html' when the decoded path ends in ".html",
     *     otherwise (including for unparseable input)
     *     'application/javascript'.
     */
    mimeTypeOf: function (uri) {
      var uriObj = parse(uri);
      // parse() yields null for strings URI_RE_ cannot match; treat those
      // like any other non-.html input instead of dereferencing null.
      if (uriObj && /\.html$/.test(uriObj.getPath())) {
        return 'text/html';
      } else {
        return 'application/javascript';
      }
    },
    /**
     * @param base {string} when falsy, uri is returned as a string unchanged.
     * @param uri {string}
     * @return {string}
     */
    resolve: function (base, uri) {
      if (base) {
        return resolve(parse(base), parse(uri)).toString();
      } else {
        return '' + uri;
      }
    }
  };


  return URI;
})();

// Exports for closure compiler.
if (typeof window !== 'undefined') {
  window['URI'] = URI;
}
;
// Copyright Google Inc.
// Licensed under the Apache Licence Version 2.0
// Autogenerated at Mon Oct 21 13:30:08 EDT 2013
// @overrides window
// @provides html4
var html4 = {};
// Attribute value types; the numbers are the values used in html4.ATTRIBS.
html4.atype = {
  'NONE': 0,
  'URI': 1,
  'URI_FRAGMENT': 11,
  'SCRIPT': 2,
  'STYLE': 3,
  'HTML': 12,
  'ID': 4,
  'IDREF': 5,
  'IDREFS': 6,
  'GLOBAL_NAME': 7,
  'LOCAL_NAME': 8,
  'CLASSES': 9,
  'FRAME_TARGET': 10,
  'MEDIA_QUERY': 13
};
html4[ 'atype' ] = html4.atype;
// Maps 'element::attribute' ('*' for any element) to an html4.atype value.
html4.ATTRIBS = {
  '*::class': 9,
  '*::dir': 0,
  '*::draggable': 0,
  '*::hidden': 0,
  '*::id': 4,
  '*::inert': 0,
  '*::itemprop': 0,
  '*::itemref': 6,
  '*::itemscope': 0,
  '*::lang': 0,
  '*::onblur': 2,
  '*::onchange': 2,
  '*::onclick': 2,
  '*::ondblclick': 2,
  '*::onerror': 2,
  '*::onfocus': 2,
  '*::onkeydown': 2,
  '*::onkeypress': 2,
  '*::onkeyup': 2,
  '*::onload': 2,
  '*::onmousedown': 2,
  '*::onmousemove': 2,
  '*::onmouseout': 2,
  '*::onmouseover': 2,
  '*::onmouseup': 2,
  '*::onreset': 2,
  '*::onscroll': 2,
  '*::onselect': 2,
  '*::onsubmit': 2,
  '*::onunload': 2,
  '*::spellcheck': 0,
  '*::style': 3,
  '*::title': 0,
  '*::translate': 0,
  'a::accesskey': 0,
  'a::coords': 0,
  'a::href': 1,
  'a::hreflang': 0,
  'a::name': 7,
  'a::onblur': 2,
  'a::onfocus': 2,
  'a::shape': 0,
  'a::tabindex': 0,
  'a::target': 10,
  'a::type': 0,
  'bdo::dir': 0,
  'blockquote::cite': 1,
  'br::clear': 0,
  'caption::align': 0,
  'col::align': 0,
  'col::char': 0,
  'col::charoff': 0,
  'col::span': 0,
  'col::valign': 0,
  'col::width': 0,
  'colgroup::align': 0,
  'colgroup::char': 0,
  'colgroup::charoff': 0,
  'colgroup::span': 0,
  'colgroup::valign': 0,
  'colgroup::width': 0,
  'data::value': 0,
  'del::cite': 1,
  'del::datetime': 0,
  'details::open': 0,
  'dir::compact': 0,
  'div::align': 0,
  'dl::compact': 0,
  'h1::align': 0,
  'h2::align': 0,
  'h3::align': 0,
  'h4::align': 0,
  'h5::align': 0,
  'h6::align': 0,
  'hr::align': 0,
  'hr::noshade': 0,
  'hr::size': 0,
  'hr::width': 0,
  'iframe::align': 0,
  'iframe::frameborder': 0,
  'iframe::height': 0,
  'iframe::marginheight': 0,
  'iframe::marginwidth': 0,
  'iframe::width': 0,
  'iframe::src': 1,
  'img::alt': 0,
  'img::height': 0,
  'img::name': 7,
  'img::src': 1,
  'img::width': 0,
  'ins::cite': 1,
  'ins::datetime': 0,
  'label::accesskey': 0,
  'label::for': 5,
  'label::onblur': 2,
  'label::onfocus': 2,
  'legend::accesskey': 0,
  'legend::align': 0,
  'li::type': 0,
  'li::value': 0,
  'meter::high': 0,
  'meter::low': 0,
  'meter::max': 0,
  'meter::min': 0,
  'meter::value': 0,
  'ol::compact': 0,
  'ol::reversed': 0,
  'ol::start': 0,
  'ol::type': 0,
  'p::align': 0,
  'pre::width': 0,
  'q::cite': 1,
  'source::type': 0,
  'track::default': 0,
  'track::kind': 0,
  'track::label': 0,
  'track::srclang': 0,
  'ul::compact': 0,
  'ul::type': 0,
};
html4[ 'ATTRIBS' ] = html4.ATTRIBS;
// Element flag bits; html4.ELEMENTS values are sums of these.
html4.eflags = {
  'OPTIONAL_ENDTAG': 1,
  'EMPTY': 2,
  'CDATA': 4,
  'RCDATA': 8,
  'UNSAFE': 16,
  'FOLDABLE': 32,
  'SCRIPT': 64,
  'STYLE': 128,
  'VIRTUALIZED': 256
};
html4[ 'eflags' ] = html4.eflags;
html4.ELEMENTS = {
  'a': 0,
  'abbr': 0,
  'acronym': 0,
  'address': 0,
  'article': 0,
  'aside': 0,
  'b': 0,
  'base': 274,
  'bdi': 0,
  'bdo': 0,
  'big': 0,
  'blockquote': 0,
  'body': 305,
  'br': 2,
  'caption': 0,
  'cite': 0,
  'code': 0,
  'col': 2,
  'colgroup': 1,
  'data': 0,
  'dd': 1,
  'del': 0,
  'details': 0,
  'dfn': 0,
  'dialog': 272,
  'dir': 0,
  'div': 0,
  'dl': 0,
  'dt': 1,
  'em': 0,
  'figcaption': 0,
  'figure': 0,
  'frame': 274,
  'frameset': 272,
  'h1': 0,
  'h2': 0,
  'h3': 0,
  'h4': 0,
  'h5': 0,
  'h6': 0,
  'head': 305,
  'header': 0,
  'hgroup': 0,
  'hr': 2,
  'html': 305,
  'i': 0,
  'iframe': 4,
  'img': 2,
  'ins': 0,
  'isindex': 274,
  'kbd': 0,
  'keygen': 274,
  'label': 0,
  'legend': 0,
  'li': 1,
  'link': 274,
  'meter': 0,
  'nav': 0,
  'nobr': 0,
  'noembed': 276,
  'noframes': 276,
  'noscript': 276,
  'object': 272,
  'ol': 0,
  'p': 1,
  'param': 274,
  'pre': 0,
  'q': 0,
  's': 0,
  'samp': 0,
  'script': 84,
  'section': 0,
  'small': 0,
  'span': 0,
  'strike': 0,
  'strong': 0,
  'style': 148,
  'sub': 0,
  'summary': 0,
  'sup': 0,
  'table': 272,
  'tbody': 273,
  'td': 273,
  'tfoot': 1,
  'th': 273,
  'thead': 273,
  'time': 0,
  'title': 280,
  'tr': 273,
  'track': 2,
  'tt': 0,
  'u': 0,
  'ul': 0,
  'var': 0,
  'wbr': 2
};
html4[ 'ELEMENTS' ] = html4.ELEMENTS;
html4.ELEMENT_DOM_INTERFACES = {
  'a': 'HTMLAnchorElement',
  'abbr': 'HTMLElement',
  'acronym': 'HTMLElement',
  'address': 'HTMLElement',
  'applet': 'HTMLAppletElement',
  'area': 'HTMLAreaElement',
  'article': 'HTMLElement',
  'aside': 'HTMLElement',
  'audio': 'HTMLAudioElement',
  'b': 'HTMLElement',
  'base': 'HTMLBaseElement',
  'basefont': 'HTMLBaseFontElement',
  'bdi': 'HTMLElement',
  'bdo': 'HTMLElement',
  'big': 'HTMLElement',
  'blockquote': 'HTMLQuoteElement',
  'body': 'HTMLBodyElement',
  'br': 'HTMLBRElement',
  'caption': 'HTMLTableCaptionElement',
  'cite': 'HTMLElement',
  'code': 'HTMLElement',
  'col': 'HTMLTableColElement',
  'colgroup': 'HTMLTableColElement',
  'command': 'HTMLCommandElement',
  'data': 'HTMLElement',
  'datalist': 'HTMLDataListElement',
  'dd': 'HTMLElement',
  'del': 'HTMLModElement',
  'details': 'HTMLDetailsElement',
  'dfn': 'HTMLElement',
  'dialog': 'HTMLDialogElement',
  'dir': 'HTMLDirectoryElement',
  'div': 'HTMLDivElement',
  'dl': 'HTMLDListElement',
  'dt': 'HTMLElement',
  'em': 'HTMLElement',
  'fieldset': 'HTMLFieldSetElement',
  'figcaption': 'HTMLElement',
  'figure': 'HTMLElement',
  'footer': 'HTMLElement',
  'form': 'HTMLFormElement',
  'frame': 'HTMLFrameElement',
  'frameset': 'HTMLFrameSetElement',
  'h1': 'HTMLHeadingElement',
  'h2': 'HTMLHeadingElement',
  'h3': 'HTMLHeadingElement',
  'h4': 'HTMLHeadingElement',
  'h5': 'HTMLHeadingElement',
  'h6': 'HTMLHeadingElement',
  'head': 'HTMLHeadElement',
  'header': 'HTMLElement',
  'hgroup': 'HTMLElement',
  'hr': 'HTMLHRElement',
  'html': 'HTMLHtmlElement',
  'i': 'HTMLElement',
  'iframe': 'HTMLIFrameElement',
  'img': 'HTMLImageElement',
  'input': 'HTMLInputElement',
  'ins': 'HTMLModElement',
  'isindex': 'HTMLUnknownElement',
  'kbd': 'HTMLElement',
  'keygen': 'HTMLKeygenElement',
  'label': 'HTMLLabelElement',
  'legend': 'HTMLLegendElement',
  'li': 'HTMLLIElement',
  'link': 'HTMLLinkElement',
  'map': 'HTMLMapElement',
  'menu': 'HTMLMenuElement',
  'meta': 'HTMLMetaElement',
  'meter': 'HTMLMeterElement',
  'nav': 'HTMLElement',
  'nobr': 'HTMLElement',
  'noembed': 'HTMLElement',
  'noframes': 'HTMLElement',
  'noscript': 'HTMLElement',
  'object': 'HTMLObjectElement',
  'ol': 'HTMLOListElement',
  'optgroup': 'HTMLOptGroupElement',
  'option': 'HTMLOptionElement',
  'output': 'HTMLOutputElement',
  'p': 'HTMLParagraphElement',
  'param': 'HTMLParamElement',
  'pre': 'HTMLPreElement',
  'q': 'HTMLQuoteElement',
  's': 'HTMLElement',
  'samp': 'HTMLElement',
  'script': 'HTMLScriptElement',
  'section': 'HTMLElement',
  'select': 'HTMLSelectElement',
  'small': 'HTMLElement',
  'source': 'HTMLSourceElement',
  'span': 'HTMLSpanElement',
  'strike': 'HTMLElement',
  'strong': 'HTMLElement',
  'style': 'HTMLStyleElement',
  'sub': 'HTMLElement',
  'summary': 'HTMLElement',
  'sup': 'HTMLElement',
  'table': 'HTMLTableElement',
  'tbody': 'HTMLTableSectionElement',
  'td': 'HTMLTableDataCellElement',
  'tfoot': 'HTMLTableSectionElement',
  'th': 'HTMLTableHeaderCellElement',
  'thead': 'HTMLTableSectionElement',
  'time': 'HTMLTimeElement',
  'title': 'HTMLTitleElement',
  'tr': 'HTMLTableRowElement',
  'track': 'HTMLTrackElement',
  'tt': 'HTMLElement',
  'u': 'HTMLElement',
  'ul': 'HTMLUListElement',
  'var': 'HTMLElement',
  'video': 'HTMLVideoElement',
  'wbr': 'HTMLElement'
};
html4[ 'ELEMENT_DOM_INTERFACES' ] = html4.ELEMENT_DOM_INTERFACES;
html4.ueffects = {
  'NOT_LOADED': 0,
  'SAME_DOCUMENT': 1,
  'NEW_DOCUMENT': 2
};
html4[ 'ueffects' ] = html4.ueffects;
html4.URIEFFECTS = {
  'a::href': 2,
  'area::href': 2,
  'audio::src': 1,
  'blockquote::cite': 0,
  'command::icon': 1,
  'del::cite': 0,
  'form::action': 2,
  'iframe::src': 1,
  'img::src': 1,
  'input::src': 1,
  'ins::cite': 0,
  'q::cite': 0,
  'video::poster': 1,
  'video::src': 1
};
html4[ 'URIEFFECTS' ] = html4.URIEFFECTS;
html4.ltypes = {
  'UNSANDBOXED': 2,
  'SANDBOXED': 1,
  'DATA': 0
};
html4[ 'ltypes' ] = html4.ltypes;
html4.LOADERTYPES = {
  'a::href': 2,
  'area::href': 2,
  'audio::src': 2,
  'blockquote::cite': 2,
  'command::icon': 1,
  'del::cite': 2,
  'form::action': 2,
  'iframe::src': 2,
  'img::src': 1,
  'input::src': 1,
  'ins::cite': 2,
  'q::cite': 2,
  'video::poster': 1,
  'video::src': 2
};
html4[ 'LOADERTYPES' ] = html4.LOADERTYPES;

// NOTE: currently focused only on URI-type attributes
// NOTE(review): "image" is presumably meant to be "img" -- confirm against
// html4.ELEMENTS before changing, since it alters which tags get dropped.
html4.REQUIREDATTRIBUTES = {
  "audio" : ["src"],
  "form" : ["action"],
  "iframe" : ["src"],
  "image" : ["src"],
  "video" : ["src"]
};
html4[ 'REQUIREDATTRIBUTES' ] = html4.REQUIREDATTRIBUTES;

// export for Closure Compiler
if (typeof window !== 'undefined') {
  window['html4'] = html4;
}
;
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * An HTML sanitizer that can satisfy a variety of security policies.
 *
 * <p>
 * The HTML sanitizer is built around a SAX parser and HTML element and
 * attributes schemas.
 *
 * If the cssparser is loaded, inline styles are sanitized using the
 * css property and value schemas.  Else they are removed during
 * sanitization.
 *
 * If it exists, uses parseCssDeclarations, sanitizeCssProperty, cssSchema
 *
 * @author mikesamuel@gmail.com
 * @author jasvir@gmail.com
 * \@requires html4, URI
 * \@overrides window
 * \@provides html, html_sanitize
 */

// The Turkish i seems to be a non-issue, but abort in case it is.
if ('I'.toLowerCase() !== 'i') { throw new Error('I/i problem'); }

/**
 * \@namespace
 */
var html = (function(html4) {

  // For closure compiler
  var parseCssDeclarations, sanitizeCssProperty, cssSchema;
  if ('undefined' !== typeof window) {
    parseCssDeclarations = window['parseCssDeclarations'];
    sanitizeCssProperty = window['sanitizeCssProperty'];
    cssSchema = window['cssSchema'];
  }

  // The keys of this object must be 'quoted' or JSCompiler will mangle them!
  // This is a partial list -- lookupEntity() uses the host browser's parser
  // (when available) to implement full entity lookup.
  // Note that entities are in general case-sensitive; the uppercase ones are
  // explicitly defined by HTML5 (presumably as compatibility).
  var ENTITIES = {
    'lt': '<',
    'LT': '<',
    'gt': '>',
    'GT': '>',
    'amp': '&',
    'AMP': '&',
    'quot': '"',
    'apos': '\'',
    // U+00A0, written as a unicode escape because octal escapes ('\240')
    // are a SyntaxError in strict-mode / module code.
    'nbsp': '\u00A0'
  };

  // Patterns for types of entity/character reference names.
  var decimalEscapeRe = /^#(\d+)$/;
  // Accepts both "#x" and "#X", matching what ENTITY_RE_1/ENTITY_RE_2 let
  // through to lookupEntity.
  var hexEscapeRe = /^#[xX]([0-9A-Fa-f]+)$/;
  // contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
  var safeEntityNameRe = /^[A-Za-z][A-Za-z0-9]+$/;
  // Used as a hook to invoke the browser's entity parsing. <textarea> is used
  // because its content is parsed for entities but not tags.
  // TODO(kpreid): This retrieval is a kludge and leads to silent loss of
  // functionality if the document isn't available.
  var entityLookupElement =
      ('undefined' !== typeof window && window['document'])
      ? window['document'].createElement('textarea') : null;

  /**
   * Converts a numeric character reference value to a string.
   *
   * Values up to 0xFFFF are passed to String.fromCharCode unchanged; values
   * in the supplementary planes are encoded as a surrogate pair (a bare
   * String.fromCharCode would silently truncate them to 16 bits); anything
   * above U+10FFFF becomes U+FFFD.
   *
   * @param {number} codePoint a non-negative integer.
   * @return {string}
   */
  function codePointToString(codePoint) {
    if (codePoint <= 0xFFFF) {
      return String.fromCharCode(codePoint);
    }
    if (codePoint <= 0x10FFFF) {
      codePoint -= 0x10000;
      return String.fromCharCode(
          0xD800 + (codePoint >> 10),
          0xDC00 + (codePoint & 0x3FF));
    }
    return '\uFFFD';
  }

  /**
   * Decodes an HTML entity.
   *
   * {\@updoc
   * $ lookupEntity('lt')
   * # '<'
   * $ lookupEntity('GT')
   * # '>'
   * $ lookupEntity('amp')
   * # '&'
   * $ lookupEntity('nbsp')
   * # '\xA0'
   * $ lookupEntity('apos')
   * # "'"
   * $ lookupEntity('quot')
   * # '"'
   * $ lookupEntity('#xa')
   * # '\n'
   * $ lookupEntity('#10')
   * # '\n'
   * $ lookupEntity('#x0a')
   * # '\n'
   * $ lookupEntity('#010')
   * # '\n'
   * $ lookupEntity('#x00A')
   * # '\n'
   * $ lookupEntity('Pi')      // Known failure
   * # '\u03A0'
   * $ lookupEntity('pi')      // Known failure
   * # '\u03C0'
   * }
   *
   * @param {string} name the content between the '&' and the ';'.
   * @return {string} a single unicode code-point as a string, or the
   *     original '&name;' text when the entity cannot be resolved.
   */
  function lookupEntity(name) {
    // TODO: entity lookup as specified by HTML5 actually depends on the
    // presence of the ";".
    if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
    var m = name.match(decimalEscapeRe);
    if (m) {
      return codePointToString(parseInt(m[1], 10));
    } else if (!!(m = name.match(hexEscapeRe))) {
      return codePointToString(parseInt(m[1], 16));
    } else if (entityLookupElement && safeEntityNameRe.test(name)) {
      // Let the browser decode the named entity, then cache the answer.
      entityLookupElement.innerHTML = '&' + name + ';';
      var text = entityLookupElement.textContent;
      ENTITIES[name] = text;
      return text;
    } else {
      return '&' + name + ';';
    }
  }

  /** String.replace callback: decodes the entity captured as group 1. */
  function decodeOneEntity(_, name) {
    return lookupEntity(name);
  }

  var nulRe = /\0/g;
  /** Removes every NUL character from s. */
  function stripNULs(s) {
    return s.replace(nulRe, '');
  }

  var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
  var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;

  /**
   * The plain text of a chunk of HTML CDATA which possibly contains
   * entities.
   *
   * {\@updoc
   * $ unescapeEntities('')
   * # ''
   * $ unescapeEntities('hello World!')
   * # 'hello World!'
   * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
   * # '1 < 2 && 4 > 3\n'
   * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
   * # '<&lt <- unfinished entity>'
   * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
   * # '/foo?bar=baz&copy=true'
   * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
   * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
   * }
   *
   * @param {string} s a chunk of HTML CDATA.  It must not start or end inside
   *     an HTML entity.
   * @return {string} s with every ';'-terminated entity decoded.
   */
  function unescapeEntities(s) {
    return s.replace(ENTITY_RE_1, decodeOneEntity);
  }

  var ampRe = /&/g;
  var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
  var ltRe = /[<]/g;
  var gtRe = />/g;
  var quotRe = /\"/g;

  /**
   * Escapes HTML special characters in attribute values.
   *
   * The result is written between double quotes by the sanitizer, so '&',
   * '<', '>' and '"' must all be replaced by character references.
   *
   * {\@updoc
   * $ escapeAttrib('')
   * # ''
   * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
   * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
   * $ escapeAttrib('Hello <World>!')
   * # 'Hello &lt;World&gt;!'
   * }
   *
   * @param {*} s the value to escape; it is coerced to a string.
   * @return {string}
   */
  function escapeAttrib(s) {
    // '&' must be handled first so the entities added below are not
    // double-escaped.
    return ('' + s).replace(ampRe, '&amp;').replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;').replace(quotRe, '&#34;');
  }

  /**
   * Escape entities in RCDATA that can be escaped without changing the meaning.
   * {\@updoc
   * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
   * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
   * }
   *
   * @param {string} rcdata
   * @return {string}
   */
  function normalizeRCData(rcdata) {
    return rcdata
        .replace(looseAmpRe, '&amp;$1')
        .replace(ltRe, '&lt;')
        .replace(gtRe, '&gt;');
  }

  // TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html

  // We initially split input so that potentially meaningful characters
  // like '<' and '>' are separate tokens, using a fast dumb process that
  // ignores quoting.  Then we walk that token stream, and when we see a
  // '<' that's the start of a tag, we use ATTR_RE to extract tag
  // attributes from the next token.  That token will never have a '>'
  // character.  However, it might have an unbalanced quote character, and
  // when we see that, we combine additional tokens to balance the quote.

  var ATTR_RE = new RegExp(
    '^\\s*' +
    '([-.:\\w]+)' +             // 1 = Attribute name
    '(?:' + (
      '\\s*(=)\\s*' +           // 2 = Is there a value?
      '(' + (                   // 3 = Attribute value
        // TODO(felix8a): maybe use backref to match quotes
        '(\")[^\"]*(\"|$)' +    // 4, 5 = Double-quoted string
        '|' +
        '(\')[^\']*(\'|$)' +    // 6, 7 = Single-quoted string
        '|' +
        // Positive lookahead to prevent interpretation of
        // <foo a= b=c> as <foo a='b=c'>
        // TODO(felix8a): might be able to drop this case
        '(?=[a-z][-\\w]*\\s*=)' +
        '|' +
        // Unquoted value that isn't an attribute name
        // (since we didn't match the positive lookahead above)
        '[^\"\'\\s]*' ) +
      ')' ) +
    ')?',
    'i');

  // false on IE<=8, true on most other browsers
  var splitWillCapture = ('a,b'.split(/(,)/).length === 3);

  // bitmask for tags with special parsing, like <script> and <textarea>
  var EFLAGS_TEXT = html4.eflags['CDATA'] | html4.eflags['RCDATA'];

  /**
   * Given a SAX-like event handler, produce a function that feeds those
   * events and a parameter to the event handler.
   *
   * The event handler has the form:{@code
   * {
   *   // Name is a lower-case HTML tag name.  Attribs is an array of
   *   // alternating lower-case attribute names, and attribute values.
   *   // Param is the value passed to the saxParser.
   *   startTag: function (name, attribs, param) { ... },
   *   endTag:   function (name, param) { ... },
   *   pcdata:   function (text, param) { ... },
   *   rcdata:   function (text, param) { ... },
   *   cdata:    function (text, param) { ... },
   *   comment:  function (text, param) { ... },
   *   startDoc: function (param) { ... },
   *   endDoc:   function (param) { ... }
   * }}
   *
   * Every handler except startDoc and endDoc additionally receives the
   * continuation marker and a continuation function as trailing arguments.
   * Text passed to pcdata is already HTML-escaped.
   *
   * @param {Object} handler a record containing event handlers.
   * @return {function(string, Object)} A function that takes a chunk of HTML
   *     and a parameter.  The parameter is passed on to the handler methods.
   */
  function makeSaxParser(handler) {
    // Accept quoted or unquoted keys (Closure compat)
    var hcopy = {
      cdata: handler.cdata || handler['cdata'],
      comment: handler.comment || handler['comment'],
      endDoc: handler.endDoc || handler['endDoc'],
      endTag: handler.endTag || handler['endTag'],
      pcdata: handler.pcdata || handler['pcdata'],
      rcdata: handler.rcdata || handler['rcdata'],
      startDoc: handler.startDoc || handler['startDoc'],
      startTag: handler.startTag || handler['startTag']
    };
    return function(htmlText, param) {
      return parse(htmlText, hcopy, param);
    };
  }

  // Parsing strategy is to split input into parts that might be lexically
  // meaningful (every ">" becomes a separate part), and then recombine
  // parts if we discover they're in a different context.

  // TODO(felix8a): Significant performance regressions from -legacy,
  // tested on
  //    Chrome 18.0
  //    Firefox 11.0
  //    IE 6, 7, 8, 9
  //    Opera 11.61
  //    Safari 5.1.3
  // Many of these are unusual patterns that are linearly slower and still
  // pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.

  // TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
  // browsers.  The hotspot is htmlSplit.

  // TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
  // This is partly htmlSplit, but the hotspot is parseTagAndAttrs.

  // TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
  // "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.

  // TODO(felix8a): "<p<p<p..." is slower on IE[6-8]

  // Sentinel: a handler may throw this object to stop parseCPS quietly.
  var continuationMarker = {};

  /**
   * Tokenizes htmlText and runs the SAX event loop over it from the start.
   */
  function parse(htmlText, handler, param) {
    var parts = htmlSplit(htmlText);
    var state = {
      noMoreGT: false,
      noMoreEndComments: false
    };
    parseCPS(handler, parts, 0, state, param);
  }

  /**
   * Returns a thunk that resumes parseCPS at token index `initial`.
   */
  function continuationMaker(h, parts, initial, state, param) {
    return function () {
      parseCPS(h, parts, initial, state, param);
    };
  }

  /**
   * The SAX event loop.  Walks parts[initial..] and reports events to h.
   * Markup characters that turn out not to start real markup are reported
   * to h.pcdata in escaped form so that they can never be re-emitted as
   * live markup.  An exception equal to continuationMarker ends the loop
   * silently; any other exception propagates.
   */
  function parseCPS(h, parts, initial, state, param) {
    try {
      if (h.startDoc && initial === 0) { h.startDoc(param); }
      var m, p, tagName;
      for (var pos = initial, end = parts.length; pos < end;) {
        var current = parts[pos++];
        var next = parts[pos];
        switch (current) {
        case '&':
          if (ENTITY_RE_2.test(next)) {
            if (h.pcdata) {
              h.pcdata('&' + next, param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
            pos++;
          } else {
            if (h.pcdata) {
              h.pcdata("&amp;", param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
          break;
        case '<\/':
          if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
            if (m[0].length === next.length && parts[pos + 1] === '>') {
              // fast case, no attribute parsing needed
              pos += 2;
              tagName = m[1].toLowerCase();
              if (h.endTag) {
                h.endTag(tagName, param, continuationMarker,
                  continuationMaker(h, parts, pos, state, param));
              }
            } else {
              // slow case, need to parse attributes
              // TODO(felix8a): do we really care about misparsing this?
              pos = parseEndTag(
                parts, pos, h, param, continuationMarker, state);
            }
          } else {
            if (h.pcdata) {
              h.pcdata('&lt;/', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
          break;
        case '<':
          if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
            if (m[0].length === next.length && parts[pos + 1] === '>') {
              // fast case, no attribute parsing needed
              pos += 2;
              tagName = m[1].toLowerCase();
              if (h.startTag) {
                h.startTag(tagName, [], param, continuationMarker,
                  continuationMaker(h, parts, pos, state, param));
              }
              // tags like <script> and <textarea> have special parsing
              var eflags = html4.ELEMENTS[tagName];
              if (eflags & EFLAGS_TEXT) {
                var tag = { name: tagName, next: pos, eflags: eflags };
                pos = parseText(
                  parts, tag, h, param, continuationMarker, state);
              }
            } else {
              // slow case, need to parse attributes
              pos = parseStartTag(
                parts, pos, h, param, continuationMarker, state);
            }
          } else {
            if (h.pcdata) {
              h.pcdata('&lt;', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
          break;
        case '<\!--':
          // The pathological case is n copies of '<\!--' without '-->', and
          // repeated failure to find '-->' is quadratic.  We avoid that by
          // remembering when search for '-->' fails.
          if (!state.noMoreEndComments) {
            // A comment <\!--x--> is split into three tokens:
            //   '<\!--', 'x--', '>'
            // We want to find the next '>' token that has a preceding '--'.
            // pos is at the 'x--'.
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
            }
            if (p < end) {
              if (h.comment) {
                var comment = parts.slice(pos, p).join('');
                h.comment(
                  comment.substr(0, comment.length - 2), param,
                  continuationMarker,
                  continuationMaker(h, parts, p + 1, state, param));
              }
              pos = p + 1;
            } else {
              state.noMoreEndComments = true;
            }
          }
          if (state.noMoreEndComments) {
            if (h.pcdata) {
              h.pcdata('&lt;!--', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
          break;
        case '<\!':
          if (!/^\w/.test(next)) {
            if (h.pcdata) {
              h.pcdata('&lt;!', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          } else {
            // similar to noMoreEndComment logic
            if (!state.noMoreGT) {
              for (p = pos + 1; p < end; p++) {
                if (parts[p] === '>') { break; }
              }
              if (p < end) {
                pos = p + 1;
              } else {
                state.noMoreGT = true;
              }
            }
            if (state.noMoreGT) {
              if (h.pcdata) {
                h.pcdata('&lt;!', param, continuationMarker,
                  continuationMaker(h, parts, pos, state, param));
              }
            }
          }
          break;
        case '<?':
          // similar to noMoreEndComment logic
          if (!state.noMoreGT) {
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>') { break; }
            }
            if (p < end) {
              pos = p + 1;
            } else {
              state.noMoreGT = true;
            }
          }
          if (state.noMoreGT) {
            if (h.pcdata) {
              h.pcdata('&lt;?', param, continuationMarker,
                continuationMaker(h, parts, pos, state, param));
            }
          }
          break;
        case '>':
          if (h.pcdata) {
            h.pcdata("&gt;", param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
          break;
        case '':
          break;
        default:
          if (h.pcdata) {
            h.pcdata(current, param, continuationMarker,
              continuationMaker(h, parts, pos, state, param));
          }
          break;
        }
      }
      if (h.endDoc) { h.endDoc(param); }
    } catch (e) {
      if (e !== continuationMarker) { throw e; }
    }
  }

  // Split str into parts for the html parser.
  function htmlSplit(str) {
    // can't hoist this out of the function because of the re.exec loop.
    var re = /(<\/|<\!--|<[!?]|[&<>])/g;
    str += '';
    if (splitWillCapture) {
      return str.split(re);
    } else {
      // Emulate a capturing split for engines that drop the captures.
      var parts = [];
      var lastPos = 0;
      var m;
      while ((m = re.exec(str)) !== null) {
        parts.push(str.substring(lastPos, m.index));
        parts.push(m[0]);
        lastPos = m.index + m[0].length;
      }
      parts.push(str.substring(lastPos));
      return parts;
    }
  }

  /**
   * Slow path for an end tag that carries extra text before its '>'.
   * @return {number} index of the token after the tag, or parts.length if
   *     the tag is never closed.
   */
  function parseEndTag(parts, pos, h, param, continuationMarker, state) {
    var tag = parseTagAndAttrs(parts, pos);
    // drop unclosed tags
    if (!tag) { return parts.length; }
    if (h.endTag) {
      h.endTag(tag.name, param, continuationMarker,
        continuationMaker(h, parts, pos, state, param));
    }
    return tag.next;
  }

  /**
   * Slow path for a start tag with attributes.
   * @return {number} index of the token at which parsing should resume, or
   *     parts.length if the tag is never closed.
   */
  function parseStartTag(parts, pos, h, param, continuationMarker, state) {
    var tag = parseTagAndAttrs(parts, pos);
    // drop unclosed tags
    if (!tag) { return parts.length; }
    if (h.startTag) {
      h.startTag(tag.name, tag.attrs, param, continuationMarker,
        continuationMaker(h, parts, tag.next, state, param));
    }
    // tags like <script> and <textarea> have special parsing
    if (tag.eflags & EFLAGS_TEXT) {
      return parseText(parts, tag, h, param, continuationMarker, state);
    } else {
      return tag.next;
    }
  }

  // Cache of end-tag matchers, keyed by lower-case tag name.
  var endTagRe = {};

  // Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
  // which means everything is text until we see the correct closing tag.
  function parseText(parts, tag, h, param, continuationMarker, state) {
    var end = parts.length;
    if (!endTagRe.hasOwnProperty(tag.name)) {
      endTagRe[tag.name] = new RegExp('^' + tag.name + '(?:[\\s\\/]|$)', 'i');
    }
    var re = endTagRe[tag.name];
    var first = tag.next;
    var p = tag.next + 1;
    for (; p < end; p++) {
      if (parts[p - 1] === '<\/' && re.test(parts[p])) { break; }
    }
    // Step back onto the '<\/' token so the main loop sees the end tag.
    if (p < end) { p -= 1; }
    var buf = parts.slice(first, p).join('');
    if (tag.eflags & html4.eflags['CDATA']) {
      if (h.cdata) {
        h.cdata(buf, param, continuationMarker,
          continuationMaker(h, parts, p, state, param));
      }
    } else if (tag.eflags & html4.eflags['RCDATA']) {
      if (h.rcdata) {
        h.rcdata(normalizeRCData(buf), param, continuationMarker,
          continuationMaker(h, parts, p, state, param));
      }
    } else {
      throw new Error('bug');
    }
    return p;
  }

  // at this point, parts[pos-1] is either "<" or "<\/".
  function parseTagAndAttrs(parts, pos) {
    var m = /^([-\w:]+)/.exec(parts[pos]);
    var tag = {};
    tag.name = m[1].toLowerCase();
    tag.eflags = html4.ELEMENTS[tag.name];
    var buf = parts[pos].substr(m[0].length);
    // Find the next '>'.  We optimistically assume this '>' is not in a
    // quoted context, and further down we fix things up if it turns out to
    // be quoted.
    var p = pos + 1;
    var end = parts.length;
    for (; p < end; p++) {
      if (parts[p] === '>') { break; }
      buf += parts[p];
    }
    if (end <= p) { return void 0; }
    var attrs = [];
    while (buf !== '') {
      m = ATTR_RE.exec(buf);
      if (!m) {
        // No attribute found: skip garbage
        buf = buf.replace(/^[\s\S][^a-z\s]*/, '');

      } else if ((m[4] && !m[5]) || (m[6] && !m[7])) {
        // Unterminated quote: slurp to the next unquoted '>'
        var quote = m[4] || m[6];
        var sawQuote = false;
        var abuf = [buf, parts[p++]];
        for (; p < end; p++) {
          if (sawQuote) {
            if (parts[p] === '>') { break; }
          } else if (0 <= parts[p].indexOf(quote)) {
            sawQuote = true;
          }
          abuf.push(parts[p]);
        }
        // Slurp failed: lose the garbage
        if (end <= p) { break; }
        // Otherwise retry attribute parsing
        buf = abuf.join('');
        continue;

      } else {
        // We have an attribute
        var aName = m[1].toLowerCase();
        var aValue = m[2] ? decodeValue(m[3]) : '';
        attrs.push(aName, aValue);
        buf = buf.substr(m[0].length);
      }
    }
    tag.attrs = attrs;
    tag.next = p + 1;
    return tag;
  }

  /**
   * Strips surrounding quotes (if any) and NULs from a raw attribute value
   * and decodes its entities.
   */
  function decodeValue(v) {
    var q = v.charCodeAt(0);
    if (q === 0x22 || q === 0x27) { // " or '
      v = v.substr(1, v.length - 2);
    }
    return unescapeEntities(stripNULs(v));
  }

  /**
   * Returns a function that strips unsafe tags and attributes from html.
   * @param {function(string, Array.<string>): ?Array.<string>} tagPolicy
   *     A function that takes (tagName, attribs[]), where tagName is a key in
   *     html4.ELEMENTS and attribs is an array of alternating attribute names
   *     and values.  It should return a record (as follows), or null to delete
   *     the element.  It's okay for tagPolicy to modify the attribs array,
   *     but the same array is reused, so it should not be held between calls.
   *     Record keys:
   *        attribs: (required) Sanitized attributes array.
   *        tagName: Replacement tag name.
   * @return {function(string, Array)} A function that sanitizes a string of
   *     HTML and appends result strings to the second argument, an array.
   */
  function makeHtmlSanitizer(tagPolicy) {
    var stack;
    var ignoring;
    var emit = function (text, out) {
      if (!ignoring) { out.push(text); }
    };
    return makeSaxParser({
      'startDoc': function(_) {
        stack = [];
        ignoring = false;
      },
      'startTag': function(tagNameOrig, attribs, out) {
        if (ignoring) { return; }
        if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; }
        var eflagsOrig = html4.ELEMENTS[tagNameOrig];
        if (eflagsOrig & html4.eflags['FOLDABLE']) {
          return;
        }

        var decision = tagPolicy(tagNameOrig, attribs);
        if (!decision) {
          // Rejected: swallow everything up to the next end tag unless the
          // element is an EMPTY one.
          ignoring = !(eflagsOrig & html4.eflags['EMPTY']);
          return;
        } else if (typeof decision !== 'object') {
          throw new Error('tagPolicy did not return object (old API?)');
        }
        if ('attribs' in decision) {
          attribs = decision['attribs'];
        } else {
          throw new Error('tagPolicy gave no attribs');
        }
        var eflagsRep;
        var tagNameRep;
        if ('tagName' in decision) {
          tagNameRep = decision['tagName'];
          eflagsRep = html4.ELEMENTS[tagNameRep];
        } else {
          tagNameRep = tagNameOrig;
          eflagsRep = eflagsOrig;
        }
        // TODO(mikesamuel): relying on tagPolicy not to insert unsafe
        // attribute names.

        // If this is an optional-end-tag element and either this element or its
        // previous like sibling was rewritten, then insert a close tag to
        // preserve structure.
        if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
          var onStack = stack[stack.length - 1];
          if (onStack && onStack.orig === tagNameOrig &&
              (onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
            out.push('<\/', onStack.rep, '>');
          }
        }

        if (!(eflagsOrig & html4.eflags['EMPTY'])) {
          stack.push({orig: tagNameOrig, rep: tagNameRep});
        }

        out.push('<', tagNameRep);
        for (var i = 0, n = attribs.length; i < n; i += 2) {
          var attribName = attribs[i],
              value = attribs[i + 1];
          if (value !== null && value !== void 0) {
            out.push(' ', attribName, '="', escapeAttrib(value), '"');
          }
        }
        out.push('>');

        if ((eflagsOrig & html4.eflags['EMPTY'])
            && !(eflagsRep & html4.eflags['EMPTY'])) {
          // replacement is non-empty, synthesize end tag
          out.push('<\/', tagNameRep, '>');
        }
      },
      'endTag': function(tagName, out) {
        if (ignoring) {
          ignoring = false;
          return;
        }
        if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
        var eflags = html4.ELEMENTS[tagName];
        if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
          var index;
          if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
            for (index = stack.length; --index >= 0;) {
              var stackElOrigTag = stack[index].orig;
              if (stackElOrigTag === tagName) { break; }
              if (!(html4.ELEMENTS[stackElOrigTag] &
                    html4.eflags['OPTIONAL_ENDTAG'])) {
                // Don't pop non optional end tags looking for a match.
                return;
              }
            }
          } else {
            for (index = stack.length; --index >= 0;) {
              if (stack[index].orig === tagName) { break; }
            }
          }
          if (index < 0) { return; }  // Not opened.
          for (var i = stack.length; --i > index;) {
            var stackElRepTag = stack[i].rep;
            if (!(html4.ELEMENTS[stackElRepTag] &
                  html4.eflags['OPTIONAL_ENDTAG'])) {
              out.push('<\/', stackElRepTag, '>');
            }
          }
          if (index < stack.length) {
            tagName = stack[index].rep;
          }
          stack.length = index;
          out.push('<\/', tagName, '>');
        }
      },
      'pcdata': emit,
      'rcdata': emit,
      'cdata': emit,
      'endDoc': function(out) {
        // Close whatever is still open.
        for (; stack.length; stack.length--) {
          out.push('<\/', stack[stack.length - 1].rep, '>');
        }
      }
    });
  }

  var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;

  /**
   * Runs uri through naiveUriRewriter if it has no scheme or one of the
   * ALLOWED_URI_SCHEMES.
   * @return {?string} the rewritten URI, or null if there is no rewriter,
   *     the URI does not parse, its scheme is not allowed, the rewriter
   *     rejects it, or anything throws along the way.
   */
  function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
    if (!naiveUriRewriter) { return null; }
    try {
      var parsed = URI.parse('' + uri);
      if (parsed) {
        if (!parsed.hasScheme() ||
            ALLOWED_URI_SCHEMES.test(parsed.getScheme())) {
          var safe = naiveUriRewriter(parsed, effect, ltype, hints);
          return safe ? safe.toString() : null;
        }
      }
    } catch (e) {
      // Fail closed: a URI that cannot be vetted is dropped.
      return null;
    }
    return null;
  }

  /**
   * Reports a sanitizer decision to logger: a removed tag when attribName is
   * falsy, and an added/changed/removed attribute when the values differ.
   */
  function log(logger, tagName, attribName, oldValue, newValue) {
    if (!attribName) {
      logger(tagName + " removed", {
        change: "removed",
        tagName: tagName
      });
    }
    if (oldValue !== newValue) {
      var changed = "changed";
      if (oldValue && !newValue) {
        changed = "removed";
      } else if (!oldValue && newValue)  {
        changed = "added";
      }
      logger(tagName + "." + attribName + " " + changed, {
        change: changed,
        tagName: tagName,
        attribName: attribName,
        oldValue: oldValue,
        newValue: newValue
      });
    }
  }

  /**
   * Looks up 'tagName::attribName' in map, falling back to '*::attribName'.
   * @return {*} the mapped value, or undefined when neither key is present.
   */
  function lookupAttribute(map, tagName, attribName) {
    var attribKey;
    attribKey = tagName + '::' + attribName;
    if (map.hasOwnProperty(attribKey)) {
      return map[attribKey];
    }
    attribKey = '*::' + attribName;
    if (map.hasOwnProperty(attribKey)) {
      return map[attribKey];
    }
    return void 0;
  }
  function getAttributeType(tagName, attribName) {
    return lookupAttribute(html4.ATTRIBS, tagName, attribName);
  }
  function getLoaderType(tagName, attribName) {
    return lookupAttribute(html4.LOADERTYPES, tagName, attribName);
  }
  function getUriEffect(tagName, attribName) {
    return lookupAttribute(html4.URIEFFECTS, tagName, attribName);
  }

  /**
   * Sanitizes attributes on an HTML tag.
   * @param {string} tagName An HTML tag name in lowercase.
   * @param {Array.<?string>} attribs An array of alternating names and values.
   *     It is modified in place.
   * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
   *     apply to URI attributes; it can return a new string value, or null to
   *     delete the attribute.  If unspecified, URI attributes are deleted.
   * @param {function(string, string, ?string): ?string} opt_nmTokenPolicy
   *     Called as opt_nmTokenPolicy(tagName, attribName, value) for every
   *     attribute.  A truthy result replaces the value and bypasses all
   *     further sanitization of that attribute; a falsy result lets the
   *     normal type-based sanitization run.
   * @param {function(string, Object)=} opt_logger Receives a message and a
   *     change record whenever an attribute is inspected by the type-based
   *     sanitizer and its value changes.
   * @return {Array.<?string>} The sanitized attributes as a list of alternating
   *     names and values, where a null value means to omit the attribute.
   */
  function sanitizeAttribs(tagName, attribs,
    opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
    // TODO(felix8a): it's obnoxious that domado duplicates much of this
    // TODO(felix8a): maybe consistently enforce constraints like target=
    for (var i = 0; i < attribs.length; i += 2) {
      var attribName = attribs[i];
      var value = attribs[i + 1];
      var oldValue = value;
      var atype = null, attribKey;
      if ((attribKey = tagName + '::' + attribName,
           html4.ATTRIBS.hasOwnProperty(attribKey)) ||
          (attribKey = '*::' + attribName,
           html4.ATTRIBS.hasOwnProperty(attribKey))) {
        atype = html4.ATTRIBS[attribKey];
      }

      // Discourse modification: give us more flexibility with whitelists
      if (opt_nmTokenPolicy) {
        var newValue = opt_nmTokenPolicy(tagName, attribName, value);
        if (newValue) {
          attribs[i + 1] = newValue;
          continue;
        }
      }

      if (atype !== null) {
        switch (atype) {
          case html4.atype['NONE']: break;
          case html4.atype['SCRIPT']:
            value = null;
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
          case html4.atype['STYLE']:
            if ('undefined' === typeof parseCssDeclarations) {
              // No CSS parser loaded: inline styles cannot be vetted.
              value = null;
              if (opt_logger) {
                log(opt_logger, tagName, attribName, oldValue, value);
              }
              break;
            }
            var sanitizedDeclarations = [];
            parseCssDeclarations(
                value,
                {
                  'declaration': function (property, tokens) {
                    var normProp = property.toLowerCase();
                    sanitizeCssProperty(
                        normProp, tokens,
                        opt_naiveUriRewriter
                        ? function (url) {
                            return safeUri(
                                url, html4.ueffects.SAME_DOCUMENT,
                                html4.ltypes.SANDBOXED,
                                {
                                  "TYPE": "CSS",
                                  "CSS_PROP": normProp
                                }, opt_naiveUriRewriter);
                          }
                        : null);
                    if (tokens.length) {
                      sanitizedDeclarations.push(
                          normProp + ': ' + tokens.join(' '));
                    }
                  }
                });
            value = sanitizedDeclarations.length > 0 ?
              sanitizedDeclarations.join(' ; ') : null;
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
          case html4.atype['URI']:
            value = safeUri(value,
              getUriEffect(tagName, attribName),
              getLoaderType(tagName, attribName),
              {
                "TYPE": "MARKUP",
                "XML_ATTR": attribName,
                "XML_TAG": tagName
              }, opt_naiveUriRewriter);
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
          case html4.atype['URI_FRAGMENT']:
            // Only values that start with '#' are kept; they pass through
            // unchanged.
            if (!(value && '#' === value.charAt(0))) {
              value = null;
            }
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
          default:
            value = null;
            if (opt_logger) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
        }
      } else {
        // Attribute is not in the schema at all.
        value = null;
        if (opt_logger) {
          log(opt_logger, tagName, attribName, oldValue, value);
        }
      }
      attribs[i + 1] = value;
    }
    return attribs;
  }

  /**
   * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
   * and applies the default attribute sanitizer with the supplied policy for
   * URI attributes and NMTOKEN attributes.  Tags listed in
   * html4.REQUIREDATTRIBUTES are also omitted when any of their required
   * attributes is missing or empty after sanitization.
   * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
   *     apply to URI attributes.  If not given, URI attributes are deleted.
   * @param {function(string, string, ?string): ?string} opt_nmTokenPolicy
   *     Passed through to sanitizeAttribs.
   * @param {function(string, Object)=} opt_logger Passed through to
   *     sanitizeAttribs, and notified when an UNSAFE tag is removed.
   * @return {function(string, Array.<?string>)} A tagPolicy suitable for
   *     passing to html.sanitize.
   */
  function makeTagPolicy(
    opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
    return function(tagName, attribs) {
      if (!(html4.ELEMENTS[tagName] & html4.eflags['UNSAFE'])) {
        var sanitizedAttribs = sanitizeAttribs(tagName, attribs,
          opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);

        var requiredAttributes = html4.REQUIREDATTRIBUTES[tagName];
        if (requiredAttributes &&
            missRequiredAttributes(sanitizedAttribs, requiredAttributes)) {
          return;
        }

        return {
          'attribs': sanitizedAttribs
        };
      } else {
        if (opt_logger) {
          log(opt_logger, tagName, undefined, undefined, undefined);
        }
      }
    };
  }

  /**
   * @return {boolean} true unless the number of name/value pairs whose name
   *     is in requiredAttributes and whose value is non-empty equals
   *     requiredAttributes.length.
   */
  function missRequiredAttributes(sanitizedAttributes, requiredAttributes) {
    var requiredAttributesWithValueCount = 0;

    for (var i = 0, length = sanitizedAttributes.length; i < length; i += 2) {
      var name = sanitizedAttributes[i];
      var value = sanitizedAttributes[i + 1];

      if (requiredAttributes.indexOf(name) > -1 && value && value.length > 0) {
        requiredAttributesWithValueCount++;
      }
    }

    return requiredAttributesWithValueCount !== requiredAttributes.length;
  }

  /**
   * Sanitizes HTML tags and attributes according to a given policy.
   * @param {string} inputHtml The HTML to sanitize.
   * @param {function(string, Array.<?string>)} tagPolicy A function that
   *     decides which tags to accept and sanitizes their attributes (see
   *     makeHtmlSanitizer above for details).
   * @return {string} The sanitized HTML.
   */
  function sanitizeWithPolicy(inputHtml, tagPolicy) {
    var outputArray = [];
    makeHtmlSanitizer(tagPolicy)(inputHtml, outputArray);
    return outputArray.join('');
  }

  /**
   * Strips unsafe tags and attributes from HTML.
   * @param {string} inputHtml The HTML to sanitize.
   * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
   *     apply to URI attributes.  If not given, URI attributes are deleted.
   * @param {function(string, string, ?string): ?string} opt_nmTokenPolicy
   *     Passed through to makeTagPolicy.
   * @param {function(string, Object)=} opt_logger Passed through to
   *     makeTagPolicy.
   * @return {string} The sanitized HTML.
   */
  function sanitize(inputHtml,
    opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
    var tagPolicy = makeTagPolicy(
      opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
    return sanitizeWithPolicy(inputHtml, tagPolicy);
  }

  // Export both quoted and unquoted names for Closure linkage.
  var html = {};
  html.escapeAttrib = html['escapeAttrib'] = escapeAttrib;
  html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer;
  html.makeSaxParser = html['makeSaxParser'] = makeSaxParser;
  html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy;
  html.normalizeRCData = html['normalizeRCData'] = normalizeRCData;
  html.sanitize = html['sanitize'] = sanitize;
  html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs;
  html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy;
  html.unescapeEntities = html['unescapeEntities'] = unescapeEntities;
  return html;
})(html4);

var html_sanitize = html['sanitize'];

// Exports for Closure compiler.  Note this file is also cajoled
// for domado and run in an environment without 'window'
if (typeof window !== 'undefined') {
  window['html'] = html;
  window['html_sanitize'] = html_sanitize;
}