// Mirror of https://github.com/discourse/discourse.git
// (synced 2024-11-24 07:34:18 +08:00); 2261 lines, 69 KiB, JavaScript.
// Copyright (C) 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
/**
 * @fileoverview
 * Implements RFC 3986 for parsing/formatting URIs.
 *
 * @author mikesamuel@gmail.com
 * \@provides URI
 * \@overrides window
 */
|
|
|
|
var URI = (function () {

  /**
   * Creates a URI from its string form.  The parser is relaxed, so special
   * characters that aren't escaped but don't cause ambiguities will not cause
   * parse failures.
   *
   * @param {*} uriStr the text to parse; it is coerced to a string first.
   * @return {URI|null} the parsed URI, or null when the text does not match
   *     URI_RE_ (for example a fragment that contains a line break).
   */
  function parse(uriStr) {
    var m = ('' + uriStr).match(URI_RE_);
    if (!m) { return null; }
    return new URI(
        nullIfAbsent(m[1]),   // scheme
        nullIfAbsent(m[2]),   // credentials
        nullIfAbsent(m[3]),   // domain
        nullIfAbsent(m[4]),   // port
        nullIfAbsent(m[5]),   // path
        nullIfAbsent(m[6]),   // query
        nullIfAbsent(m[7]));  // fragment
  }


  /**
   * Creates a URI from the given unencoded parts.
   *
   * @param scheme {string} an unencoded scheme such as "http" or null
   * @param credentials {string} unencoded user credentials or null
   * @param domain {string} an unencoded domain name or null
   * @param port {number} a port number; only values greater than zero are
   *     kept (no upper bound is checked here, unlike setPort).
   *     -1 indicates no port, as does null.
   * @param path {string} an unencoded path
   * @param query {Array.<string>|string|null} a list of unencoded cgi
   *     parameters where even values are keys and odds the corresponding
   *     values, or an unencoded query.
   * @param fragment {string} an unencoded fragment without the "#" or null.
   * @return {URI}
   */
  function create(scheme, credentials, domain, port, path, query, fragment) {
    var uri = new URI(
        encodeIfExists2(scheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_),
        encodeIfExists2(
            credentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_),
        encodeIfExists(domain),
        port > 0 ? port.toString() : null,
        encodeIfExists2(path, URI_DISALLOWED_IN_PATH_),
        null,
        encodeIfExists(fragment));
    if (query) {
      if ('string' === typeof query) {
        // Escape everything except query delimiters, unreserved characters
        // and '%' (so existing escapes in the string are left alone).
        uri.setRawQuery(query.replace(/[^?&=0-9A-Za-z_\-~.%]/g, encodeOne));
      } else {
        uri.setAllParameters(query);
      }
    }
    return uri;
  }

  /**
   * Percent-encodes a part with encodeURIComponent.
   *
   * @param unescapedPart {string}
   * @return {string|null} null when unescapedPart is not a string.
   */
  function encodeIfExists(unescapedPart) {
    if ('string' === typeof unescapedPart) {
      return encodeURIComponent(unescapedPart);
    }
    return null;
  }

  /**
   * If unescapedPart is a string, escapes any characters in it that aren't
   * valid characters in a url (via encodeURI) and also escapes any special
   * characters that appear in extra.
   *
   * @param unescapedPart {string}
   * @param extra {RegExp} a character set of characters in [\01-\177].
   * @return {string|null} null when unescapedPart is not a string.
   */
  function encodeIfExists2(unescapedPart, extra) {
    if ('string' === typeof unescapedPart) {
      return encodeURI(unescapedPart).replace(extra, encodeOne);
    }
    return null;
  }

  /** Converts a character in [\01-\177] to its url encoded equivalent. */
  function encodeOne(ch) {
    var n = ch.charCodeAt(0);
    return '%' + '0123456789ABCDEF'.charAt((n >> 4) & 0xf) +
        '0123456789ABCDEF'.charAt(n & 0xf);
  }

  /**
   * Decodes one raw query component.  '+' is shorthand for a space inside a
   * query string, so it has to be translated BEFORE percent-decoding;
   * otherwise an escaped literal plus ("%2B") would also become a space.
   *
   * @param rawPart {string} a still-encoded query, key or value.
   * @return {string}
   */
  function decodeQueryPart(rawPart) {
    return decodeURIComponent(rawPart.replace(/\+/g, ' '));
  }

  /**
   * Removes "." path segments and collapses runs of slashes.
   * {@updoc
   *  $ normPath('foo/./bar')
   *  # 'foo/bar'
   *  $ normPath('./foo')
   *  # 'foo'
   *  $ normPath('foo/.')
   *  # 'foo/'
   *  $ normPath('foo//bar')
   *  # 'foo/bar'
   * }
   */
  function normPath(path) {
    return path.replace(/(^|\/)\.(?:\/|$)/g, '$1').replace(/\/{2,}/g, '/');
  }

  var PARENT_DIRECTORY_HANDLER = new RegExp(
      ''
      // A path break
      + '(/|^)'
      // followed by a non .. path element
      // (cannot be . because normPath is used prior to this RegExp)
      + '(?:[^./][^/]*|\\.{2,}(?:[^./][^/]*)|\\.{3,}[^/]*)'
      // followed by .. followed by a path break.
      + '/\\.\\.(?:/|$)');

  var PARENT_DIRECTORY_HANDLER_RE = new RegExp(PARENT_DIRECTORY_HANDLER);

  // Matches the leading run of "../" segments (and a lone trailing "..")
  // that is left over once a path has been collapsed.
  var EXTRA_PARENT_PATHS_RE = /^(?:\.\.\/)*(?:\.\.$)?/;

  /**
   * Normalizes its input path and collapses all . and .. sequences except for
   * .. sequences that would take it above the root of the current parent
   * directory.
   * {@updoc
   *  $ collapse_dots('foo/../bar')
   *  # 'bar'
   *  $ collapse_dots('foo/./bar')
   *  # 'foo/bar'
   *  $ collapse_dots('foo/../bar/./../../baz')
   *  # '../baz'
   *  $ collapse_dots('../foo')
   *  # '../foo'
   *  $ collapse_dots('../foo').replace(EXTRA_PARENT_PATHS_RE, '')
   *  # 'foo'
   * }
   *
   * @param path {string|null}
   * @return {string|null} null only when path is null.
   */
  function collapse_dots(path) {
    if (path === null) { return null; }
    var p = normPath(path);
    // Only /../ left to flatten
    var r = PARENT_DIRECTORY_HANDLER_RE;
    // We replace with $1 which matches a / before the .. because this
    // guarantees that:
    // (1) we have at most 1 / between the adjacent place,
    // (2) always have a slash if there is a preceding path section, and
    // (3) we never turn a relative path into an absolute path.
    var q;
    while ((q = p.replace(r, '$1')) !== p) {
      p = q;
    }
    return p;
  }

  /**
   * Resolves a relative url against a base uri.  Neither argument is
   * modified; the result is a modified clone of baseUri.
   *
   * @param baseUri {URI}
   * @param relativeUri {URI}
   * @return {URI}
   */
  function resolve(baseUri, relativeUri) {
    // there are several kinds of relative urls:
    // 1. //foo - replaces everything from the domain on.  foo is a domain name
    // 2. foo - replaces the last part of the path, the whole query and fragment
    // 3. /foo - replaces the path, the query and fragment
    // 4. ?foo - replace the query and fragment
    // 5. #foo - replace the fragment only

    var absoluteUri = baseUri.clone();
    // we satisfy these conditions by looking for the first part of relativeUri
    // that is not blank and applying defaults to the rest

    var overridden = relativeUri.hasScheme();

    if (overridden) {
      absoluteUri.setRawScheme(relativeUri.getRawScheme());
    } else {
      overridden = relativeUri.hasCredentials();
    }

    if (overridden) {
      absoluteUri.setRawCredentials(relativeUri.getRawCredentials());
    } else {
      overridden = relativeUri.hasDomain();
    }

    if (overridden) {
      absoluteUri.setRawDomain(relativeUri.getRawDomain());
    } else {
      overridden = relativeUri.hasPort();
    }

    var rawPath = relativeUri.getRawPath();
    var simplifiedPath = collapse_dots(rawPath);
    if (overridden) {
      absoluteUri.setPort(relativeUri.getPort());
      simplifiedPath = simplifiedPath
          && simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
    } else {
      overridden = !!rawPath;
      if (overridden) {
        // resolve path properly
        if (simplifiedPath.charCodeAt(0) !== 0x2f /* / */) {  // path is relative
          var absRawPath = collapse_dots(absoluteUri.getRawPath() || '')
              .replace(EXTRA_PARENT_PATHS_RE, '');
          var slash = absRawPath.lastIndexOf('/') + 1;
          simplifiedPath = collapse_dots(
              (slash ? absRawPath.substring(0, slash) : '')
              + collapse_dots(rawPath))
              .replace(EXTRA_PARENT_PATHS_RE, '');
        }
      } else {
        simplifiedPath = simplifiedPath
            && simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
        if (simplifiedPath !== rawPath) {
          absoluteUri.setRawPath(simplifiedPath);
        }
      }
    }

    if (overridden) {
      absoluteUri.setRawPath(simplifiedPath);
    } else {
      overridden = relativeUri.hasQuery();
    }

    if (overridden) {
      absoluteUri.setRawQuery(relativeUri.getRawQuery());
    } else {
      overridden = relativeUri.hasFragment();
    }

    if (overridden) {
      absoluteUri.setRawFragment(relativeUri.getRawFragment());
    }

    return absoluteUri;
  }

  /**
   * A mutable URI.
   *
   * This class contains setters and getters for the parts of the URI.
   * The getXYZ/setXYZ methods work with the decoded part -- so
   * uri.parse('/foo%20bar').getPath() will return the decoded path,
   * "/foo bar".
   *
   * The raw versions of fields are available too.
   * uri.parse('/foo%20bar').getRawPath() will return the raw path,
   * "/foo%20bar".  Use the raw setters with care, since URI::toString is not
   * guaranteed to return a valid url if a raw setter was used.
   *
   * All setters return this and so may be chained, a la
   * uri.parse('/foo').setFragment('part').toString().
   *
   * You should not use this constructor directly -- please prefer the factory
   * functions {@link uri.parse}, {@link uri.create}, {@link uri.resolve}
   * instead.
   *
   * The parameters are all raw (assumed to be properly escaped) parts, and
   * any (but not all) may be null.  Undefined is not allowed.
   *
   * @constructor
   */
  function URI(
      rawScheme,
      rawCredentials, rawDomain, port,
      rawPath, rawQuery, rawFragment) {
    this.scheme_ = rawScheme;
    this.credentials_ = rawCredentials;
    this.domain_ = rawDomain;
    this.port_ = port;
    this.path_ = rawPath;
    this.query_ = rawQuery;
    this.fragment_ = rawFragment;
    /**
     * Decoded [key0, value0, key1, value1, ...] view of query_, built lazily
     * by checkParameterCache_ and reset whenever the query changes.
     * @type {Array|null}
     */
    this.paramCache_ = null;
  }

  /** Returns the string form of the url, assembled from the raw parts. */
  URI.prototype.toString = function () {
    var out = [];
    if (null !== this.scheme_) { out.push(this.scheme_, ':'); }
    // Credentials and port are only emitted as part of an authority, i.e.
    // when there is a domain.
    if (null !== this.domain_) {
      out.push('//');
      if (null !== this.credentials_) { out.push(this.credentials_, '@'); }
      out.push(this.domain_);
      if (null !== this.port_) { out.push(':', this.port_.toString()); }
    }
    if (null !== this.path_) { out.push(this.path_); }
    if (null !== this.query_) { out.push('?', this.query_); }
    if (null !== this.fragment_) { out.push('#', this.fragment_); }
    return out.join('');
  };

  /** Returns a new URI with the same raw parts (the parameter cache is not copied). */
  URI.prototype.clone = function () {
    return new URI(this.scheme_, this.credentials_, this.domain_, this.port_,
                   this.path_, this.query_, this.fragment_);
  };

  URI.prototype.getScheme = function () {
    // HTML5 spec does not require the scheme to be lowercased but
    // all common browsers except Safari lowercase the scheme.
    return this.scheme_ && decodeURIComponent(this.scheme_).toLowerCase();
  };
  URI.prototype.getRawScheme = function () {
    return this.scheme_;
  };
  URI.prototype.setScheme = function (newScheme) {
    this.scheme_ = encodeIfExists2(
        newScheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
    return this;
  };
  URI.prototype.setRawScheme = function (newScheme) {
    this.scheme_ = newScheme ? newScheme : null;
    return this;
  };
  URI.prototype.hasScheme = function () {
    return null !== this.scheme_;
  };


  URI.prototype.getCredentials = function () {
    return this.credentials_ && decodeURIComponent(this.credentials_);
  };
  URI.prototype.getRawCredentials = function () {
    return this.credentials_;
  };
  URI.prototype.setCredentials = function (newCredentials) {
    this.credentials_ = encodeIfExists2(
        newCredentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
    return this;
  };
  URI.prototype.setRawCredentials = function (newCredentials) {
    this.credentials_ = newCredentials ? newCredentials : null;
    return this;
  };
  URI.prototype.hasCredentials = function () {
    return null !== this.credentials_;
  };


  URI.prototype.getDomain = function () {
    return this.domain_ && decodeURIComponent(this.domain_);
  };
  URI.prototype.getRawDomain = function () {
    return this.domain_;
  };
  URI.prototype.setDomain = function (newDomain) {
    return this.setRawDomain(newDomain && encodeURIComponent(newDomain));
  };
  URI.prototype.setRawDomain = function (newDomain) {
    this.domain_ = newDomain ? newDomain : null;
    // Maintain the invariant that paths must start with a slash when the URI
    // is not path-relative.
    return this.setRawPath(this.path_);
  };
  URI.prototype.hasDomain = function () {
    return null !== this.domain_;
  };


  /** @return {string|null} the port as a string, or null when there is none. */
  URI.prototype.getPort = function () {
    return this.port_ && decodeURIComponent(this.port_);
  };
  /**
   * @param newPort a port number or numeric string; any falsy value clears
   *     the port.
   * @throws {Error} when the value is not an integer in [0, 0xffff].
   */
  URI.prototype.setPort = function (newPort) {
    if (newPort) {
      newPort = Number(newPort);
      // The mask round-trips only for integers in [0, 0xffff]; NaN and
      // fractions fail the comparison as well.
      if (newPort !== (newPort & 0xffff)) {
        throw new Error('Bad port number ' + newPort);
      }
      this.port_ = '' + newPort;
    } else {
      this.port_ = null;
    }
    return this;
  };
  URI.prototype.hasPort = function () {
    return null !== this.port_;
  };


  URI.prototype.getPath = function () {
    return this.path_ && decodeURIComponent(this.path_);
  };
  URI.prototype.getRawPath = function () {
    return this.path_;
  };
  URI.prototype.setPath = function (newPath) {
    return this.setRawPath(encodeIfExists2(newPath, URI_DISALLOWED_IN_PATH_));
  };
  URI.prototype.setRawPath = function (newPath) {
    if (newPath) {
      newPath = String(newPath);
      this.path_ =
        // Paths must start with '/' unless this is a path-relative URL.
        (!this.domain_ || /^\//.test(newPath)) ? newPath : '/' + newPath;
    } else {
      this.path_ = null;
    }
    return this;
  };
  URI.prototype.hasPath = function () {
    return null !== this.path_;
  };


  URI.prototype.getQuery = function () {
    // From http://www.w3.org/Addressing/URL/4_URI_Recommentations.html
    // Within the query string, the plus sign is reserved as shorthand notation
    // for a space.
    return this.query_ && decodeQueryPart(this.query_);
  };
  URI.prototype.getRawQuery = function () {
    return this.query_;
  };
  URI.prototype.setQuery = function (newQuery) {
    this.paramCache_ = null;
    this.query_ = encodeIfExists(newQuery);
    return this;
  };
  URI.prototype.setRawQuery = function (newQuery) {
    this.paramCache_ = null;
    this.query_ = newQuery ? newQuery : null;
    return this;
  };
  URI.prototype.hasQuery = function () {
    return null !== this.query_;
  };

  /**
   * Sets the query given a list of strings of the form
   * [ key0, value0, key1, value1, ... ].
   *
   * uri.setAllParameters(['a', 'b', 'c', 'd']).getQuery()
   * will yield 'a=b&c=d'.
   *
   * A non-array object is also accepted: its own string-valued properties
   * become the key/value pairs.  A falsy value is written as a bare key
   * with no "=".
   */
  URI.prototype.setAllParameters = function (params) {
    if (typeof params === 'object') {
      // The second clause keeps arrays created in another frame (which fail
      // both instanceof checks) from being treated as plain objects.
      if (!(params instanceof Array)
          && (params instanceof Object
              || Object.prototype.toString.call(params) !== '[object Array]')) {
        var newParams = [];
        for (var name in params) {
          if (!Object.prototype.hasOwnProperty.call(params, name)) { continue; }
          var propValue = params[name];
          if ('string' === typeof propValue) {
            newParams.push(name, propValue);
          }
        }
        params = newParams;
      }
    }
    this.paramCache_ = null;
    var queryBuf = [];
    var separator = '';
    for (var j = 0; j < params.length;) {
      var k = params[j++];
      var v = params[j++];
      queryBuf.push(separator, encodeURIComponent(k.toString()));
      separator = '&';
      if (v) {
        queryBuf.push('=', encodeURIComponent(v.toString()));
      }
    }
    this.query_ = queryBuf.join('');
    return this;
  };

  /**
   * Populates paramCache_ from the raw query if it is not already present.
   * Parameters are separated by '&' or '?'; each is split at its first '='.
   */
  URI.prototype.checkParameterCache_ = function () {
    if (this.paramCache_) { return; }
    var out = [];
    var q = this.query_;
    if (q) {
      var cgiParams = q.split(/[&\?]/);
      for (var i = 0; i < cgiParams.length; ++i) {
        var param = cgiParams[i];
        // Split by index rather than with a regexp so that values containing
        // line breaks cannot make the match fail.
        var eq = param.indexOf('=');
        var rawKey = eq < 0 ? param : param.substring(0, eq);
        var rawValue = eq < 0 ? '' : param.substring(eq + 1);
        out.push(decodeQueryPart(rawKey), decodeQueryPart(rawValue));
      }
    }
    this.paramCache_ = out;
  };

  /**
   * Sets the values of the named cgi parameters.
   *
   * So, uri.parse('foo?a=b&c=d&e=f').setParameterValues('c', ['new'])
   * yields foo?a=b&c=new&e=f.
   *
   * Existing occurrences of the key are replaced in place, surplus
   * occurrences are dropped, and surplus values are appended at the end.
   *
   * @param key {string}
   * @param values {Array.<string>} the new values.  If values is a single
   *     string then it will be treated as the sole value.
   */
  URI.prototype.setParameterValues = function (key, values) {
    // be nice and avoid subtle bugs where [] operator on string performs charAt
    // on some browsers and crashes on IE
    if (typeof values === 'string') {
      values = [ values ];
    }

    this.checkParameterCache_();
    var newValueIndex = 0;
    var pc = this.paramCache_;
    var params = [];
    for (var i = 0; i < pc.length; i += 2) {
      if (key === pc[i]) {
        if (newValueIndex < values.length) {
          params.push(key, values[newValueIndex++]);
        }
      } else {
        params.push(pc[i], pc[i + 1]);
      }
    }
    while (newValueIndex < values.length) {
      params.push(key, values[newValueIndex++]);
    }
    this.setAllParameters(params);
    return this;
  };

  /** Removes every occurrence of the named cgi parameter. */
  URI.prototype.removeParameter = function (key) {
    return this.setParameterValues(key, []);
  };

  /**
   * Returns the parameters specified in the query part of the uri as a list of
   * keys and values like [ key0, value0, key1, value1, ... ].
   *
   * @return {Array.<string>} a copy; mutating it does not affect this URI.
   */
  URI.prototype.getAllParameters = function () {
    this.checkParameterCache_();
    return this.paramCache_.slice();
  };

  /**
   * Returns the values for a given cgi parameter as a list of decoded
   * query parameter values.
   * @return {Array.<string>}
   */
  URI.prototype.getParameterValues = function (paramNameUnescaped) {
    this.checkParameterCache_();
    var values = [];
    for (var i = 0; i < this.paramCache_.length; i += 2) {
      if (paramNameUnescaped === this.paramCache_[i]) {
        values.push(this.paramCache_[i + 1]);
      }
    }
    return values;
  };

  /**
   * Returns a map of cgi parameter names to (non-empty) lists of values.
   *
   * @param paramNameUnescaped unused; kept so the signature is unchanged.
   * @return {Object.<string,Array.<string>>}
   */
  URI.prototype.getParameterMap = function (paramNameUnescaped) {
    this.checkParameterCache_();
    var paramMap = {};
    var hasOwn = Object.prototype.hasOwnProperty;
    for (var i = 0; i < this.paramCache_.length; i += 2) {
      var key = this.paramCache_[i];
      var value = this.paramCache_[i + 1];
      // Own-property test so that names such as "toString", which every
      // object inherits, are treated as not-yet-seen keys.
      if (hasOwn.call(paramMap, key)) {
        paramMap[key].push(value);
      } else {
        paramMap[key] = [value];
      }
    }
    return paramMap;
  };

  /**
   * Returns the first value for a given cgi parameter or null if the given
   * parameter name does not appear in the query string.
   * If the given parameter name does appear, but has no '=' following
   * it, then the empty string will be returned.
   * @return {string|null}
   */
  URI.prototype.getParameterValue = function (paramNameUnescaped) {
    this.checkParameterCache_();
    for (var i = 0; i < this.paramCache_.length; i += 2) {
      if (paramNameUnescaped === this.paramCache_[i]) {
        return this.paramCache_[i + 1];
      }
    }
    return null;
  };

  URI.prototype.getFragment = function () {
    return this.fragment_ && decodeURIComponent(this.fragment_);
  };
  URI.prototype.getRawFragment = function () {
    return this.fragment_;
  };
  URI.prototype.setFragment = function (newFragment) {
    this.fragment_ = newFragment ? encodeURIComponent(newFragment) : null;
    return this;
  };
  URI.prototype.setRawFragment = function (newFragment) {
    this.fragment_ = newFragment ? newFragment : null;
    return this;
  };
  URI.prototype.hasFragment = function () {
    return null !== this.fragment_;
  };

  /**
   * Maps a regexp capture to a URI part.
   * @return {string|null} matchPart when it is a non-empty string, else null.
   */
  function nullIfAbsent(matchPart) {
    return ('string' === typeof matchPart) && (matchPart.length > 0)
           ? matchPart
           : null;
  }


  /**
   * A regular expression for breaking a URI into its component parts.
   *
   * http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#RFC2234 says
   * As the "first-match-wins" algorithm is identical to the "greedy"
   * disambiguation method used by POSIX regular expressions, it is natural and
   * commonplace to use a regular expression for parsing the potential five
   * components of a URI reference.
   *
   * The following line is the regular expression for breaking-down a
   * well-formed URI reference into its components.
   *
   *      ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   *       12            3  4          5       6  7        8 9
   *
   * The numbers in the second line above are only to assist readability; they
   * indicate the reference points for each subexpression (i.e., each paired
   * parenthesis).  We refer to the value matched for subexpression n as $n.
   * For example, matching the above expression to
   *
   *      http://www.ics.uci.edu/pub/ietf/uri/#Related
   *
   * results in the following subexpression matches:
   *
   *      $1 = http:
   *      $2 = http
   *      $3 = //www.ics.uci.edu
   *      $4 = www.ics.uci.edu
   *      $5 = /pub/ietf/uri/
   *      $6 = (undefined)
   *      $7 = (undefined)
   *      $8 = #Related
   *      $9 = Related
   *
   * where (undefined) indicates that the component is not present, as is the
   * case for the query component in the above example.  Therefore, we can
   * determine the value of the five components as
   *
   *      scheme    = $2
   *      authority = $4
   *      path      = $5
   *      query     = $7
   *      fragment  = $9
   *
   * msamuel: I have modified the regular expression slightly to expose the
   * credentials, domain, and port separately from the authority.
   * The modified version yields
   *
   *      $1 = http              scheme
   *      $2 = (undefined)       credentials -\
   *      $3 = www.ics.uci.edu   domain       | authority
   *      $4 = (undefined)       port        -/
   *      $5 = /pub/ietf/uri/    path
   *      $6 = (undefined)       query without ?
   *      $7 = Related           fragment without #
   */
  var URI_RE_ = new RegExp(
      "^" +
      "(?:" +
        "([^:/?#]+)" +         // scheme
      ":)?" +
      "(?://" +
        "(?:([^/?#]*)@)?" +    // credentials
        "([^/?#:@]*)" +        // domain
        "(?::([0-9]+))?" +     // port
      ")?" +
      "([^?#]+)?" +            // path
      "(?:\\?([^#]*))?" +      // query
      "(?:#(.*))?" +           // fragment
      "$"
      );

  var URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_ = /[#\/\?@]/g;
  var URI_DISALLOWED_IN_PATH_ = /[\#\?]/g;

  URI.parse = parse;
  URI.create = create;
  URI.resolve = resolve;
  URI.collapse_dots = collapse_dots;  // Visible for testing.

  // lightweight string-based api for loadModuleMaker
  URI.utils = {
    /**
     * @param uri {string}
     * @return {string} 'text/html' when the decoded path ends in ".html",
     *     otherwise 'application/javascript' (also used when the uri cannot
     *     be parsed at all).
     */
    mimeTypeOf: function (uri) {
      var uriObj = parse(uri);
      if (uriObj !== null && /\.html$/.test(uriObj.getPath())) {
        return 'text/html';
      } else {
        return 'application/javascript';
      }
    },
    /**
     * @param base {string} base uri; when falsy, uri is returned as a string
     *     without being resolved.
     * @param uri {string}
     * @return {string}
     */
    resolve: function (base, uri) {
      if (base) {
        return resolve(parse(base), parse(uri)).toString();
      } else {
        return '' + uri;
      }
    }
  };


  return URI;
})();

// Exports for closure compiler.
if (typeof window !== 'undefined') {
  window['URI'] = URI;
}
|
|
;
|
|
// Copyright Google Inc.
// Licensed under the Apache Licence Version 2.0
// Autogenerated at Mon Oct 21 13:30:08 EDT 2013
// @overrides window
// @provides html4
|
|
var html4 = {};
|
|
html4.atype = {
|
|
'NONE': 0,
|
|
'URI': 1,
|
|
'URI_FRAGMENT': 11,
|
|
'SCRIPT': 2,
|
|
'STYLE': 3,
|
|
'HTML': 12,
|
|
'ID': 4,
|
|
'IDREF': 5,
|
|
'IDREFS': 6,
|
|
'GLOBAL_NAME': 7,
|
|
'LOCAL_NAME': 8,
|
|
'CLASSES': 9,
|
|
'FRAME_TARGET': 10,
|
|
'MEDIA_QUERY': 13
|
|
};
|
|
html4[ 'atype' ] = html4.atype;
|
|
html4.ATTRIBS = {
|
|
'*::class': 9,
|
|
'*::dir': 0,
|
|
'*::draggable': 0,
|
|
'*::hidden': 0,
|
|
'*::id': 4,
|
|
'*::inert': 0,
|
|
'*::itemprop': 0,
|
|
'*::itemref': 6,
|
|
'*::itemscope': 0,
|
|
'*::lang': 0,
|
|
'*::onblur': 2,
|
|
'*::onchange': 2,
|
|
'*::onclick': 2,
|
|
'*::ondblclick': 2,
|
|
'*::onerror': 2,
|
|
'*::onfocus': 2,
|
|
'*::onkeydown': 2,
|
|
'*::onkeypress': 2,
|
|
'*::onkeyup': 2,
|
|
'*::onload': 2,
|
|
'*::onmousedown': 2,
|
|
'*::onmousemove': 2,
|
|
'*::onmouseout': 2,
|
|
'*::onmouseover': 2,
|
|
'*::onmouseup': 2,
|
|
'*::onreset': 2,
|
|
'*::onscroll': 2,
|
|
'*::onselect': 2,
|
|
'*::onsubmit': 2,
|
|
'*::onunload': 2,
|
|
'*::spellcheck': 0,
|
|
'*::style': 3,
|
|
'*::title': 0,
|
|
'*::translate': 0,
|
|
'a::accesskey': 0,
|
|
'a::coords': 0,
|
|
'a::href': 1,
|
|
'a::hreflang': 0,
|
|
'a::name': 7,
|
|
'a::onblur': 2,
|
|
'a::onfocus': 2,
|
|
'a::shape': 0,
|
|
'a::tabindex': 0,
|
|
'a::target': 10,
|
|
'a::type': 0,
|
|
'bdo::dir': 0,
|
|
'blockquote::cite': 1,
|
|
'br::clear': 0,
|
|
'caption::align': 0,
|
|
'col::align': 0,
|
|
'col::char': 0,
|
|
'col::charoff': 0,
|
|
'col::span': 0,
|
|
'col::valign': 0,
|
|
'col::width': 0,
|
|
'colgroup::align': 0,
|
|
'colgroup::char': 0,
|
|
'colgroup::charoff': 0,
|
|
'colgroup::span': 0,
|
|
'colgroup::valign': 0,
|
|
'colgroup::width': 0,
|
|
'data::value': 0,
|
|
'del::cite': 1,
|
|
'del::datetime': 0,
|
|
'details::open': 0,
|
|
'dir::compact': 0,
|
|
'div::align': 0,
|
|
'dl::compact': 0,
|
|
'h1::align': 0,
|
|
'h2::align': 0,
|
|
'h3::align': 0,
|
|
'h4::align': 0,
|
|
'h5::align': 0,
|
|
'h6::align': 0,
|
|
'hr::align': 0,
|
|
'hr::noshade': 0,
|
|
'hr::size': 0,
|
|
'hr::width': 0,
|
|
'iframe::align': 0,
|
|
'iframe::frameborder': 0,
|
|
'iframe::height': 0,
|
|
'iframe::marginheight': 0,
|
|
'iframe::marginwidth': 0,
|
|
'iframe::width': 0,
|
|
'iframe::src': 1,
|
|
'img::align': 0,
|
|
'img::alt': 0,
|
|
'img::border': 0,
|
|
'img::height': 0,
|
|
'img::hspace': 0,
|
|
'img::ismap': 0,
|
|
'img::name': 7,
|
|
'img::src': 1,
|
|
'img::vspace': 0,
|
|
'img::width': 0,
|
|
'ins::cite': 1,
|
|
'ins::datetime': 0,
|
|
'label::accesskey': 0,
|
|
'label::for': 5,
|
|
'label::onblur': 2,
|
|
'label::onfocus': 2,
|
|
'legend::accesskey': 0,
|
|
'legend::align': 0,
|
|
'li::type': 0,
|
|
'li::value': 0,
|
|
'meter::high': 0,
|
|
'meter::low': 0,
|
|
'meter::max': 0,
|
|
'meter::min': 0,
|
|
'meter::value': 0,
|
|
'ol::compact': 0,
|
|
'ol::reversed': 0,
|
|
'ol::start': 0,
|
|
'ol::type': 0,
|
|
'p::align': 0,
|
|
'pre::width': 0,
|
|
'progress::max': 0,
|
|
'progress::min': 0,
|
|
'progress::value': 0,
|
|
'q::cite': 1,
|
|
'source::type': 0,
|
|
'track::default': 0,
|
|
'track::kind': 0,
|
|
'track::label': 0,
|
|
'track::srclang': 0,
|
|
'ul::compact': 0,
|
|
'ul::type': 0,
|
|
};
|
|
html4[ 'ATTRIBS' ] = html4.ATTRIBS;
|
|
html4.eflags = {
|
|
'OPTIONAL_ENDTAG': 1,
|
|
'EMPTY': 2,
|
|
'CDATA': 4,
|
|
'RCDATA': 8,
|
|
'UNSAFE': 16,
|
|
'FOLDABLE': 32,
|
|
'SCRIPT': 64,
|
|
'STYLE': 128,
|
|
'VIRTUALIZED': 256
|
|
};
|
|
html4[ 'eflags' ] = html4.eflags;
|
|
html4.ELEMENTS = {
|
|
'a': 0,
|
|
'abbr': 0,
|
|
'acronym': 0,
|
|
'address': 0,
|
|
'article': 0,
|
|
'aside': 0,
|
|
'b': 0,
|
|
'base': 274,
|
|
'bdi': 0,
|
|
'bdo': 0,
|
|
'big': 0,
|
|
'blockquote': 0,
|
|
'body': 305,
|
|
'br': 2,
|
|
'caption': 0,
|
|
'cite': 0,
|
|
'code': 0,
|
|
'col': 2,
|
|
'colgroup': 1,
|
|
'data': 0,
|
|
'dd': 1,
|
|
'del': 0,
|
|
'details': 0,
|
|
'dfn': 0,
|
|
'dialog': 272,
|
|
'dir': 0,
|
|
'div': 0,
|
|
'dl': 0,
|
|
'dt': 1,
|
|
'em': 0,
|
|
'figcaption': 0,
|
|
'figure': 0,
|
|
'frame': 274,
|
|
'frameset': 272,
|
|
'h1': 0,
|
|
'h2': 0,
|
|
'h3': 0,
|
|
'h4': 0,
|
|
'h5': 0,
|
|
'h6': 0,
|
|
'head': 305,
|
|
'header': 0,
|
|
'hgroup': 0,
|
|
'hr': 2,
|
|
'html': 305,
|
|
'i': 0,
|
|
'iframe': 4,
|
|
'img': 2,
|
|
'ins': 0,
|
|
'isindex': 274,
|
|
'kbd': 0,
|
|
'keygen': 274,
|
|
'label': 0,
|
|
'legend': 0,
|
|
'li': 1,
|
|
'link': 274,
|
|
'mark': 0,
|
|
'meter': 0,
|
|
'nav': 0,
|
|
'nobr': 0,
|
|
'noembed': 276,
|
|
'noframes': 276,
|
|
'noscript': 276,
|
|
'object': 272,
|
|
'ol': 0,
|
|
'p': 1,
|
|
'param': 274,
|
|
'pre': 0,
|
|
'progress': 0,
|
|
'q': 0,
|
|
's': 0,
|
|
'samp': 0,
|
|
'script': 84,
|
|
'section': 0,
|
|
'small': 0,
|
|
'span': 0,
|
|
'strike': 0,
|
|
'strong': 0,
|
|
'style': 148,
|
|
'sub': 0,
|
|
'summary': 0,
|
|
'sup': 0,
|
|
'table': 272,
|
|
'tbody': 273,
|
|
'td': 273,
|
|
'tfoot': 1,
|
|
'th': 273,
|
|
'thead': 273,
|
|
'time': 0,
|
|
'title': 280,
|
|
'tr': 273,
|
|
'track': 2,
|
|
'tt': 0,
|
|
'u': 0,
|
|
'ul': 0,
|
|
'var': 0,
|
|
'wbr': 2
|
|
};
|
|
html4[ 'ELEMENTS' ] = html4.ELEMENTS;
|
|
html4.ELEMENT_DOM_INTERFACES = {
|
|
'a': 'HTMLAnchorElement',
|
|
'abbr': 'HTMLElement',
|
|
'acronym': 'HTMLElement',
|
|
'address': 'HTMLElement',
|
|
'applet': 'HTMLAppletElement',
|
|
'area': 'HTMLAreaElement',
|
|
'article': 'HTMLElement',
|
|
'aside': 'HTMLElement',
|
|
'audio': 'HTMLAudioElement',
|
|
'b': 'HTMLElement',
|
|
'base': 'HTMLBaseElement',
|
|
'basefont': 'HTMLBaseFontElement',
|
|
'bdi': 'HTMLElement',
|
|
'bdo': 'HTMLElement',
|
|
'big': 'HTMLElement',
|
|
'blockquote': 'HTMLQuoteElement',
|
|
'body': 'HTMLBodyElement',
|
|
'br': 'HTMLBRElement',
|
|
'caption': 'HTMLTableCaptionElement',
|
|
'cite': 'HTMLElement',
|
|
'code': 'HTMLElement',
|
|
'col': 'HTMLTableColElement',
|
|
'colgroup': 'HTMLTableColElement',
|
|
'command': 'HTMLCommandElement',
|
|
'data': 'HTMLElement',
|
|
'datalist': 'HTMLDataListElement',
|
|
'dd': 'HTMLElement',
|
|
'del': 'HTMLModElement',
|
|
'details': 'HTMLDetailsElement',
|
|
'dfn': 'HTMLElement',
|
|
'dialog': 'HTMLDialogElement',
|
|
'dir': 'HTMLDirectoryElement',
|
|
'div': 'HTMLDivElement',
|
|
'dl': 'HTMLDListElement',
|
|
'dt': 'HTMLElement',
|
|
'em': 'HTMLElement',
|
|
'fieldset': 'HTMLFieldSetElement',
|
|
'figcaption': 'HTMLElement',
|
|
'figure': 'HTMLElement',
|
|
'footer': 'HTMLElement',
|
|
'form': 'HTMLFormElement',
|
|
'frame': 'HTMLFrameElement',
|
|
'frameset': 'HTMLFrameSetElement',
|
|
'h1': 'HTMLHeadingElement',
|
|
'h2': 'HTMLHeadingElement',
|
|
'h3': 'HTMLHeadingElement',
|
|
'h4': 'HTMLHeadingElement',
|
|
'h5': 'HTMLHeadingElement',
|
|
'h6': 'HTMLHeadingElement',
|
|
'head': 'HTMLHeadElement',
|
|
'header': 'HTMLElement',
|
|
'hgroup': 'HTMLElement',
|
|
'hr': 'HTMLHRElement',
|
|
'html': 'HTMLHtmlElement',
|
|
'i': 'HTMLElement',
|
|
'iframe': 'HTMLIFrameElement',
|
|
'img': 'HTMLImageElement',
|
|
'input': 'HTMLInputElement',
|
|
'ins': 'HTMLModElement',
|
|
'isindex': 'HTMLUnknownElement',
|
|
'kbd': 'HTMLElement',
|
|
'keygen': 'HTMLKeygenElement',
|
|
'label': 'HTMLLabelElement',
|
|
'legend': 'HTMLLegendElement',
|
|
'li': 'HTMLLIElement',
|
|
'link': 'HTMLLinkElement',
|
|
'map': 'HTMLMapElement',
|
|
'mark': 'HTMLElement',
|
|
'menu': 'HTMLMenuElement',
|
|
'meta': 'HTMLMetaElement',
|
|
'meter': 'HTMLMeterElement',
|
|
'nav': 'HTMLElement',
|
|
'nobr': 'HTMLElement',
|
|
'noembed': 'HTMLElement',
|
|
'noframes': 'HTMLElement',
|
|
'noscript': 'HTMLElement',
|
|
'object': 'HTMLObjectElement',
|
|
'ol': 'HTMLOListElement',
|
|
'optgroup': 'HTMLOptGroupElement',
|
|
'option': 'HTMLOptionElement',
|
|
'output': 'HTMLOutputElement',
|
|
'p': 'HTMLParagraphElement',
|
|
'param': 'HTMLParamElement',
|
|
'pre': 'HTMLPreElement',
|
|
'progress': 'HTMLProgressElement',
|
|
'q': 'HTMLQuoteElement',
|
|
's': 'HTMLElement',
|
|
'samp': 'HTMLElement',
|
|
'script': 'HTMLScriptElement',
|
|
'section': 'HTMLElement',
|
|
'select': 'HTMLSelectElement',
|
|
'small': 'HTMLElement',
|
|
'source': 'HTMLSourceElement',
|
|
'span': 'HTMLSpanElement',
|
|
'strike': 'HTMLElement',
|
|
'strong': 'HTMLElement',
|
|
'style': 'HTMLStyleElement',
|
|
'sub': 'HTMLElement',
|
|
'summary': 'HTMLElement',
|
|
'sup': 'HTMLElement',
|
|
'table': 'HTMLTableElement',
|
|
'tbody': 'HTMLTableSectionElement',
|
|
'td': 'HTMLTableDataCellElement',
|
|
'tfoot': 'HTMLTableSectionElement',
|
|
'th': 'HTMLTableHeaderCellElement',
|
|
'thead': 'HTMLTableSectionElement',
|
|
'time': 'HTMLTimeElement',
|
|
'title': 'HTMLTitleElement',
|
|
'tr': 'HTMLTableRowElement',
|
|
'track': 'HTMLTrackElement',
|
|
'tt': 'HTMLElement',
|
|
'u': 'HTMLElement',
|
|
'ul': 'HTMLUListElement',
|
|
'var': 'HTMLElement',
|
|
'video': 'HTMLVideoElement',
|
|
'wbr': 'HTMLElement'
|
|
};
|
|
html4[ 'ELEMENT_DOM_INTERFACES' ] = html4.ELEMENT_DOM_INTERFACES;
|
|
html4.ueffects = {
|
|
'NOT_LOADED': 0,
|
|
'SAME_DOCUMENT': 1,
|
|
'NEW_DOCUMENT': 2
|
|
};
|
|
html4[ 'ueffects' ] = html4.ueffects;
|
|
html4.URIEFFECTS = {
|
|
'a::href': 2,
|
|
'area::href': 2,
|
|
'audio::src': 1,
|
|
'blockquote::cite': 0,
|
|
'command::icon': 1,
|
|
'del::cite': 0,
|
|
'form::action': 2,
|
|
'iframe::src': 1,
|
|
'img::src': 1,
|
|
'input::src': 1,
|
|
'ins::cite': 0,
|
|
'q::cite': 0,
|
|
'video::poster': 1,
|
|
'video::src': 1
|
|
};
|
|
html4[ 'URIEFFECTS' ] = html4.URIEFFECTS;
|
|
html4.ltypes = {
|
|
'UNSANDBOXED': 2,
|
|
'SANDBOXED': 1,
|
|
'DATA': 0
|
|
};
|
|
html4[ 'ltypes' ] = html4.ltypes;
|
|
html4.LOADERTYPES = {
|
|
'a::href': 2,
|
|
'area::href': 2,
|
|
'audio::src': 2,
|
|
'blockquote::cite': 2,
|
|
'command::icon': 1,
|
|
'del::cite': 2,
|
|
'form::action': 2,
|
|
'iframe::src': 2,
|
|
'img::src': 1,
|
|
'input::src': 1,
|
|
'ins::cite': 2,
|
|
'q::cite': 2,
|
|
'video::poster': 1,
|
|
'video::src': 2
|
|
};
|
|
html4[ 'LOADERTYPES' ] = html4.LOADERTYPES;
|
|
// Attributes that an element must carry, with a non-empty sanitized value,
// for makeTagPolicy() to keep the element; keyed by lower-case tag name.
// NOTE: currently focused only on URI-type attributes
// NOTE(review): the "image" key does not match the "img" tag name used by the
// other tables in this file (e.g. 'img::src'), so this entry may never apply.
// Confirm the intent before renaming; doing so would start dropping <img>
// elements that have no usable src.
html4.REQUIREDATTRIBUTES = {
  "audio": ["src"],
  "form": ["action"],
  "iframe": ["src"],
  "image": ["src"],
  "video": ["src"]
};
// Re-publish under a quoted key so Closure Compiler keeps the name.
html4[ 'REQUIREDATTRIBUTES' ] = html4.REQUIREDATTRIBUTES;
|
|
// export for Closure Compiler: publish html4 on the global window object
// when one exists (there is none outside a browser-like host).
if ('undefined' !== typeof window) {
  window['html4'] = html4;
}
|
|
;
|
|
// Copyright (C) 2006 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
/**
|
|
* @fileoverview
|
|
* An HTML sanitizer that can satisfy a variety of security policies.
|
|
*
|
|
* <p>
|
|
* The HTML sanitizer is built around a SAX parser and HTML element and
|
|
* attributes schemas.
|
|
*
|
|
* If the cssparser is loaded, inline styles are sanitized using the
|
|
 * css property and value schemas.  Else they are removed during
|
|
* sanitization.
|
|
*
|
|
* If it exists, uses parseCssDeclarations, sanitizeCssProperty, cssSchema
|
|
*
|
|
* @author mikesamuel@gmail.com
|
|
* @author jasvir@gmail.com
|
|
* \@requires html4, URI
|
|
* \@overrides window
|
|
* \@provides html, html_sanitize
|
|
*/
|
|
|
|
// The Turkish i seems to be a non-issue, but abort in case it is.
// The parser lower-cases tag and attribute names with toLowerCase(), so it
// refuses to load if 'I' does not map to 'i'. Throw a real Error (not a bare
// string) so the failure carries a stack trace.
if ('I'.toLowerCase() !== 'i') { throw new Error('I/i problem'); }
|
|
|
|
/**
|
|
* \@namespace
|
|
*/
|
|
var html = (function(html4) {
|
|
|
|
// For closure compiler: the optional CSS sanitizer hooks are looked up on
// window by quoted name. When there is no window (or the CSS parser is not
// loaded) each hook is simply undefined.
var cssHookSource = ('undefined' !== typeof window) ? window : {};
var parseCssDeclarations = cssHookSource['parseCssDeclarations'];
var sanitizeCssProperty = cssHookSource['sanitizeCssProperty'];
var cssSchema = cssHookSource['cssSchema'];
|
|
|
|
// The keys of this object must be 'quoted' or JSCompiler will mangle them!
// This is a partial list -- lookupEntity() uses the host browser's parser
// (when available) to implement full entity lookup, and caches what it
// learns back into this table.
// Note that entities are in general case-sensitive; the uppercase ones are
// explicitly defined by HTML5 (presumably as compatibility).
var ENTITIES = {
  'lt': '<',
  'LT': '<',
  'gt': '>',
  'GT': '>',
  'amp': '&',
  'AMP': '&',
  'quot': '"',
  'apos': '\'',
  // U+00A0 NO-BREAK SPACE. Written as a \u escape because the legacy octal
  // form ('\240') is a SyntaxError in strict-mode and module code.
  'nbsp': '\u00A0'
};
|
|
|
|
// Patterns for types of entity/character reference names (the text between
// '&' and ';').
var decimalEscapeRe = /^#(\d+)$/;
// Accepts both '#x' and '#X', matching what ENTITY_RE_1 / ENTITY_RE_2 let
// through; a lower-case-only pattern left '&#X41;' undecoded.
var hexEscapeRe = /^#[xX]([0-9A-Fa-f]+)$/;
// contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
// The second class is A-Z, a-z, 0-9. (It used to read 'A-z', a range that
// also admits the punctuation between 'Z' and 'a': [ \ ] ^ _ `.)
var safeEntityNameRe = /^[A-Za-z][A-Za-z0-9]+$/;
// Used as a hook to invoke the browser's entity parsing. <textarea> is used
// because its content is parsed for entities but not tags.
// TODO(kpreid): This retrieval is a kludge and leads to silent loss of
// functionality if the document isn't available.
var entityLookupElement =
    ('undefined' !== typeof window && window['document'])
    ? window['document'].createElement('textarea') : null;
|
|
/**
 * Converts the numeric value of a character reference to a string.
 * Values up to 0xFFFF map to a single UTF-16 code unit, values up to
 * 0x10FFFF to a surrogate pair, and anything else (negative, NaN, or beyond
 * the Unicode range) to U+FFFD, instead of being truncated to 16 bits.
 *
 * @param {number} codePoint the parsed numeric reference.
 * @return {string} the decoded character(s).
 */
function entityCodePointToString(codePoint) {
  if (!(codePoint >= 0 && codePoint <= 0x10FFFF)) { return '\uFFFD'; }
  if (codePoint <= 0xFFFF) { return String.fromCharCode(codePoint); }
  var offset = codePoint - 0x10000;
  return String.fromCharCode(
      0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
}

/**
 * Decodes an HTML entity.
 *
 * Lookup order: the ENTITIES table, decimal reference, hex reference, then
 * the browser's own parser via entityLookupElement (the result is cached in
 * ENTITIES). A name that cannot be decoded is returned re-wrapped in
 * '&' and ';'.
 *
 * {\@updoc
 * $ lookupEntity('lt')
 * # '<'
 * $ lookupEntity('GT')
 * # '>'
 * $ lookupEntity('amp')
 * # '&'
 * $ lookupEntity('nbsp')
 * # '\xA0'
 * $ lookupEntity('apos')
 * # "'"
 * $ lookupEntity('quot')
 * # '"'
 * $ lookupEntity('#xa')
 * # '\n'
 * $ lookupEntity('#10')
 * # '\n'
 * $ lookupEntity('#x0a')
 * # '\n'
 * $ lookupEntity('#010')
 * # '\n'
 * $ lookupEntity('#x00A')
 * # '\n'
 * $ lookupEntity('#x1F600')
 * # '\uD83D\uDE00'
 * $ lookupEntity('Pi')      // Known failure
 * # '\u03A0'
 * $ lookupEntity('pi')      // Known failure
 * # '\u03C0'
 * }
 *
 * @param {string} name the content between the '&' and the ';'.
 * @return {string} the decoded text, or '&' + name + ';' if unknown.
 */
function lookupEntity(name) {
  // TODO: entity lookup as specified by HTML5 actually depends on the
  // presence of the ";".
  if (ENTITIES.hasOwnProperty(name)) { return ENTITIES[name]; }
  var m = name.match(decimalEscapeRe);
  if (m) {
    return entityCodePointToString(parseInt(m[1], 10));
  }
  m = name.match(hexEscapeRe);
  if (m) {
    return entityCodePointToString(parseInt(m[1], 16));
  }
  if (entityLookupElement && safeEntityNameRe.test(name)) {
    // name is restricted by safeEntityNameRe, so this cannot inject markup.
    entityLookupElement.innerHTML = '&' + name + ';';
    var text = entityLookupElement.textContent;
    ENTITIES[name] = text;
    return text;
  }
  return '&' + name + ';';
}
|
|
|
|
/**
 * String.prototype.replace callback used by unescapeEntities.
 *
 * @param {string} wholeMatch the full '&name;' text (unused).
 * @param {string} name the captured text between '&' and ';'.
 * @return {string} whatever lookupEntity returns for name.
 */
function decodeOneEntity(wholeMatch, name) {
  var decoded = lookupEntity(name);
  return decoded;
}
|
|
|
|
// Matches every NUL (U+0000) character.
var nulRe = /\0/g;

/**
 * Removes all NUL characters from a string.
 *
 * @param {string} s the text to clean.
 * @return {string} s without any U+0000 characters.
 */
function stripNULs(s) {
  var withoutNuls = s.replace(nulRe, '');
  return withoutNuls;
}
|
|
|
|
// A complete entity or character reference anywhere in a string;
// group 1 is the text between '&' and ';'.
var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
// The same reference body anchored at the start of a string, for text that
// directly follows an '&' token.
var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;

/**
 * Returns the plain text of a chunk of HTML CDATA which possibly contains
 * entities, by replacing every '&...;' reference via decodeOneEntity.
 *
 * {\@updoc
 * $ unescapeEntities('')
 * # ''
 * $ unescapeEntities('hello World!')
 * # 'hello World!'
 * $ unescapeEntities('1 &lt; 2 &amp;&amp; 4 &gt; 3&#10;')
 * # '1 < 2 && 4 > 3\n'
 * $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
 * # '<&lt <- unfinished entity>'
 * $ unescapeEntities('/foo?bar=baz&copy=true')  // & often unescaped in URLS
 * # '/foo?bar=baz&copy=true'
 * $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
 * # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
 * }
 *
 * @param {string} s a chunk of HTML CDATA.  It must not start or end inside
 *     an HTML entity.
 * @return {string} s with every recognised reference decoded.
 */
function unescapeEntities(s) {
  var decoded = s.replace(ENTITY_RE_1, decodeOneEntity);
  return decoded;
}
|
|
|
|
// Patterns for the characters that the escaping helpers rewrite.
var ampRe = /&/g;
// An '&' that cannot be the start of an entity or character reference;
// group 1 is the text that followed it.
var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
var ltRe = /[<]/g;
var gtRe = />/g;
var quotRe = /\"/g;

/**
 * Escapes HTML special characters in attribute values.
 *
 * '&', '<', '>' and '"' are replaced by '&amp;', '&lt;', '&gt;' and '&#34;'.
 * '&' is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 *
 * {\@updoc
 * $ escapeAttrib('')
 * # ''
 * $ escapeAttrib('"<<&==&>>"')  // Do not just escape the first occurrence.
 * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
 * $ escapeAttrib('Hello <World>!')
 * # 'Hello &lt;World&gt;!'
 * }
 *
 * @param {*} s the raw attribute value; it is coerced to a string.
 * @return {string} text that is safe inside a double-quoted attribute.
 */
function escapeAttrib(s) {
  return ('' + s).replace(ampRe, '&amp;').replace(ltRe, '&lt;')
      .replace(gtRe, '&gt;').replace(quotRe, '&#34;');
}
|
|
|
|
/**
 * Escape entities in RCDATA that can be escaped without changing the meaning.
 *
 * An '&' that does not start an entity becomes '&amp;' (existing entities
 * are left alone), and every '<' and '>' becomes '&lt;' / '&gt;'.
 *
 * {\@updoc
 * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
 * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
 * }
 *
 * @param {string} rcdata raw RCDATA text (e.g. the body of a textarea).
 * @return {string} the normalized, markup-free text.
 */
function normalizeRCData(rcdata) {
  return rcdata
      .replace(looseAmpRe, '&amp;$1')
      .replace(ltRe, '&lt;')
      .replace(gtRe, '&gt;');
}
|
|
|
|
// TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
|
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
|
|
|
|
// We initially split input so that potentially meaningful characters
|
|
// like '<' and '>' are separate tokens, using a fast dumb process that
|
|
// ignores quoting. Then we walk that token stream, and when we see a
|
|
// '<' that's the start of a tag, we use ATTR_RE to extract tag
|
|
// attributes from the next token. That token will never have a '>'
|
|
// character. However, it might have an unbalanced quote character, and
|
|
// when we see that, we combine additional tokens to balance the quote.
|
|
|
|
// Matches one attribute at the start of a tag's remaining text.
// Capture groups:
//   1    = attribute name
//   2    = '=' if a value is present
//   3    = attribute value, including any quotes
//   4, 5 = opening and closing quote of a double-quoted value
//   6, 7 = opening and closing quote of a single-quoted value
// A closing-quote group matches the empty string at end of input, which is
// how callers detect an unterminated quote.
// The empty alternative guarded by a positive lookahead prevents
// interpretation of <foo a= b=c> as <foo a='b=c'>; the last alternative is an
// unquoted value that is not itself an attribute name.
// TODO(felix8a): maybe use backref to match quotes
// TODO(felix8a): might be able to drop the lookahead case
var ATTR_RE = /^\s*([-.:\w]+)(?:\s*(=)\s*((")[^"]*("|$)|(')[^']*('|$)|(?=[a-z][-\w]*\s*=)|[^"'\s]*))?/i;
|
|
|
|
// false on IE<=8, true on most other browsers
|
|
// Feature test: true when String.prototype.split includes capture-group
// matches in its result ('a,b' -> ['a', ',', 'b']). htmlSplit falls back to
// a manual exec loop when this is false.
var splitWillCapture = ('a,b'.split(/(,)/).length === 3);
|
|
|
|
// bitmask for tags with special parsing, like <script> and <textarea>
|
|
// Union of the CDATA and RCDATA element flags. The parser tests an element's
// html4.ELEMENTS flags against it to decide whether to call parseText.
var EFLAGS_TEXT = html4.eflags['CDATA'] | html4.eflags['RCDATA'];
|
|
|
|
/**
 * Given a SAX-like event handler, produce a function that feeds those
 * events and a parameter to the event handler.
 *
 * The event handler has the form:{@code
 * {
 *   // Name is an upper-case HTML tag name.  Attribs is an array of
 *   // alternating upper-case attribute names, and attribute values.  The
 *   // attribs array is reused by the parser.  Param is the value passed to
 *   // the saxParser.
 *   startTag: function (name, attribs, param) { ... },
 *   endTag:   function (name, param) { ... },
 *   pcdata:   function (text, param) { ... },
 *   rcdata:   function (text, param) { ... },
 *   cdata:    function (text, param) { ... },
 *   startDoc: function (param) { ... },
 *   endDoc:   function (param) { ... }
 * }}
 *
 * A `comment` callback is copied as well. Only these eight callbacks are
 * taken from the handler; any other property on it is ignored.
 *
 * @param {Object} handler a record containing event handlers.
 * @return {function(string, Object)} A function that takes a chunk of HTML
 *     and a parameter.  The parameter is passed on to the handler methods.
 */
function makeSaxParser(handler) {
  // Accept quoted or unquoted keys (Closure compat): each callback is read
  // under both spellings so a compiled (renamed) or plain handler both work.
  var hcopy = {};
  hcopy.cdata = handler.cdata || handler['cdata'];
  hcopy.comment = handler.comment || handler['comment'];
  hcopy.endDoc = handler.endDoc || handler['endDoc'];
  hcopy.endTag = handler.endTag || handler['endTag'];
  hcopy.pcdata = handler.pcdata || handler['pcdata'];
  hcopy.rcdata = handler.rcdata || handler['rcdata'];
  hcopy.startDoc = handler.startDoc || handler['startDoc'];
  hcopy.startTag = handler.startTag || handler['startTag'];

  var saxParse = function (htmlText, param) {
    return parse(htmlText, hcopy, param);
  };
  return saxParse;
}
|
|
|
|
// Parsing strategy is to split input into parts that might be lexically
|
|
// meaningful (every ">" becomes a separate part), and then recombine
|
|
// parts if we discover they're in a different context.
|
|
|
|
// TODO(felix8a): Significant performance regressions from -legacy,
|
|
// tested on
|
|
// Chrome 18.0
|
|
// Firefox 11.0
|
|
// IE 6, 7, 8, 9
|
|
// Opera 11.61
|
|
// Safari 5.1.3
|
|
// Many of these are unusual patterns that are linearly slower and still
|
|
// pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.
|
|
|
|
// TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
|
|
// browsers. The hotspot is htmlSplit.
|
|
|
|
// TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
|
|
// This is partly htmlSplit, but the hotspot is parseTagAndAttrs.
|
|
|
|
// TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
|
|
// "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.
|
|
|
|
// TODO(felix8a): "<p<p<p..." is slower on IE[6-8]
|
|
|
|
// Sentinel handed to every handler callback. A handler may throw it to stop
// the current parse; parseCPS swallows exactly this value and rethrows
// anything else.
var continuationMarker = {};

/**
 * Splits htmlText into lexical parts and runs the parser over them from the
 * first part, with fresh per-parse state.
 *
 * @param {string} htmlText the HTML to parse.
 * @param {Object} handler the normalized SAX handler record.
 * @param {*} param opaque value passed through to every handler callback.
 */
function parse(htmlText, handler, param) {
  var parts = htmlSplit(htmlText);
  // Flags that remember a failed forward scan so pathological input does not
  // repeat the same scan for every '<!--', '<!' or '<?'.
  var state = {
    noMoreGT: false,
    noMoreEndComments: false
  };
  parseCPS(handler, parts, 0, state, param);
}
|
|
|
|
/**
 * Builds a zero-argument function that, when called, runs parseCPS over the
 * same parts starting at index `initial`, with the same handler, state and
 * param.
 *
 * @param {Object} h the SAX handler record.
 * @param {Array.<string>} parts the split input.
 * @param {number} initial index in parts to continue from.
 * @param {Object} state shared parser state.
 * @param {*} param opaque value for the handler callbacks.
 * @return {function()} the continuation.
 */
function continuationMaker(h, parts, initial, state, param) {
  var resume = function () {
    parseCPS(h, parts, initial, state, param);
  };
  return resume;
}
|
|
|
|
/**
 * Core parser loop: walks `parts` from index `initial`, turning the token
 * stream produced by htmlSplit into handler callbacks.
 *
 * Every callback receives, after its own arguments and `param`, the
 * continuationMarker sentinel and a continuation that re-enters this
 * function just past the content the callback was given. A handler that
 * throws continuationMarker ends this call silently; any other exception
 * propagates. startDoc fires only when initial is 0; endDoc fires when the
 * loop runs off the end of parts.
 *
 * Text passed to pcdata is HTML source text, so markup characters that turn
 * out not to be markup are reported as entities ('&lt;', '&gt;', '&amp;').
 *
 * @param {Object} h the SAX handler record (any callback may be missing).
 * @param {Array.<string>} parts the output of htmlSplit.
 * @param {number} initial index in parts at which to start.
 * @param {Object} state shared flags noMoreGT / noMoreEndComments.
 * @param {*} param opaque value passed through to the callbacks.
 */
function parseCPS(h, parts, initial, state, param) {
  // Reports a chunk of text; resumeAt is the index in parts from which a
  // continuation should carry on.
  var emitPcdata = function (text, resumeAt) {
    if (h.pcdata) {
      h.pcdata(text, param, continuationMarker,
          continuationMaker(h, parts, resumeAt, state, param));
    }
  };

  try {
    if (h.startDoc && initial === 0) { h.startDoc(param); }
    var m, p, tagName;
    for (var pos = initial, end = parts.length; pos < end;) {
      var current = parts[pos++];
      var next = parts[pos];
      switch (current) {
      case '&':
        if (ENTITY_RE_2.test(next)) {
          // The part after '&' is reported together with it, so a
          // continuation must start after that part, not on it.
          emitPcdata('&' + next, pos + 1);
          pos++;
        } else {
          emitPcdata('&amp;', pos);
        }
        break;
      case '<\/':
        if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
          if (m[0].length === next.length && parts[pos + 1] === '>') {
            // fast case, no attribute parsing needed
            pos += 2;
            tagName = m[1].toLowerCase();
            if (h.endTag) {
              h.endTag(tagName, param, continuationMarker,
                  continuationMaker(h, parts, pos, state, param));
            }
          } else {
            // slow case, need to parse attributes
            // TODO(felix8a): do we really care about misparsing this?
            pos = parseEndTag(
                parts, pos, h, param, continuationMarker, state);
          }
        } else {
          emitPcdata('&lt;/', pos);
        }
        break;
      case '<':
        if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
          if (m[0].length === next.length && parts[pos + 1] === '>') {
            // fast case, no attribute parsing needed
            pos += 2;
            tagName = m[1].toLowerCase();
            if (h.startTag) {
              h.startTag(tagName, [], param, continuationMarker,
                  continuationMaker(h, parts, pos, state, param));
            }
            // tags like <script> and <textarea> have special parsing
            var eflags = html4.ELEMENTS[tagName];
            if (eflags & EFLAGS_TEXT) {
              var tag = { name: tagName, next: pos, eflags: eflags };
              pos = parseText(
                  parts, tag, h, param, continuationMarker, state);
            }
          } else {
            // slow case, need to parse attributes
            pos = parseStartTag(
                parts, pos, h, param, continuationMarker, state);
          }
        } else {
          emitPcdata('&lt;', pos);
        }
        break;
      case '<\!--':
        // The pathological case is n copies of '<\!--' without '-->', and
        // repeated failure to find '-->' is quadratic.  We avoid that by
        // remembering when search for '-->' fails.
        if (!state.noMoreEndComments) {
          // A comment <\!--x--> is split into three tokens:
          //   '<\!--', 'x--', '>'
          // We want to find the next '>' token that has a preceding '--'.
          // pos is at the 'x--'.
          for (p = pos + 1; p < end; p++) {
            if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
          }
          if (p < end) {
            if (h.comment) {
              var comment = parts.slice(pos, p).join('');
              h.comment(
                  comment.substr(0, comment.length - 2), param,
                  continuationMarker,
                  continuationMaker(h, parts, p + 1, state, param));
            }
            pos = p + 1;
          } else {
            state.noMoreEndComments = true;
          }
        }
        if (state.noMoreEndComments) {
          emitPcdata('&lt;!--', pos);
        }
        break;
      case '<\!':
        if (!/^\w/.test(next)) {
          emitPcdata('&lt;!', pos);
        } else {
          // similar to noMoreEndComment logic: skip to the next '>' and
          // drop the declaration, or treat '<!' as text if there is none.
          if (!state.noMoreGT) {
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>') { break; }
            }
            if (p < end) {
              pos = p + 1;
            } else {
              state.noMoreGT = true;
            }
          }
          if (state.noMoreGT) {
            emitPcdata('&lt;!', pos);
          }
        }
        break;
      case '<?':
        // similar to noMoreEndComment logic
        if (!state.noMoreGT) {
          for (p = pos + 1; p < end; p++) {
            if (parts[p] === '>') { break; }
          }
          if (p < end) {
            pos = p + 1;
          } else {
            state.noMoreGT = true;
          }
        }
        if (state.noMoreGT) {
          emitPcdata('&lt;?', pos);
        }
        break;
      case '>':
        emitPcdata('&gt;', pos);
        break;
      case '':
        break;
      default:
        emitPcdata(current, pos);
        break;
      }
    }
    if (h.endDoc) { h.endDoc(param); }
  } catch (e) {
    // A handler throws continuationMarker to stop the parse on purpose.
    if (e !== continuationMarker) { throw e; }
  }
}
|
|
|
|
/**
 * Split str into parts for the html parser.
 *
 * The result alternates text and delimiter tokens, where a delimiter is one
 * of '</', '<!--', '<!', '<?', '&', '<' or '>'. It always starts and ends
 * with a (possibly empty) text part.
 *
 * @param {*} str the input; it is coerced to a string.
 * @return {Array.<string>} the text/delimiter parts in document order.
 */
function htmlSplit(str) {
  // can't hoist this out of the function because of the re.exec loop.
  var delimiterRe = /(<\/|<\!--|<[!?]|[&<>])/g;
  var text = str + '';
  if (splitWillCapture) {
    return text.split(delimiterRe);
  }
  // Fallback for engines whose split() drops captured delimiters.
  var pieces = [];
  var cursor = 0;
  for (var hit = delimiterRe.exec(text); hit !== null;
       hit = delimiterRe.exec(text)) {
    pieces.push(text.substring(cursor, hit.index));
    pieces.push(hit[0]);
    cursor = hit.index + hit[0].length;
  }
  pieces.push(text.substring(cursor));
  return pieces;
}
|
|
|
|
/**
 * Slow path for an end tag whose text needs attribute-style parsing.
 *
 * @param {Array.<string>} parts the split input.
 * @param {number} pos index of the part holding the tag name.
 * @param {Object} h the SAX handler record.
 * @param {*} param opaque value passed to the handler.
 * @param {Object} continuationMarker sentinel forwarded to the handler.
 * @param {Object} state shared parser state.
 * @return {number} index of the first part after the tag, or parts.length
 *     if the tag is never closed (the rest of the input is dropped).
 */
function parseEndTag(parts, pos, h, param, continuationMarker, state) {
  var tag = parseTagAndAttrs(parts, pos);
  // drop unclosed tags
  if (!tag) { return parts.length; }
  if (h.endTag) {
    // The continuation starts after the whole tag (tag.next), matching
    // parseStartTag; starting at pos would re-read the tag name as text.
    h.endTag(tag.name, param, continuationMarker,
        continuationMaker(h, parts, tag.next, state, param));
  }
  return tag.next;
}
|
|
|
|
/**
 * Slow path for a start tag that carries attributes.
 *
 * Reports the tag through h.startTag, then, for elements flagged CDATA or
 * RCDATA, hands over to parseText so the element body is read as text.
 *
 * @param {Array.<string>} parts the split input.
 * @param {number} pos index of the part holding the tag name.
 * @param {Object} h the SAX handler record.
 * @param {*} param opaque value passed to the handler.
 * @param {Object} continuationMarker sentinel forwarded to the handler.
 * @param {Object} state shared parser state.
 * @return {number} index at which parsing continues; parts.length if the
 *     tag is never closed (the rest of the input is dropped).
 */
function parseStartTag(parts, pos, h, param, continuationMarker, state) {
  var tag = parseTagAndAttrs(parts, pos);
  // drop unclosed tags
  if (!tag) { return parts.length; }

  if (h.startTag) {
    var resume = continuationMaker(h, parts, tag.next, state, param);
    h.startTag(tag.name, tag.attrs, param, continuationMarker, resume);
  }

  // tags like <script> and <textarea> have special parsing
  var isTextElement = tag.eflags & EFLAGS_TEXT;
  return isTextElement
      ? parseText(parts, tag, h, param, continuationMarker, state)
      : tag.next;
}
|
|
|
|
// Cache of per-element regexes that recognise the part following '</' as
// that element's end tag; filled lazily by parseText.
var endTagRe = {};

/**
 * Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
 * which means everything is text until we see the correct closing tag.
 *
 * Collects all parts from tag.next up to (not including) the '</' that
 * opens the matching end tag, or to the end of input if there is none, and
 * reports them through h.cdata, or through h.rcdata after normalizeRCData.
 *
 * @param {Array.<string>} parts the split input.
 * @param {Object} tag record with name, next (first index of the body) and
 *     eflags.
 * @param {Object} h the SAX handler record.
 * @param {*} param opaque value passed to the handler.
 * @param {Object} continuationMarker sentinel forwarded to the handler.
 * @param {Object} state shared parser state.
 * @return {number} index at which parsing continues.
 * @throws {Error} if tag.eflags has neither the CDATA nor the RCDATA flag.
 */
function parseText(parts, tag, h, param, continuationMarker, state) {
  var total = parts.length;
  var elementName = tag.name;
  if (!endTagRe.hasOwnProperty(elementName)) {
    endTagRe[elementName] =
        new RegExp('^' + elementName + '(?:[\\s\\/]|$)', 'i');
  }
  var closer = endTagRe[elementName];

  var bodyStart = tag.next;
  var stop = bodyStart + 1;
  // Advance until parts[stop] is the element name right after a '</'.
  while (stop < total &&
         !(parts[stop - 1] === '<\/' && closer.test(parts[stop]))) {
    stop++;
  }
  // Found: step back onto the '</' so the main loop sees the end tag.
  if (stop < total) { stop -= 1; }

  var body = parts.slice(bodyStart, stop).join('');
  if (tag.eflags & html4.eflags['CDATA']) {
    if (h.cdata) {
      h.cdata(body, param, continuationMarker,
          continuationMaker(h, parts, stop, state, param));
    }
    return stop;
  }
  if (tag.eflags & html4.eflags['RCDATA']) {
    if (h.rcdata) {
      h.rcdata(normalizeRCData(body), param, continuationMarker,
          continuationMaker(h, parts, stop, state, param));
    }
    return stop;
  }
  throw new Error('bug');
}
|
|
|
|
/**
 * Parses the tag whose name starts parts[pos], together with its attributes.
 *
 * at this point, parts[pos-1] is either "<" or "<\/".
 *
 * The tag text is first assumed to run up to the next '>' part. If an
 * attribute value then turns out to have an unterminated quote, that '>'
 * was inside the quote, so further parts are pulled in until a part
 * containing the closing quote has been seen and a later '>' is reached.
 *
 * @param {Array.<string>} parts the split input.
 * @param {number} pos index of the part that begins with the tag name.
 * @return {Object|undefined} a record {name, eflags, attrs, next}, where
 *     attrs alternates lower-cased names and decoded values and next is the
 *     index just past the closing '>'; undefined if no '>' follows.
 */
function parseTagAndAttrs(parts, pos) {
  var head = parts[pos];
  var nameMatch = /^([-\w:]+)/.exec(head);
  var tag = {};
  tag.name = nameMatch[1].toLowerCase();
  tag.eflags = html4.ELEMENTS[tag.name];

  // Find the next '>'.  We optimistically assume this '>' is not in a
  // quoted context, and further down we fix things up if it turns out to
  // be quoted.
  var buf = head.substr(nameMatch[0].length);
  var end = parts.length;
  var p = pos + 1;
  while (p < end && parts[p] !== '>') {
    buf += parts[p];
    p++;
  }
  if (end <= p) { return void 0; }

  var attrs = [];
  while (buf !== '') {
    var m = ATTR_RE.exec(buf);
    if (!m) {
      // No attribute found: skip garbage
      buf = buf.replace(/^[\s\S][^a-z\s]*/, '');
      continue;
    }

    // Groups 4/6 are the opening quotes, 5/7 the closing ones (empty when
    // the value ran to the end of buf without a closing quote).
    var openQuote = '';
    if (m[4] && !m[5]) {
      openQuote = m[4];
    } else if (m[6] && !m[7]) {
      openQuote = m[6];
    }

    if (!openQuote) {
      // We have an attribute
      attrs.push(m[1].toLowerCase(), m[2] ? decodeValue(m[3]) : '');
      buf = buf.substr(m[0].length);
      continue;
    }

    // Unterminated quote: slurp to the next unquoted '>'
    var closed = false;
    var pieces = [buf, parts[p++]];
    for (; p < end; p++) {
      var piece = parts[p];
      if (closed) {
        if (piece === '>') { break; }
      } else if (piece.indexOf(openQuote) >= 0) {
        closed = true;
      }
      pieces.push(piece);
    }
    // Slurp failed: lose the garbage
    if (end <= p) { break; }
    // Otherwise retry attribute parsing
    buf = pieces.join('');
  }

  tag.attrs = attrs;
  tag.next = p + 1;
  return tag;
}
|
|
|
|
/**
 * Turns the raw text of an attribute value into its decoded form: a leading
 * quote character (and the value's last character, normally the matching
 * quote) is removed, NULs are stripped and entities are unescaped.
 *
 * @param {string} v the value as matched by ATTR_RE, possibly quoted.
 * @return {string} the decoded attribute value.
 */
function decodeValue(v) {
  var first = v.charAt(0);
  var inner = (first === '"' || first === '\'') ? v.slice(1, -1) : v;
  return unescapeEntities(stripNULs(inner));
}
|
|
|
|
/**
|
|
* Returns a function that strips unsafe tags and attributes from html.
|
|
* @param {function(string, Array.<string>): ?Array.<string>} tagPolicy
|
|
* A function that takes (tagName, attribs[]), where tagName is a key in
|
|
* html4.ELEMENTS and attribs is an array of alternating attribute names
|
|
* and values. It should return a record (as follows), or null to delete
|
|
* the element. It's okay for tagPolicy to modify the attribs array,
|
|
* but the same array is reused, so it should not be held between calls.
|
|
* Record keys:
|
|
* attribs: (required) Sanitized attributes array.
|
|
* tagName: Replacement tag name.
|
|
* @return {function(string, Array)} A function that sanitizes a string of
|
|
* HTML and appends result strings to the second argument, an array.
|
|
*/
|
|
function makeHtmlSanitizer(tagPolicy) {
|
|
var stack;
|
|
var ignoring;
|
|
var emit = function (text, out) {
|
|
if (!ignoring) { out.push(text); }
|
|
};
|
|
return makeSaxParser({
|
|
'startDoc': function(_) {
|
|
stack = [];
|
|
ignoring = false;
|
|
},
|
|
'startTag': function(tagNameOrig, attribs, out) {
|
|
if (ignoring) { return; }
|
|
if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; }
|
|
var eflagsOrig = html4.ELEMENTS[tagNameOrig];
|
|
if (eflagsOrig & html4.eflags['FOLDABLE']) {
|
|
return;
|
|
}
|
|
|
|
var decision = tagPolicy(tagNameOrig, attribs);
|
|
if (!decision) {
|
|
ignoring = !(eflagsOrig & html4.eflags['EMPTY']);
|
|
return;
|
|
} else if (typeof decision !== 'object') {
|
|
throw new Error('tagPolicy did not return object (old API?)');
|
|
}
|
|
if ('attribs' in decision) {
|
|
attribs = decision['attribs'];
|
|
} else {
|
|
throw new Error('tagPolicy gave no attribs');
|
|
}
|
|
var eflagsRep;
|
|
var tagNameRep;
|
|
if ('tagName' in decision) {
|
|
tagNameRep = decision['tagName'];
|
|
eflagsRep = html4.ELEMENTS[tagNameRep];
|
|
} else {
|
|
tagNameRep = tagNameOrig;
|
|
eflagsRep = eflagsOrig;
|
|
}
|
|
// TODO(mikesamuel): relying on tagPolicy not to insert unsafe
|
|
// attribute names.
|
|
|
|
// If this is an optional-end-tag element and either this element or its
|
|
// previous like sibling was rewritten, then insert a close tag to
|
|
// preserve structure.
|
|
if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
|
|
var onStack = stack[stack.length - 1];
|
|
if (onStack && onStack.orig === tagNameOrig &&
|
|
(onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
|
|
out.push('<\/', onStack.rep, '>');
|
|
}
|
|
}
|
|
|
|
if (!(eflagsOrig & html4.eflags['EMPTY'])) {
|
|
stack.push({orig: tagNameOrig, rep: tagNameRep});
|
|
}
|
|
|
|
out.push('<', tagNameRep);
|
|
for (var i = 0, n = attribs.length; i < n; i += 2) {
|
|
var attribName = attribs[i],
|
|
value = attribs[i + 1];
|
|
if (value !== null && value !== void 0) {
|
|
out.push(' ', attribName, '="', escapeAttrib(value), '"');
|
|
}
|
|
}
|
|
out.push('>');
|
|
|
|
if ((eflagsOrig & html4.eflags['EMPTY'])
|
|
&& !(eflagsRep & html4.eflags['EMPTY'])) {
|
|
// replacement is non-empty, synthesize end tag
|
|
out.push('<\/', tagNameRep, '>');
|
|
}
|
|
},
|
|
'endTag': function(tagName, out) {
|
|
if (ignoring) {
|
|
ignoring = false;
|
|
return;
|
|
}
|
|
if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
|
|
var eflags = html4.ELEMENTS[tagName];
|
|
if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
|
|
var index;
|
|
if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
|
|
for (index = stack.length; --index >= 0;) {
|
|
var stackElOrigTag = stack[index].orig;
|
|
if (stackElOrigTag === tagName) { break; }
|
|
if (!(html4.ELEMENTS[stackElOrigTag] &
|
|
html4.eflags['OPTIONAL_ENDTAG'])) {
|
|
// Don't pop non optional end tags looking for a match.
|
|
return;
|
|
}
|
|
}
|
|
} else {
|
|
for (index = stack.length; --index >= 0;) {
|
|
if (stack[index].orig === tagName) { break; }
|
|
}
|
|
}
|
|
if (index < 0) { return; } // Not opened.
|
|
for (var i = stack.length; --i > index;) {
|
|
var stackElRepTag = stack[i].rep;
|
|
if (!(html4.ELEMENTS[stackElRepTag] &
|
|
html4.eflags['OPTIONAL_ENDTAG'])) {
|
|
out.push('<\/', stackElRepTag, '>');
|
|
}
|
|
}
|
|
if (index < stack.length) {
|
|
tagName = stack[index].rep;
|
|
}
|
|
stack.length = index;
|
|
out.push('<\/', tagName, '>');
|
|
}
|
|
},
|
|
'pcdata': emit,
|
|
'rcdata': emit,
|
|
'cdata': emit,
|
|
'endDoc': function(out) {
|
|
for (; stack.length; stack.length--) {
|
|
out.push('<\/', stack[stack.length - 1].rep, '>');
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
// URI schemes that safeUri lets through; URIs without a scheme are also
// accepted.
var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;

/**
 * Vets a URI and passes it to the caller-supplied rewriter.
 *
 * @param {*} uri the URI text; it is coerced to a string.
 * @param {*} effect forwarded to the rewriter unchanged.
 * @param {*} ltype forwarded to the rewriter unchanged.
 * @param {*} hints forwarded to the rewriter unchanged.
 * @param {?function(Object, *, *, *)} naiveUriRewriter receives the parsed
 *     URI plus effect, ltype and hints; returns the URI to use, or a falsy
 *     value to reject it.
 * @return {?string} the rewritten URI as a string, or null if there is no
 *     rewriter, the URI does not parse, its scheme is not allowed, the
 *     rewriter rejects it, or anything throws along the way.
 */
function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
  if (!naiveUriRewriter) { return null; }
  try {
    var parsed = URI.parse('' + uri);
    if (!parsed) { return null; }
    if (parsed.hasScheme() &&
        !ALLOWED_URI_SCHEMES.test(parsed.getScheme())) {
      return null;
    }
    var safe = naiveUriRewriter(parsed, effect, ltype, hints);
    return safe ? safe.toString() : null;
  } catch (e) {
    // Deliberate: a URI that cannot be vetted is treated as unsafe.
    return null;
  }
}
|
|
|
|
/**
 * Reports a sanitizer decision to a logger callback.
 *
 * With a falsy attribName the whole tag is reported as removed. After that,
 * if oldValue and newValue differ, the attribute is reported as "removed"
 * (truthy to falsy), "added" (falsy to truthy) or "changed".
 *
 * @param {function(string, Object)} logger receives a message and a record
 *     describing the change.
 * @param {string} tagName the element concerned.
 * @param {string=} attribName the attribute concerned, if any.
 * @param {*} oldValue the value before sanitizing.
 * @param {*} newValue the value after sanitizing.
 */
function log(logger, tagName, attribName, oldValue, newValue) {
  if (!attribName) {
    logger(tagName + " removed", {
      change: "removed",
      tagName: tagName
    });
  }
  if (oldValue === newValue) { return; }

  var changed;
  if (oldValue && !newValue) {
    changed = "removed";
  } else if (!oldValue && newValue) {
    changed = "added";
  } else {
    changed = "changed";
  }
  var details = {
    change: changed,
    tagName: tagName,
    attribName: attribName,
    oldValue: oldValue,
    newValue: newValue
  };
  logger(tagName + "." + attribName + " " + changed, details);
}
|
|
|
|
/**
 * Looks an attribute up in a table keyed by 'element::attribute', trying
 * the element-specific key first and the wildcard key '*::attribute' second.
 *
 * @param {Object} map the table to search (own properties only).
 * @param {string} tagName the element name.
 * @param {string} attribName the attribute name.
 * @return {*} the table entry, or undefined if neither key is present.
 */
function lookupAttribute(map, tagName, attribName) {
  var candidates = [tagName + '::' + attribName, '*::' + attribName];
  for (var i = 0; i < candidates.length; i++) {
    var attribKey = candidates[i];
    if (map.hasOwnProperty(attribKey)) {
      return map[attribKey];
    }
  }
  return void 0;
}
|
|
/**
 * @param {string} tagName the element name.
 * @param {string} attribName the attribute name.
 * @return {*} the html4.ATTRIBS entry for the attribute, or undefined.
 */
function getAttributeType(tagName, attribName) {
  var table = html4.ATTRIBS;
  return lookupAttribute(table, tagName, attribName);
}

/**
 * @param {string} tagName the element name.
 * @param {string} attribName the attribute name.
 * @return {*} the html4.LOADERTYPES entry for the attribute, or undefined.
 */
function getLoaderType(tagName, attribName) {
  var table = html4.LOADERTYPES;
  return lookupAttribute(table, tagName, attribName);
}

/**
 * @param {string} tagName the element name.
 * @param {string} attribName the attribute name.
 * @return {*} the html4.URIEFFECTS entry for the attribute, or undefined.
 */
function getUriEffect(tagName, attribName) {
  var table = html4.URIEFFECTS;
  return lookupAttribute(table, tagName, attribName);
}
|
|
|
|
/**
 * Sanitizes attributes on an HTML tag.
 *
 * Each attribute's type is looked up in html4.ATTRIBS (element-specific key
 * first, then '*::name'). Attributes with no entry are removed, and so are
 * script attributes and any type not handled here. The attribs array is
 * modified in place and also returned.
 *
 * @param {string} tagName An HTML tag name in lowercase.
 * @param {Array.<?string>} attribs An array of alternating names and values.
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes; it can return a new string value, or null to
 *     delete the attribute.  If unspecified, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes; it can return a new string value, or null to delete
 *     the attribute.  If unspecified, these attributes are kept unchanged.
 * @param {function(string, Object)=} opt_logger If given, every attribute
 *     other than those of type NONE is reported through log() with its old
 *     and new value.
 * @return {Array.<?string>} The sanitized attributes as a list of alternating
 *     names and values, where a null value means to omit the attribute.
 */
function sanitizeAttribs(tagName, attribs, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  // TODO(felix8a): it's obnoxious that domado duplicates much of this
  // TODO(felix8a): maybe consistently enforce constraints like target=
  for (var i = 0; i < attribs.length; i += 2) {
    var attribName = attribs[i];
    var oldValue = attribs[i + 1];
    var value = oldValue;

    var atype = null;
    var specificKey = tagName + '::' + attribName;
    var wildcardKey = '*::' + attribName;
    if (html4.ATTRIBS.hasOwnProperty(specificKey)) {
      atype = html4.ATTRIBS[specificKey];
    } else if (html4.ATTRIBS.hasOwnProperty(wildcardKey)) {
      atype = html4.ATTRIBS[wildcardKey];
    }

    // Attributes of type NONE pass through untouched and unreported.
    var report = true;
    if (atype === null) {
      value = null;
    } else {
      switch (atype) {
        case html4.atype['NONE']:
          report = false;
          break;
        case html4.atype['SCRIPT']:
          value = null;
          break;
        case html4.atype['STYLE']:
          if ('undefined' === typeof parseCssDeclarations) {
            // No CSS parser loaded: inline styles cannot be vetted.
            value = null;
          } else {
            var sanitizedDeclarations = [];
            parseCssDeclarations(
                value,
                {
                  'declaration': function (property, tokens) {
                    var normProp = property.toLowerCase();
                    sanitizeCssProperty(
                        normProp, tokens,
                        opt_naiveUriRewriter
                            ? function (url) {
                                return safeUri(
                                    url, html4.ueffects.SAME_DOCUMENT,
                                    html4.ltypes.SANDBOXED,
                                    {
                                      "TYPE": "CSS",
                                      "CSS_PROP": normProp
                                    }, opt_naiveUriRewriter);
                              }
                            : null);
                    // Keep the declaration only if tokens remain.
                    if (tokens.length) {
                      sanitizedDeclarations.push(
                          normProp + ': ' + tokens.join(' '));
                    }
                  }
                });
            value = sanitizedDeclarations.length > 0 ?
                sanitizedDeclarations.join(' ; ') : null;
          }
          break;
        case html4.atype['ID']:
        case html4.atype['IDREF']:
        case html4.atype['IDREFS']:
        case html4.atype['GLOBAL_NAME']:
        case html4.atype['LOCAL_NAME']:
        case html4.atype['CLASSES']:
          if (opt_nmTokenPolicy) { value = opt_nmTokenPolicy(value); }
          break;
        case html4.atype['URI']:
          value = safeUri(value,
              getUriEffect(tagName, attribName),
              getLoaderType(tagName, attribName),
              {
                "TYPE": "MARKUP",
                "XML_ATTR": attribName,
                "XML_TAG": tagName
              }, opt_naiveUriRewriter);
          break;
        case html4.atype['URI_FRAGMENT']:
          if (value && '#' === value.charAt(0)) {
            value = value.substring(1);  // remove the leading '#'
            if (opt_nmTokenPolicy) { value = opt_nmTokenPolicy(value); }
            if (value !== null && value !== void 0) {
              value = '#' + value;  // restore the leading '#'
            }
          } else {
            value = null;
          }
          break;
        default:
          value = null;
          break;
      }
    }

    if (report && opt_logger) {
      log(opt_logger, tagName, attribName, oldValue, value);
    }
    attribs[i + 1] = value;
  }
  return attribs;
}
|
|
|
|
/**
 * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
 * and applies the default attribute sanitizer with the supplied policy for
 * URI attributes and NMTOKEN attributes.
 *
 * The policy also rejects (returns undefined for) a tag listed in
 * html4.REQUIREDATTRIBUTES when missRequiredAttributes reports that a
 * required attribute is missing after sanitizing.
 *
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes.  If not given, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes.  If not given, such attributes are left unchanged.
 * @param {function(string, Object)=} opt_logger If given, receives a report
 *     (via log()) for each UNSAFE tag that is dropped, and is passed on to
 *     sanitizeAttribs.
 * @return {function(string, Array.<?string>)} A tagPolicy suitable for
 *     passing to html.sanitize.
 */
function makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  return function (tagName, attribs) {
    if (html4.ELEMENTS[tagName] & html4.eflags['UNSAFE']) {
      if (opt_logger) {
        log(opt_logger, tagName, undefined, undefined, undefined);
      }
      return;
    }

    var sanitizedAttribs = sanitizeAttribs(
        tagName, attribs, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
    var requiredAttributes = html4.REQUIREDATTRIBUTES[tagName];
    if (requiredAttributes &&
        missRequiredAttributes(sanitizedAttribs, requiredAttributes)) {
      return;
    }
    return { 'attribs': sanitizedAttribs };
  };
}
|
|
|
|
/**
 * Reports whether a sanitized attribute list lacks any required attribute.
 * A required attribute counts as present only when at least one of its
 * occurrences carries a non-empty value.
 *
 * Each required name is checked on its own rather than by comparing a total
 * count against requiredAttributes.length, so a repeated attribute can
 * neither stand in for a different missing one nor cause a spurious
 * rejection.
 *
 * @param {Array.<?string>} sanitizedAttributes Flat list of alternating
 *     attribute names and values: [name0, value0, name1, value1, ...].
 * @param {Array.<string>} requiredAttributes Names that must all be present
 *     with a non-empty value.
 * @return {boolean} true when at least one required attribute is absent or
 *     has an empty/null value, false otherwise.
 */
function missRequiredAttributes(sanitizedAttributes, requiredAttributes) {
  for (var r = 0, rLen = requiredAttributes.length; r < rLen; r++) {
    var requiredName = requiredAttributes[r];
    var satisfied = false;
    for (var i = 0, length = sanitizedAttributes.length; i < length; i += 2) {
      var value = sanitizedAttributes[i + 1];
      if (sanitizedAttributes[i] === requiredName && value && value.length > 0) {
        satisfied = true;
        break;
      }
    }
    if (!satisfied) { return true; }
  }
  return false;
}
|
|
|
|
/**
 * Runs the sanitizer built from the given tag policy over a piece of HTML
 * and returns the emitted output as a single string.
 * @param {string} inputHtml The HTML to sanitize.
 * @param {function(string, Array.<?string>)} tagPolicy A function that
 *     decides which tags to accept and sanitizes their attributes (see
 *     makeHtmlSanitizer above for details).
 * @return {string} The sanitized HTML.
 */
function sanitizeWithPolicy(inputHtml, tagPolicy) {
  var pieces = [];
  var runSanitizer = makeHtmlSanitizer(tagPolicy);
  // The sanitizer writes its output fragments into the array it is given.
  runSanitizer(inputHtml, pieces);
  return pieces.join('');
}
|
|
|
|
/**
 * Strips unsafe tags and attributes from HTML by building a tag policy with
 * makeTagPolicy and applying it through sanitizeWithPolicy.
 * @param {string} inputHtml The HTML to sanitize.
 * @param {?function(?string): ?string} opt_naiveUriRewriter A transform to
 *     apply to URI attributes.  If not given, URI attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes.  If not given, such attributes are left unchanged.
 * @param {*} opt_logger Optional.  Passed through unchanged to
 *     makeTagPolicy.
 * @return {string} The sanitized HTML.
 */
function sanitize(inputHtml, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  return sanitizeWithPolicy(
      inputHtml,
      makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger));
}
|
|
|
|
// Export both quoted and unquoted names for Closure linkage.
|
|
var html = {};
|
|
html.escapeAttrib = html['escapeAttrib'] = escapeAttrib;
|
|
html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer;
|
|
html.makeSaxParser = html['makeSaxParser'] = makeSaxParser;
|
|
html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy;
|
|
html.normalizeRCData = html['normalizeRCData'] = normalizeRCData;
|
|
html.sanitize = html['sanitize'] = sanitize;
|
|
html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs;
|
|
html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy;
|
|
html.unescapeEntities = html['unescapeEntities'] = unescapeEntities;
|
|
return html;
|
|
})(html4);
|
|
|
|
// Shorthand alias for the sanitizer entry point.
var html_sanitize = html['sanitize'];

// Exports for Closure compiler.  Note this file is also cajoled
// for domado and run in an environment without 'window', so the
// exports are attached only when a window object exists.
if ('undefined' !== typeof window) {
  window['html_sanitize'] = html_sanitize;
  window['html'] = html;
}
|