You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

194 lines
5.0 KiB
JavaScript

var generate = require('regjsgen').generate;
var parse = require('regjsparser').parse;
var regenerate = require('regenerate');
var iuMappings = require('./data/iu-mappings.json');
var ESCAPE_SETS = require('./data/character-class-escape-sets.js');
function getCharacterClassEscapeSet(character) {
if (unicode) {
if (ignoreCase) {
return ESCAPE_SETS.UNICODE_IGNORE_CASE[character];
}
return ESCAPE_SETS.UNICODE[character];
}
return ESCAPE_SETS.REGULAR[character];
}
var object = {};
var hasOwnProperty = object.hasOwnProperty;
function has(object, property) {
return hasOwnProperty.call(object, property);
}
// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
var UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
var BMP_SET = regenerate().addRange(0x0, 0xFFFF);
// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
var DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
.remove(
// minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
0x000A, // Line Feed <LF>
0x000D, // Carriage Return <CR>
0x2028, // Line Separator <LS>
0x2029 // Paragraph Separator <PS>
);
// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./` (only BMP code points).
var DOT_SET = DOT_SET_UNICODE.clone()
.intersection(BMP_SET);
// Add a range of code points + any case-folded code points in that range to a
// set.
regenerate.prototype.iuAddRange = function(min, max) {
var $this = this;
do {
var folded = caseFold(min);
if (folded) {
$this.add(folded);
}
} while (++min <= max);
return $this;
};
function assign(target, source) {
for (var key in source) {
// Note: `hasOwnProperty` is not needed here.
target[key] = source[key];
}
}
function update(item, pattern) {
// TODO: Test if memoizing `pattern` here is worth the effort.
if (!pattern) {
return;
}
var tree = parse(pattern, '');
switch (tree.type) {
case 'characterClass':
case 'group':
case 'value':
// No wrapping needed.
break;
default:
// Wrap the pattern in a non-capturing group.
tree = wrap(tree, pattern);
}
assign(item, tree);
}
function wrap(tree, pattern) {
// Wrap the pattern in a non-capturing group.
return {
'type': 'group',
'behavior': 'ignore',
'body': [tree],
'raw': '(?:' + pattern + ')'
};
}
function caseFold(codePoint) {
return has(iuMappings, codePoint) ? iuMappings[codePoint] : false;
}
var ignoreCase = false;
var unicode = false;
function processCharacterClass(characterClassItem) {
var set = regenerate();
var body = characterClassItem.body.forEach(function(item) {
switch (item.type) {
case 'value':
set.add(item.codePoint);
if (ignoreCase && unicode) {
var folded = caseFold(item.codePoint);
if (folded) {
set.add(folded);
}
}
break;
case 'characterClassRange':
var min = item.min.codePoint;
var max = item.max.codePoint;
set.addRange(min, max);
if (ignoreCase && unicode) {
set.iuAddRange(min, max);
}
break;
case 'characterClassEscape':
set.add(getCharacterClassEscapeSet(item.value));
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
default:
throw Error('Unknown term type: ' + item.type);
}
});
if (characterClassItem.negative) {
set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
}
update(characterClassItem, set.toString());
return characterClassItem;
}
function processTerm(item) {
switch (item.type) {
case 'dot':
update(
item,
(unicode ? DOT_SET_UNICODE : DOT_SET).toString()
);
break;
case 'characterClass':
item = processCharacterClass(item);
break;
case 'characterClassEscape':
update(
item,
getCharacterClassEscapeSet(item.value).toString()
);
break;
case 'alternative':
case 'disjunction':
case 'group':
case 'quantifier':
item.body = item.body.map(processTerm);
break;
case 'value':
var codePoint = item.codePoint;
var set = regenerate(codePoint);
if (ignoreCase && unicode) {
var folded = caseFold(codePoint);
if (folded) {
set.add(folded);
}
}
update(item, set.toString());
break;
case 'anchor':
case 'empty':
case 'group':
case 'reference':
// Nothing to do here.
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
default:
throw Error('Unknown term type: ' + item.type);
}
return item;
};
module.exports = function(pattern, flags) {
var tree = parse(pattern, flags);
ignoreCase = flags ? flags.indexOf('i') > -1 : false;
unicode = flags ? flags.indexOf('u') > -1 : false;
assign(tree, processTerm(tree));
return generate(tree);
};