| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515 |
- /*---------------------------------------------------------------------------------------------
- * Copyright (c) Microsoft Corporation. All rights reserved.
- * Licensed under the MIT License. See License.txt in the project root for license information.
- *--------------------------------------------------------------------------------------------*/
- /*
- * This module only exports 'compile' which compiles a JSON language definition
- * into a typed and checked ILexer definition.
- */
- import * as monarchCommon from './monarchCommon.js';
- /*
- * Type helpers
- *
- * Note: this is just for sanity checks on the JSON description which is
- * helpful for the programmer. No checks are done anymore once the lexer is
- * already 'compiled and checked'.
- *
- */
/**
 * Tells whether `obj` is an array in which every element is accepted by `elemType`.
 *
 * @param {function(*): boolean} elemType predicate that is called with each element
 * @param {*} obj the value to inspect
 * @returns {boolean} `true` for an array (possibly empty) whose elements all pass, `false` otherwise
 */
function isArrayOf(elemType, obj) {
    // falsy values and non-arrays are rejected up front
    if (!Array.isArray(obj)) {
        return false;
    }
    for (let index = 0; index < obj.length; index++) {
        if (!elemType(obj[index])) {
            return false;
        }
    }
    return true;
}
/**
 * Returns `prop` when it is a boolean, otherwise the fallback `defValue`.
 *
 * @param {*} prop the value to check
 * @param {*} defValue the value returned when `prop` is not a boolean
 * @returns {*} `prop` or `defValue`
 */
function bool(prop, defValue) {
    return typeof prop === 'boolean' ? prop : defValue;
}
/**
 * Returns `prop` when it is a string (the empty string included), otherwise the fallback `defValue`.
 *
 * @param {*} prop the value to check
 * @param {*} defValue the value returned when `prop` is not a string
 * @returns {*} `prop` or `defValue`
 */
function string(prop, defValue) {
    return typeof prop === 'string' ? prop : defValue;
}
/**
 * Turns a list of keys into a plain object that maps every key to `true`.
 *
 * @param {Iterable<string>} array the keys
 * @returns {Object<string, boolean>} an object with one `true` entry per distinct key
 */
function arrayToHash(array) {
    return [...array].reduce(function (hash, key) {
        hash[key] = true;
        return hash;
    }, {});
}
/**
 * Creates a membership test for a fixed list of words.
 *
 * The words are kept in a Set rather than in a plain object used as a hash, so that
 * words which are also members of Object.prototype ('hasOwnProperty', 'constructor',
 * '__proto__', ...) are handled like any other word.
 *
 * @param {string[]} arr the words to recognize
 * @param {boolean} [caseInsensitive=false] when true, both the words and the tested
 *        string are lower-cased before they are compared
 * @returns {function(string): boolean} a function that tells whether its argument is one of the words
 */
function createKeywordMatcher(arr, caseInsensitive = false) {
    if (caseInsensitive) {
        const lowerCased = new Set(arr.map((word) => word.toLowerCase()));
        return (word) => lowerCased.has(word.toLowerCase());
    }
    const words = new Set(arr);
    return (word) => words.has(word);
}
- // Lexer helpers
/**
 * Turns a regular expression string of a language definition into a RegExp object.
 * The 'i' flag is added if 'ignoreCase' is set on the lexer, the 'u' flag if 'unicode' is set.
 * Every @\w+ sequence is replaced with the content of the lexer attribute of that name
 * (a string, or the source of a RegExp), wrapped in a non-capturing group.
 * The replacement is repeated while something was expanded, for at most 5 rounds,
 * so attributes may refer to other attributes.
 * The replacement can be avoided by escaping the `@` sign with another `@` sign.
 * @example /@attr/ will be replaced with the value of lexer[attr]
 * @example /@@text/ will not be replaced and will become /@text/.
 */
function compileRegExp(lexer, str) {
    // @@ must be interpreted as a literal @, so all occurrences of @@ are parked behind a placeholder character
    let pattern = str.replace(/@@/g, `\x01`);
    for (let round = 0; round < 5; round++) {
        let expanded = false;
        pattern = pattern.replace(/@(\w+)/g, (reference, attr) => {
            expanded = true;
            const value = lexer[attr];
            let sub;
            if (typeof value === 'string') {
                sub = value;
            }
            else if (value && value instanceof RegExp) {
                sub = value.source;
            }
            else if (value === undefined) {
                throw monarchCommon.createError(lexer, 'language definition does not contain attribute \'' + attr + '\', used at: ' + pattern);
            }
            else {
                throw monarchCommon.createError(lexer, 'attribute reference \'' + attr + '\' must be a string, used at: ' + pattern);
            }
            return monarchCommon.empty(sub) ? '' : '(?:' + sub + ')';
        });
        if (!expanded) {
            // nothing left to expand
            break;
        }
    }
    // bring the escaped @@ back as a single @
    pattern = pattern.replace(/\x01/g, '@');
    let flags = '';
    if (lexer.ignoreCase) {
        flags += 'i';
    }
    if (lexer.unicode) {
        flags += 'u';
    }
    return new RegExp(pattern, flags);
}
/**
 * Guard compilation for case matches.
 * The helpers below compile 'cases' attributes into efficient match functions.
 */
/**
 * Picks the string that a guard is tested against.
 *
 * @param {string} id returned when `num` is negative
 * @param {string[]} matches returned element-wise when `num` is a valid index into it
 * @param {string} state dot-separated state name; for `num >= 100` the value at index
 *        `num - 100` of `[state, ...state.split('.')]` is returned
 * @param {number} num the selector
 * @returns {?string} the selected string, or `null` when `num` selects nothing
 */
function selectScrutinee(id, matches, state, num) {
    if (num < 0) {
        return id;
    }
    if (num < matches.length) {
        return matches[num];
    }
    if (!(num >= 100)) {
        return null;
    }
    // index 0 is the complete state name, the following indices are its dot-separated parts
    const candidates = [state, ...state.split('.')];
    const index = num - 100;
    return index < candidates.length ? candidates[index] : null;
}
/**
 * Compiles one key of a 'cases' attribute into a guard object `{ name, value, test }`.
 *
 * The key has the form `[scrutinee][operator]pattern`:
 *  - scrutinee: `$#` or absent (the `id` argument of the test), `$n` (element n of the
 *    `matches` argument) or `$Sn` (part n of the `state` argument, see selectScrutinee).
 *  - operator: `~` / `!~` (regular expression match), `@` / `!@` (membership in the string
 *    array `lexer[pattern]`), `==` / `!=` (string equality). A pattern that is a single
 *    word is compared with `==`, an empty pattern tests for a non-empty scrutinee, and
 *    anything else without an operator is taken as a `~` pattern.
 *
 * Patterns that contain a `$` are expanded with monarchCommon.substituteMatches on every
 * test; all other patterns are compiled once, here.
 *
 * @param {object} lexer the language definition the guard belongs to
 * @param {string} ruleName name of the enclosing rule, used in error messages
 * @param {string} tkey the case key
 * @param {*} val the (already compiled) value of the case
 * @returns {{name: string, value: *, test: function(string, string[], string, boolean): boolean}}
 * @throws when an `@` / `!@` pattern does not name an array of strings in the lexer
 */
function createGuard(lexer, ruleName, tkey, val) {
    // get the scrutinee and pattern
    let scrut = -1; // -1: $#, 0-99: $n, 100+n: $Sn
    let oppat = tkey;
    let matches = tkey.match(/^\$(([sS]?)(\d\d?)|#)(.*)$/);
    if (matches) {
        if (matches[3]) { // if digits
            scrut = parseInt(matches[3], 10);
            if (matches[2]) {
                scrut = scrut + 100; // if [sS] present
            }
        }
        oppat = matches[4];
    }
    // get operator
    let op = '~';
    let pat = oppat;
    if (!oppat || oppat.length === 0) {
        op = '!=';
        pat = '';
    }
    else if (/^\w*$/.test(pat)) { // just a word
        op = '==';
    }
    else {
        matches = oppat.match(/^(@|!@|~|!~|==|!=)(.*)$/);
        if (matches) {
            op = matches[1];
            pat = matches[2];
        }
    }
    // set the tester function
    let tester;
    // special case a regexp that matches just words
    if ((op === '~' || op === '!~') && /^(\w|\|)*$/.test(pat)) {
        const inWords = createKeywordMatcher(pat.split('|'), lexer.ignoreCase);
        tester = function (s) { return (op === '~' ? inWords(s) : !inWords(s)); };
    }
    else if (op === '@' || op === '!@') {
        const words = lexer[pat];
        if (!words) {
            throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' is not defined, in rule: ' + ruleName);
        }
        if (!(isArrayOf(function (elem) { return (typeof (elem) === 'string'); }, words))) {
            throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' must be an array of strings, in rule: ' + ruleName);
        }
        const inWords = createKeywordMatcher(words, lexer.ignoreCase);
        tester = function (s) { return (op === '@' ? inWords(s) : !inWords(s)); };
    }
    else if (op === '~' || op === '!~') {
        if (!pat.includes('$')) {
            // precompile regular expression
            const re = compileRegExp(lexer, '^' + pat + '$');
            tester = function (s) { return (op === '~' ? re.test(s) : !re.test(s)); };
        }
        else {
            // the pattern depends on the current match: compile it on every test
            tester = function (s, id, matches, state) {
                const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$');
                const matched = re.test(s);
                // '!~' must be negated here just like in the precompiled case
                return (op === '~' ? matched : !matched);
            };
        }
    }
    else { // if (op==='==' || op==='!=') {
        const patx = monarchCommon.fixCase(lexer, pat);
        if (!pat.includes('$')) {
            tester = function (s) { return (op === '==' ? s === patx : s !== patx); };
        }
        else {
            tester = function (s, id, matches, state) {
                const patexp = monarchCommon.substituteMatches(lexer, patx, id, matches, state);
                return (op === '==' ? s === patexp : s !== patexp);
            };
        }
    }
    // return the branch object
    if (scrut === -1) {
        return {
            name: tkey, value: val, test: function (id, matches, state, eos) {
                return tester(id, id, matches, state, eos);
            }
        };
    }
    else {
        return {
            name: tkey, value: val, test: function (id, matches, state, eos) {
                const scrutinee = selectScrutinee(id, matches, state, scrut);
                // a scrutinee that does not exist is tested as the empty string
                return tester(!scrutinee ? '' : scrutinee, id, matches, state, eos);
            }
        };
    }
}
/**
 * Compiles an action: i.e. optimize regular expressions and case matches
 * and do many sanity checks.
 *
 * This is called only during compilation but if the lexer definition
 * contains user functions as actions (which is usually not allowed), then this
 * may be called during lexing. It is important therefore to compile common cases efficiently
 *
 * Accepted actions and their compiled form:
 *  - a falsy value: `{ token: '' }`
 *  - a string: returned as is
 *  - an object with a string 'token': a fresh object with the checked fields 'token',
 *    'tokenSubst' (set when the token contains a `$`), 'bracket', 'next', 'goBack',
 *    'switchTo', 'log' and 'nextEmbedded'; a 'nextEmbedded' field also sets `lexer.usesEmbedded`
 *  - an array of actions: `{ group: [...] }` with every element compiled
 *  - an object with 'cases': `{ test }`, where `test(id, matches, state, eos)` returns the
 *    value of the first case that applies, or `lexer.defaultToken` if none does
 *
 * @param {object} lexer the language definition
 * @param {string} ruleName name of the rule the action belongs to, used in error messages
 * @param {*} action the action to compile
 * @returns {string|object} the compiled action
 * @throws when the action, or one of its 'token', 'bracket' or 'next' fields, is malformed
 */
function compileAction(lexer, ruleName, action) {
    if (!action) {
        return { token: '' };
    }
    if (typeof action === 'string') {
        return action; // { token: action };
    }
    if (action.token || action.token === '') {
        if (typeof action.token !== 'string') {
            throw monarchCommon.createError(lexer, 'a \'token\' attribute must be of type string, in rule: ' + ruleName);
        }
        // only copy specific typed fields (only happens once during compile Lexer)
        const newAction = { token: action.token };
        if (action.token.includes('$')) {
            newAction.tokenSubst = true;
        }
        if (typeof action.bracket === 'string') {
            if (action.bracket === '@open') {
                newAction.bracket = 1 /* monarchCommon.MonarchBracket.Open */;
            }
            else if (action.bracket === '@close') {
                newAction.bracket = -1 /* monarchCommon.MonarchBracket.Close */;
            }
            else {
                throw monarchCommon.createError(lexer, 'a \'bracket\' attribute must be either \'@open\' or \'@close\', in rule: ' + ruleName);
            }
        }
        if (action.next) {
            if (typeof action.next !== 'string') {
                throw monarchCommon.createError(lexer, 'the next state must be a string value in rule: ' + ruleName);
            }
            let next = action.next;
            if (!/^(@pop|@push|@popall)$/.test(next)) {
                if (next[0] === '@') {
                    next = next.slice(1); // peel off starting @ sign
                }
                if (!next.includes('$')) { // no dollar substitution, we can check if the state exists
                    if (!monarchCommon.stateExists(lexer, monarchCommon.substituteMatches(lexer, next, '', [], ''))) {
                        throw monarchCommon.createError(lexer, 'the next state \'' + action.next + '\' is not defined in rule: ' + ruleName);
                    }
                }
            }
            newAction.next = next;
        }
        if (typeof action.goBack === 'number') {
            newAction.goBack = action.goBack;
        }
        if (typeof action.switchTo === 'string') {
            newAction.switchTo = action.switchTo;
        }
        if (typeof action.log === 'string') {
            newAction.log = action.log;
        }
        if (typeof action.nextEmbedded === 'string') {
            newAction.nextEmbedded = action.nextEmbedded;
            lexer.usesEmbedded = true;
        }
        return newAction;
    }
    if (Array.isArray(action)) {
        return { group: action.map((element) => compileAction(lexer, ruleName, element)) };
    }
    if (action.cases) {
        // build an array of test cases: for each case, a test function and result value
        const cases = [];
        // Object.keys lists exactly the own enumerable keys and, unlike calling
        // action.cases.hasOwnProperty, keeps working when a case is named 'hasOwnProperty'
        // or when the cases object has no prototype
        for (const tkey of Object.keys(action.cases)) {
            const val = compileAction(lexer, ruleName, action.cases[tkey]);
            // what kind of case
            if (tkey === '@default' || tkey === '@' || tkey === '') {
                cases.push({ test: undefined, value: val, name: tkey });
            }
            else if (tkey === '@eos') {
                cases.push({ test: function (id, matches, state, eos) { return eos; }, value: val, name: tkey });
            }
            else {
                cases.push(createGuard(lexer, ruleName, tkey, val)); // call separate function to avoid local variable capture
            }
        }
        // create a matching function
        const def = lexer.defaultToken;
        return {
            test: function (id, matches, state, eos) {
                for (const _case of cases) {
                    // a case without a test (the default case) always applies
                    const didmatch = (!_case.test || _case.test(id, matches, state, eos));
                    if (didmatch) {
                        return _case.value;
                    }
                }
                return def;
            }
        };
    }
    throw monarchCommon.createError(lexer, 'an action must be a string, an object with a \'token\' or \'cases\' attribute, or an array of actions; in rule: ' + ruleName);
}
/**
 * Helper class for creating matching rules.
 * A rule starts out with an empty regular expression and an empty-token action;
 * setRegex and setAction install the compiled versions.
 */
class Rule {
    /**
     * @param {string} name the initial name of the rule
     */
    constructor(name) {
        this.regex = new RegExp('');
        this.action = { token: '' };
        this.matchOnlyAtLineStart = false;
        this.name = name;
    }
    /**
     * Compiles `re` (a string, or a RegExp of which only the source is used) with
     * compileRegExp and stores the result. A leading '^' is taken off the pattern and
     * remembered in `matchOnlyAtLineStart`; the pattern source is appended to the rule name.
     */
    setRegex(lexer, re) {
        const givenAsString = typeof re === 'string';
        if (!givenAsString && !(re instanceof RegExp)) {
            throw monarchCommon.createError(lexer, 'rules must start with a match string or regular expression: ' + this.name);
        }
        const source = givenAsString ? re : re.source;
        const anchored = source.startsWith('^');
        this.matchOnlyAtLineStart = anchored;
        this.name = this.name + ': ' + source;
        const body = anchored ? source.slice(1) : source;
        this.regex = compileRegExp(lexer, '^(?:' + body + ')');
    }
    /**
     * Compiles `act` with compileAction, using the current rule name, and stores the result.
     */
    setAction(lexer, act) {
        this.action = compileAction(lexer, this.name, act);
    }
}
/**
 * Compiles a JSON language definition into a lexer object in which all regular expressions
 * and case matches are compiled and all 'include' rules are expanded.
 * The bracket definitions are compiled as well, defaults are supplied, and many sanity
 * checks are done.
 *
 * Note that `json` itself is modified: the normalized settings (languageId, includeLF,
 * ignoreCase, unicode, noThrow, usesEmbedded, stateNames, defaultToken) are written onto it
 * because it is the object handed to the rule compiler, a rule of the form
 * [regex, actionObject, next] gets 'next' stored on its action object, and a missing
 * 'brackets' attribute is filled in with the default brackets.
 *
 * @param {string} languageId the language id; it also forms the default token postfix
 * @param {object} json the language definition; must have a 'tokenizer' object
 * @returns {object} the compiled lexer, with `noThrow` switched on
 * @throws when `json` is not an object or when the definition fails a sanity check
 */
export function compile(languageId, json) {
    if (!json || typeof json !== 'object') {
        throw new Error('Monarch: expecting a language definition object');
    }
    // Create our lexer
    const lexer = {};
    lexer.languageId = languageId;
    lexer.includeLF = bool(json.includeLF, false);
    lexer.noThrow = false; // raise exceptions during compilation
    lexer.maxStack = 100;
    // Set standard fields: be defensive about types
    lexer.start = (typeof json.start === 'string' ? json.start : null);
    lexer.ignoreCase = bool(json.ignoreCase, false);
    lexer.unicode = bool(json.unicode, false);
    lexer.tokenPostfix = string(json.tokenPostfix, '.' + lexer.languageId);
    lexer.defaultToken = string(json.defaultToken, 'source');
    lexer.usesEmbedded = false; // becomes true if we find a nextEmbedded action
    // For calling compileAction later on
    const lexerMin = json;
    lexerMin.languageId = languageId;
    lexerMin.includeLF = lexer.includeLF;
    lexerMin.ignoreCase = lexer.ignoreCase;
    lexerMin.unicode = lexer.unicode;
    lexerMin.noThrow = lexer.noThrow;
    lexerMin.usesEmbedded = lexer.usesEmbedded;
    lexerMin.stateNames = json.tokenizer;
    lexerMin.defaultToken = lexer.defaultToken;
    // Compile an array of rules into newrules where RegExp objects are created.
    function addRules(state, newrules, rules) {
        for (const rule of rules) {
            let include = rule.include;
            if (include) {
                if (typeof include !== 'string') {
                    throw monarchCommon.createError(lexer, 'an \'include\' attribute must be a string at: ' + state);
                }
                if (include[0] === '@') {
                    include = include.slice(1); // peel off starting @
                }
                if (!json.tokenizer[include]) {
                    throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is not defined at: ' + state);
                }
                // the rules of the included state are added in place
                addRules(state + '.' + include, newrules, json.tokenizer[include]);
                continue;
            }
            const newrule = new Rule(state);
            // Set up new rule attributes
            if (Array.isArray(rule) && rule.length >= 1 && rule.length <= 3) {
                // short form: [regex, action] or [regex, action, next]
                newrule.setRegex(lexerMin, rule[0]);
                if (rule.length >= 3) {
                    if (typeof rule[1] === 'string') {
                        newrule.setAction(lexerMin, { token: rule[1], next: rule[2] });
                    }
                    else if (typeof rule[1] === 'object') {
                        const rule1 = rule[1];
                        rule1.next = rule[2];
                        newrule.setAction(lexerMin, rule1);
                    }
                    else {
                        throw monarchCommon.createError(lexer, 'a next state as the last element of a rule can only be given if the action is either an object or a string, at: ' + state);
                    }
                }
                else {
                    newrule.setAction(lexerMin, rule[1]);
                }
            }
            else {
                // long form: { regex, action, name?, matchOnlyAtLineStart? }
                if (!rule.regex) {
                    throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
                }
                if (rule.name) {
                    if (typeof rule.name === 'string') {
                        newrule.name = rule.name;
                    }
                }
                newrule.setRegex(lexerMin, rule.regex);
                // setRegex derives matchOnlyAtLineStart from a leading '^' in the regex, so an
                // explicit request on the rule has to be applied afterwards to take effect
                if (bool(rule.matchOnlyAtLineStart, false)) {
                    newrule.matchOnlyAtLineStart = true;
                }
                newrule.setAction(lexerMin, rule.action);
            }
            newrules.push(newrule);
        }
    }
    // compile the tokenizer rules
    if (!json.tokenizer || typeof json.tokenizer !== 'object') {
        throw monarchCommon.createError(lexer, 'a language definition must define the \'tokenizer\' attribute as an object');
    }
    // an array object that is indexed by state name
    lexer.tokenizer = [];
    // Object.keys (rather than for-in plus json.tokenizer.hasOwnProperty) also copes with
    // a state that is named 'hasOwnProperty'
    for (const key of Object.keys(json.tokenizer)) {
        if (!lexer.start) {
            // without an explicit start state, the first state is the start state
            lexer.start = key;
        }
        const rules = json.tokenizer[key];
        lexer.tokenizer[key] = [];
        addRules('tokenizer.' + key, lexer.tokenizer[key], rules);
    }
    lexer.usesEmbedded = lexerMin.usesEmbedded; // can be set during compileAction
    // Set simple brackets
    if (json.brackets) {
        if (!(Array.isArray(json.brackets))) {
            throw monarchCommon.createError(lexer, 'the \'brackets\' attribute must be defined as an array');
        }
    }
    else {
        json.brackets = [
            { open: '{', close: '}', token: 'delimiter.curly' },
            { open: '[', close: ']', token: 'delimiter.square' },
            { open: '(', close: ')', token: 'delimiter.parenthesis' },
            { open: '<', close: '>', token: 'delimiter.angle' }
        ];
    }
    const brackets = [];
    for (const el of json.brackets) {
        let desc = el;
        if (Array.isArray(desc) && desc.length === 3) {
            // array form: [open, close, token]
            desc = { token: desc[2], open: desc[0], close: desc[1] };
        }
        // check the shape first, so that a malformed element is reported as such and not
        // as a TypeError or as a pair of equal (undefined) brackets
        if (!desc || typeof desc.open !== 'string' || typeof desc.token !== 'string' || typeof desc.close !== 'string') {
            throw monarchCommon.createError(lexer, 'every element in the \'brackets\' array must be a \'{open,close,token}\' object or array');
        }
        if (desc.open === desc.close) {
            throw monarchCommon.createError(lexer, 'open and close brackets in a \'brackets\' attribute must be different: ' + desc.open +
                '\n hint: use the \'bracket\' attribute if matching on equal brackets is required.');
        }
        brackets.push({
            token: desc.token + lexer.tokenPostfix,
            open: monarchCommon.fixCase(lexer, desc.open),
            close: monarchCommon.fixCase(lexer, desc.close)
        });
    }
    lexer.brackets = brackets;
    // Disable throw so the syntax highlighter goes, no matter what
    lexer.noThrow = true;
    return lexer;
}
|