b0b70dfed219d3786688fe0b508bd383fba200481bcc7f816963cd8f8ed1c7eb4d0f3fff16364f50f458169a7617e5ce82238d3ddae1436e2347305d2f7287 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. /*---------------------------------------------------------------------------------------------
  2. * Copyright (c) Microsoft Corporation. All rights reserved.
  3. * Licensed under the MIT License. See License.txt in the project root for license information.
  4. *--------------------------------------------------------------------------------------------*/
  5. /*
  6. * This module only exports 'compile' which compiles a JSON language definition
  7. * into a typed and checked ILexer definition.
  8. */
  9. import * as monarchCommon from './monarchCommon.js';
  10. /*
  11. * Type helpers
  12. *
  13. * Note: this is just for sanity checks on the JSON description which is
  14. * helpful for the programmer. No checks are done anymore once the lexer is
  15. * already 'compiled and checked'.
  16. *
  17. */
  18. function isArrayOf(elemType, obj) {
  19. if (!obj) {
  20. return false;
  21. }
  22. if (!(Array.isArray(obj))) {
  23. return false;
  24. }
  25. for (const el of obj) {
  26. if (!(elemType(el))) {
  27. return false;
  28. }
  29. }
  30. return true;
  31. }
  32. function bool(prop, defValue) {
  33. if (typeof prop === 'boolean') {
  34. return prop;
  35. }
  36. return defValue;
  37. }
  38. function string(prop, defValue) {
  39. if (typeof (prop) === 'string') {
  40. return prop;
  41. }
  42. return defValue;
  43. }
  44. function arrayToHash(array) {
  45. const result = {};
  46. for (const e of array) {
  47. result[e] = true;
  48. }
  49. return result;
  50. }
  51. function createKeywordMatcher(arr, caseInsensitive = false) {
  52. if (caseInsensitive) {
  53. arr = arr.map(function (x) { return x.toLowerCase(); });
  54. }
  55. const hash = arrayToHash(arr);
  56. if (caseInsensitive) {
  57. return function (word) {
  58. return hash[word.toLowerCase()] !== undefined && hash.hasOwnProperty(word.toLowerCase());
  59. };
  60. }
  61. else {
  62. return function (word) {
  63. return hash[word] !== undefined && hash.hasOwnProperty(word);
  64. };
  65. }
  66. }
  67. // Lexer helpers
  68. /**
  69. * Compiles a regular expression string, adding the 'i' flag if 'ignoreCase' is set, and the 'u' flag if 'unicode' is set.
  70. * Also replaces @\w+ or sequences with the content of the specified attribute
  71. * @\w+ replacement can be avoided by escaping `@` signs with another `@` sign.
  72. * @example /@attr/ will be replaced with the value of lexer[attr]
  73. * @example /@@text/ will not be replaced and will become /@text/.
  74. */
  75. function compileRegExp(lexer, str) {
  76. // @@ must be interpreted as a literal @, so we replace all occurences of @@ with a placeholder character
  77. str = str.replace(/@@/g, `\x01`);
  78. let n = 0;
  79. let hadExpansion;
  80. do {
  81. hadExpansion = false;
  82. str = str.replace(/@(\w+)/g, function (s, attr) {
  83. hadExpansion = true;
  84. let sub = '';
  85. if (typeof (lexer[attr]) === 'string') {
  86. sub = lexer[attr];
  87. }
  88. else if (lexer[attr] && lexer[attr] instanceof RegExp) {
  89. sub = lexer[attr].source;
  90. }
  91. else {
  92. if (lexer[attr] === undefined) {
  93. throw monarchCommon.createError(lexer, 'language definition does not contain attribute \'' + attr + '\', used at: ' + str);
  94. }
  95. else {
  96. throw monarchCommon.createError(lexer, 'attribute reference \'' + attr + '\' must be a string, used at: ' + str);
  97. }
  98. }
  99. return (monarchCommon.empty(sub) ? '' : '(?:' + sub + ')');
  100. });
  101. n++;
  102. } while (hadExpansion && n < 5);
  103. // handle escaped @@
  104. str = str.replace(/\x01/g, '@');
  105. const flags = (lexer.ignoreCase ? 'i' : '') + (lexer.unicode ? 'u' : '');
  106. return new RegExp(str, flags);
  107. }
  108. /**
  109. * Compiles guard functions for case matches.
  110. * This compiles 'cases' attributes into efficient match functions.
  111. *
  112. */
  113. function selectScrutinee(id, matches, state, num) {
  114. if (num < 0) {
  115. return id;
  116. }
  117. if (num < matches.length) {
  118. return matches[num];
  119. }
  120. if (num >= 100) {
  121. num = num - 100;
  122. const parts = state.split('.');
  123. parts.unshift(state);
  124. if (num < parts.length) {
  125. return parts[num];
  126. }
  127. }
  128. return null;
  129. }
  130. function createGuard(lexer, ruleName, tkey, val) {
  131. // get the scrutinee and pattern
  132. let scrut = -1; // -1: $!, 0-99: $n, 100+n: $Sn
  133. let oppat = tkey;
  134. let matches = tkey.match(/^\$(([sS]?)(\d\d?)|#)(.*)$/);
  135. if (matches) {
  136. if (matches[3]) { // if digits
  137. scrut = parseInt(matches[3]);
  138. if (matches[2]) {
  139. scrut = scrut + 100; // if [sS] present
  140. }
  141. }
  142. oppat = matches[4];
  143. }
  144. // get operator
  145. let op = '~';
  146. let pat = oppat;
  147. if (!oppat || oppat.length === 0) {
  148. op = '!=';
  149. pat = '';
  150. }
  151. else if (/^\w*$/.test(pat)) { // just a word
  152. op = '==';
  153. }
  154. else {
  155. matches = oppat.match(/^(@|!@|~|!~|==|!=)(.*)$/);
  156. if (matches) {
  157. op = matches[1];
  158. pat = matches[2];
  159. }
  160. }
  161. // set the tester function
  162. let tester;
  163. // special case a regexp that matches just words
  164. if ((op === '~' || op === '!~') && /^(\w|\|)*$/.test(pat)) {
  165. const inWords = createKeywordMatcher(pat.split('|'), lexer.ignoreCase);
  166. tester = function (s) { return (op === '~' ? inWords(s) : !inWords(s)); };
  167. }
  168. else if (op === '@' || op === '!@') {
  169. const words = lexer[pat];
  170. if (!words) {
  171. throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' is not defined, in rule: ' + ruleName);
  172. }
  173. if (!(isArrayOf(function (elem) { return (typeof (elem) === 'string'); }, words))) {
  174. throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' must be an array of strings, in rule: ' + ruleName);
  175. }
  176. const inWords = createKeywordMatcher(words, lexer.ignoreCase);
  177. tester = function (s) { return (op === '@' ? inWords(s) : !inWords(s)); };
  178. }
  179. else if (op === '~' || op === '!~') {
  180. if (pat.indexOf('$') < 0) {
  181. // precompile regular expression
  182. const re = compileRegExp(lexer, '^' + pat + '$');
  183. tester = function (s) { return (op === '~' ? re.test(s) : !re.test(s)); };
  184. }
  185. else {
  186. tester = function (s, id, matches, state) {
  187. const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$');
  188. return re.test(s);
  189. };
  190. }
  191. }
  192. else { // if (op==='==' || op==='!=') {
  193. if (pat.indexOf('$') < 0) {
  194. const patx = monarchCommon.fixCase(lexer, pat);
  195. tester = function (s) { return (op === '==' ? s === patx : s !== patx); };
  196. }
  197. else {
  198. const patx = monarchCommon.fixCase(lexer, pat);
  199. tester = function (s, id, matches, state, eos) {
  200. const patexp = monarchCommon.substituteMatches(lexer, patx, id, matches, state);
  201. return (op === '==' ? s === patexp : s !== patexp);
  202. };
  203. }
  204. }
  205. // return the branch object
  206. if (scrut === -1) {
  207. return {
  208. name: tkey, value: val, test: function (id, matches, state, eos) {
  209. return tester(id, id, matches, state, eos);
  210. }
  211. };
  212. }
  213. else {
  214. return {
  215. name: tkey, value: val, test: function (id, matches, state, eos) {
  216. const scrutinee = selectScrutinee(id, matches, state, scrut);
  217. return tester(!scrutinee ? '' : scrutinee, id, matches, state, eos);
  218. }
  219. };
  220. }
  221. }
  222. /**
  223. * Compiles an action: i.e. optimize regular expressions and case matches
  224. * and do many sanity checks.
  225. *
  226. * This is called only during compilation but if the lexer definition
  227. * contains user functions as actions (which is usually not allowed), then this
  228. * may be called during lexing. It is important therefore to compile common cases efficiently
  229. */
  230. function compileAction(lexer, ruleName, action) {
  231. if (!action) {
  232. return { token: '' };
  233. }
  234. else if (typeof (action) === 'string') {
  235. return action; // { token: action };
  236. }
  237. else if (action.token || action.token === '') {
  238. if (typeof (action.token) !== 'string') {
  239. throw monarchCommon.createError(lexer, 'a \'token\' attribute must be of type string, in rule: ' + ruleName);
  240. }
  241. else {
  242. // only copy specific typed fields (only happens once during compile Lexer)
  243. const newAction = { token: action.token };
  244. if (action.token.indexOf('$') >= 0) {
  245. newAction.tokenSubst = true;
  246. }
  247. if (typeof (action.bracket) === 'string') {
  248. if (action.bracket === '@open') {
  249. newAction.bracket = 1 /* monarchCommon.MonarchBracket.Open */;
  250. }
  251. else if (action.bracket === '@close') {
  252. newAction.bracket = -1 /* monarchCommon.MonarchBracket.Close */;
  253. }
  254. else {
  255. throw monarchCommon.createError(lexer, 'a \'bracket\' attribute must be either \'@open\' or \'@close\', in rule: ' + ruleName);
  256. }
  257. }
  258. if (action.next) {
  259. if (typeof (action.next) !== 'string') {
  260. throw monarchCommon.createError(lexer, 'the next state must be a string value in rule: ' + ruleName);
  261. }
  262. else {
  263. let next = action.next;
  264. if (!/^(@pop|@push|@popall)$/.test(next)) {
  265. if (next[0] === '@') {
  266. next = next.substr(1); // peel off starting @ sign
  267. }
  268. if (next.indexOf('$') < 0) { // no dollar substitution, we can check if the state exists
  269. if (!monarchCommon.stateExists(lexer, monarchCommon.substituteMatches(lexer, next, '', [], ''))) {
  270. throw monarchCommon.createError(lexer, 'the next state \'' + action.next + '\' is not defined in rule: ' + ruleName);
  271. }
  272. }
  273. }
  274. newAction.next = next;
  275. }
  276. }
  277. if (typeof (action.goBack) === 'number') {
  278. newAction.goBack = action.goBack;
  279. }
  280. if (typeof (action.switchTo) === 'string') {
  281. newAction.switchTo = action.switchTo;
  282. }
  283. if (typeof (action.log) === 'string') {
  284. newAction.log = action.log;
  285. }
  286. if (typeof (action.nextEmbedded) === 'string') {
  287. newAction.nextEmbedded = action.nextEmbedded;
  288. lexer.usesEmbedded = true;
  289. }
  290. return newAction;
  291. }
  292. }
  293. else if (Array.isArray(action)) {
  294. const results = [];
  295. for (let i = 0, len = action.length; i < len; i++) {
  296. results[i] = compileAction(lexer, ruleName, action[i]);
  297. }
  298. return { group: results };
  299. }
  300. else if (action.cases) {
  301. // build an array of test cases
  302. const cases = [];
  303. // for each case, push a test function and result value
  304. for (const tkey in action.cases) {
  305. if (action.cases.hasOwnProperty(tkey)) {
  306. const val = compileAction(lexer, ruleName, action.cases[tkey]);
  307. // what kind of case
  308. if (tkey === '@default' || tkey === '@' || tkey === '') {
  309. cases.push({ test: undefined, value: val, name: tkey });
  310. }
  311. else if (tkey === '@eos') {
  312. cases.push({ test: function (id, matches, state, eos) { return eos; }, value: val, name: tkey });
  313. }
  314. else {
  315. cases.push(createGuard(lexer, ruleName, tkey, val)); // call separate function to avoid local variable capture
  316. }
  317. }
  318. }
  319. // create a matching function
  320. const def = lexer.defaultToken;
  321. return {
  322. test: function (id, matches, state, eos) {
  323. for (const _case of cases) {
  324. const didmatch = (!_case.test || _case.test(id, matches, state, eos));
  325. if (didmatch) {
  326. return _case.value;
  327. }
  328. }
  329. return def;
  330. }
  331. };
  332. }
  333. else {
  334. throw monarchCommon.createError(lexer, 'an action must be a string, an object with a \'token\' or \'cases\' attribute, or an array of actions; in rule: ' + ruleName);
  335. }
  336. }
  337. /**
  338. * Helper class for creating matching rules
  339. */
  340. class Rule {
  341. constructor(name) {
  342. this.regex = new RegExp('');
  343. this.action = { token: '' };
  344. this.matchOnlyAtLineStart = false;
  345. this.name = '';
  346. this.name = name;
  347. }
  348. setRegex(lexer, re) {
  349. let sregex;
  350. if (typeof (re) === 'string') {
  351. sregex = re;
  352. }
  353. else if (re instanceof RegExp) {
  354. sregex = re.source;
  355. }
  356. else {
  357. throw monarchCommon.createError(lexer, 'rules must start with a match string or regular expression: ' + this.name);
  358. }
  359. this.matchOnlyAtLineStart = (sregex.length > 0 && sregex[0] === '^');
  360. this.name = this.name + ': ' + sregex;
  361. this.regex = compileRegExp(lexer, '^(?:' + (this.matchOnlyAtLineStart ? sregex.substr(1) : sregex) + ')');
  362. }
  363. setAction(lexer, act) {
  364. this.action = compileAction(lexer, this.name, act);
  365. }
  366. }
  367. /**
  368. * Compiles a json description function into json where all regular expressions,
  369. * case matches etc, are compiled and all include rules are expanded.
  370. * We also compile the bracket definitions, supply defaults, and do many sanity checks.
  371. * If the 'jsonStrict' parameter is 'false', we allow at certain locations
  372. * regular expression objects and functions that get called during lexing.
  373. * (Currently we have no samples that need this so perhaps we should always have
  374. * jsonStrict to true).
  375. */
  376. export function compile(languageId, json) {
  377. if (!json || typeof (json) !== 'object') {
  378. throw new Error('Monarch: expecting a language definition object');
  379. }
  380. // Create our lexer
  381. const lexer = {};
  382. lexer.languageId = languageId;
  383. lexer.includeLF = bool(json.includeLF, false);
  384. lexer.noThrow = false; // raise exceptions during compilation
  385. lexer.maxStack = 100;
  386. // Set standard fields: be defensive about types
  387. lexer.start = (typeof json.start === 'string' ? json.start : null);
  388. lexer.ignoreCase = bool(json.ignoreCase, false);
  389. lexer.unicode = bool(json.unicode, false);
  390. lexer.tokenPostfix = string(json.tokenPostfix, '.' + lexer.languageId);
  391. lexer.defaultToken = string(json.defaultToken, 'source');
  392. lexer.usesEmbedded = false; // becomes true if we find a nextEmbedded action
  393. // For calling compileAction later on
  394. const lexerMin = json;
  395. lexerMin.languageId = languageId;
  396. lexerMin.includeLF = lexer.includeLF;
  397. lexerMin.ignoreCase = lexer.ignoreCase;
  398. lexerMin.unicode = lexer.unicode;
  399. lexerMin.noThrow = lexer.noThrow;
  400. lexerMin.usesEmbedded = lexer.usesEmbedded;
  401. lexerMin.stateNames = json.tokenizer;
  402. lexerMin.defaultToken = lexer.defaultToken;
  403. // Compile an array of rules into newrules where RegExp objects are created.
  404. function addRules(state, newrules, rules) {
  405. for (const rule of rules) {
  406. let include = rule.include;
  407. if (include) {
  408. if (typeof (include) !== 'string') {
  409. throw monarchCommon.createError(lexer, 'an \'include\' attribute must be a string at: ' + state);
  410. }
  411. if (include[0] === '@') {
  412. include = include.substr(1); // peel off starting @
  413. }
  414. if (!json.tokenizer[include]) {
  415. throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is not defined at: ' + state);
  416. }
  417. addRules(state + '.' + include, newrules, json.tokenizer[include]);
  418. }
  419. else {
  420. const newrule = new Rule(state);
  421. // Set up new rule attributes
  422. if (Array.isArray(rule) && rule.length >= 1 && rule.length <= 3) {
  423. newrule.setRegex(lexerMin, rule[0]);
  424. if (rule.length >= 3) {
  425. if (typeof (rule[1]) === 'string') {
  426. newrule.setAction(lexerMin, { token: rule[1], next: rule[2] });
  427. }
  428. else if (typeof (rule[1]) === 'object') {
  429. const rule1 = rule[1];
  430. rule1.next = rule[2];
  431. newrule.setAction(lexerMin, rule1);
  432. }
  433. else {
  434. throw monarchCommon.createError(lexer, 'a next state as the last element of a rule can only be given if the action is either an object or a string, at: ' + state);
  435. }
  436. }
  437. else {
  438. newrule.setAction(lexerMin, rule[1]);
  439. }
  440. }
  441. else {
  442. if (!rule.regex) {
  443. throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
  444. }
  445. if (rule.name) {
  446. if (typeof rule.name === 'string') {
  447. newrule.name = rule.name;
  448. }
  449. }
  450. if (rule.matchOnlyAtStart) {
  451. newrule.matchOnlyAtLineStart = bool(rule.matchOnlyAtLineStart, false);
  452. }
  453. newrule.setRegex(lexerMin, rule.regex);
  454. newrule.setAction(lexerMin, rule.action);
  455. }
  456. newrules.push(newrule);
  457. }
  458. }
  459. }
  460. // compile the tokenizer rules
  461. if (!json.tokenizer || typeof (json.tokenizer) !== 'object') {
  462. throw monarchCommon.createError(lexer, 'a language definition must define the \'tokenizer\' attribute as an object');
  463. }
  464. lexer.tokenizer = [];
  465. for (const key in json.tokenizer) {
  466. if (json.tokenizer.hasOwnProperty(key)) {
  467. if (!lexer.start) {
  468. lexer.start = key;
  469. }
  470. const rules = json.tokenizer[key];
  471. lexer.tokenizer[key] = new Array();
  472. addRules('tokenizer.' + key, lexer.tokenizer[key], rules);
  473. }
  474. }
  475. lexer.usesEmbedded = lexerMin.usesEmbedded; // can be set during compileAction
  476. // Set simple brackets
  477. if (json.brackets) {
  478. if (!(Array.isArray(json.brackets))) {
  479. throw monarchCommon.createError(lexer, 'the \'brackets\' attribute must be defined as an array');
  480. }
  481. }
  482. else {
  483. json.brackets = [
  484. { open: '{', close: '}', token: 'delimiter.curly' },
  485. { open: '[', close: ']', token: 'delimiter.square' },
  486. { open: '(', close: ')', token: 'delimiter.parenthesis' },
  487. { open: '<', close: '>', token: 'delimiter.angle' }
  488. ];
  489. }
  490. const brackets = [];
  491. for (const el of json.brackets) {
  492. let desc = el;
  493. if (desc && Array.isArray(desc) && desc.length === 3) {
  494. desc = { token: desc[2], open: desc[0], close: desc[1] };
  495. }
  496. if (desc.open === desc.close) {
  497. throw monarchCommon.createError(lexer, 'open and close brackets in a \'brackets\' attribute must be different: ' + desc.open +
  498. '\n hint: use the \'bracket\' attribute if matching on equal brackets is required.');
  499. }
  500. if (typeof desc.open === 'string' && typeof desc.token === 'string' && typeof desc.close === 'string') {
  501. brackets.push({
  502. token: desc.token + lexer.tokenPostfix,
  503. open: monarchCommon.fixCase(lexer, desc.open),
  504. close: monarchCommon.fixCase(lexer, desc.close)
  505. });
  506. }
  507. else {
  508. throw monarchCommon.createError(lexer, 'every element in the \'brackets\' array must be a \'{open,close,token}\' object or array');
  509. }
  510. }
  511. lexer.brackets = brackets;
  512. // Disable throw so the syntax highlighter goes, no matter what
  513. lexer.noThrow = true;
  514. return lexer;
  515. }