parse-html.js 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.parseHtml = void 0;
  4. var tslib_1 = require("tslib");
  5. var regex_lib_1 = require("../regex-lib");
  6. var utils_1 = require("../utils");
  7. // For debugging: search for other "For debugging" lines
  8. // import CliTable from 'cli-table';
  /**
   * Parses an HTML string, calling the callbacks to notify of tags and text.
   *
   * ## History
   *
   * This file previously used a regular expression to find html tags in the
   * input text. Unfortunately, we ran into a bunch of catastrophic backtracking
   * issues with certain input text, causing Autolinker to either hang or just
   * take a really long time to parse the string.
   *
   * The current code is intended to be a O(n) algorithm that walks through
   * the string in one pass, and tries to be as cheap as possible. We don't need
   * to implement the full HTML spec, but rather simply determine where the
   * string looks like an HTML tag, and where it looks like text (so that we can
   * autolink that).
   *
   * This state machine parser is intended just to be a simple but performant
   * parser of HTML for the subset of requirements we have. We simply need to:
   *
   * 1. Determine where HTML tags are
   * 2. Determine the tag name (Autolinker specifically only cares about <a>,
   *    <script>, and <style> tags, so as not to link any text within them)
   *
   * We don't need to:
   *
   * 1. Create a parse tree
   * 2. Auto-close tags with invalid markup
   * 3. etc.
   *
   * The other intention behind this is that we didn't want to add external
   * dependencies on the Autolinker utility which would increase its size. For
   * instance, adding htmlparser2 adds 125kb to the minified output file,
   * increasing its final size from 47kb to 172kb (at the time of writing). It
   * also doesn't work exactly correctly, treating the string "<3 blah blah blah"
   * as an HTML tag.
   *
   * Reference for HTML spec:
   *
   * https://www.w3.org/TR/html51/syntax.html#sec-tokenization
   *
   * @param {String} html The HTML to parse
   * @param {Object} callbacks
   * @param {Function} callbacks.onOpenTag Callback function to call when an open
   *   tag is parsed. Called with the lower-cased tag name as its first argument,
   *   and the offset (number) of the tag's '<' character as its second.
   * @param {Function} callbacks.onCloseTag Callback function to call when a
   *   close tag is parsed. Called with the lower-cased tag name as its first
   *   argument, and the offset (number) of the tag's '<' character as its
   *   second. If a self-closing tag is found, `onCloseTag` is called immediately
   *   after `onOpenTag`.
   * @param {Function} callbacks.onText Callback function to call when text (i.e.
   *   not an HTML tag) is parsed. Called with the text (string) as its first
   *   argument, and offset (number) into the string as its second.
   * @param {Function} callbacks.onComment Callback function to call when an HTML
   *   comment is parsed. Called with the offset (number) of the comment's '<'
   *   character.
   * @param {Function} callbacks.onDoctype Callback function to call when a
   *   `<!DOCTYPE ...>` tag is parsed. Called with the offset (number) of the
   *   doctype's '<' character.
   */
  function parseHtml(html, _a) {
      var onOpenTag = _a.onOpenTag, onCloseTag = _a.onCloseTag, onText = _a.onText, onComment = _a.onComment, onDoctype = _a.onDoctype;
      var noCurrentTag = new CurrentTag();
      var charIdx = 0, len = html.length, state = 0 /* Data */, currentDataIdx = 0, // where the current data start index is
      currentTag = noCurrentTag; // describes the current tag that is being read
      // For debugging: search for other "For debugging" lines
      // const table = new CliTable( {
      // head: [ 'charIdx', 'char', 'state', 'currentDataIdx', 'currentOpenTagIdx', 'tag.type' ]
      // } );
      while (charIdx < len) {
          var char = html.charAt(charIdx);
          // For debugging: search for other "For debugging" lines
          // ALSO: Temporarily remove the 'const' keyword on the State enum
          // table.push(
          // [ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
          // );
          switch (state) {
              case 0 /* Data */:
                  stateData(char);
                  break;
              case 1 /* TagOpen */:
                  stateTagOpen(char);
                  break;
              case 2 /* EndTagOpen */:
                  stateEndTagOpen(char);
                  break;
              case 3 /* TagName */:
                  stateTagName(char);
                  break;
              case 4 /* BeforeAttributeName */:
                  stateBeforeAttributeName(char);
                  break;
              case 5 /* AttributeName */:
                  stateAttributeName(char);
                  break;
              case 6 /* AfterAttributeName */:
                  stateAfterAttributeName(char);
                  break;
              case 7 /* BeforeAttributeValue */:
                  stateBeforeAttributeValue(char);
                  break;
              case 8 /* AttributeValueDoubleQuoted */:
                  stateAttributeValueDoubleQuoted(char);
                  break;
              case 9 /* AttributeValueSingleQuoted */:
                  stateAttributeValueSingleQuoted(char);
                  break;
              case 10 /* AttributeValueUnquoted */:
                  stateAttributeValueUnquoted(char);
                  break;
              case 11 /* AfterAttributeValueQuoted */:
                  stateAfterAttributeValueQuoted(char);
                  break;
              case 12 /* SelfClosingStartTag */:
                  stateSelfClosingStartTag(char);
                  break;
              case 13 /* MarkupDeclarationOpenState */:
                  stateMarkupDeclarationOpen(char);
                  break;
              case 14 /* CommentStart */:
                  stateCommentStart(char);
                  break;
              case 15 /* CommentStartDash */:
                  stateCommentStartDash(char);
                  break;
              case 16 /* Comment */:
                  stateComment(char);
                  break;
              case 17 /* CommentEndDash */:
                  stateCommentEndDash(char);
                  break;
              case 18 /* CommentEnd */:
                  stateCommentEnd(char);
                  break;
              case 19 /* CommentEndBang */:
                  stateCommentEndBang(char);
                  break;
              case 20 /* Doctype */:
                  stateDoctype(char);
                  break;
              default:
                  (0, utils_1.assertNever)(state);
          }
          // For debugging: search for other "For debugging" lines
          // ALSO: Temporarily remove the 'const' keyword on the State enum
          // table.push(
          // [ charIdx, char, State[ state ], currentDataIdx, currentTag.idx, currentTag.idx === -1 ? '' : currentTag.type ]
          // );
          charIdx++;
      }
      // Whatever is left after the last emitted tag (including an unfinished
      // tag at the end of the input) is reported as text.
      if (currentDataIdx < charIdx) {
          emitText();
      }
      // For debugging: search for other "For debugging" lines
      // console.log( '\n' + table.toString() );
      // Called when non-tags are being read (i.e. the text around HTML tags)
      // https://www.w3.org/TR/html51/syntax.html#data-state
      function stateData(char) {
          if (char === '<') {
              startNewTag();
          }
      }
      // Called after a '<' is read from the Data state
      // https://www.w3.org/TR/html51/syntax.html#tag-open-state
      function stateTagOpen(char) {
          if (char === '!') {
              state = 13 /* MarkupDeclarationOpenState */;
          }
          else if (char === '/') {
              state = 2 /* EndTagOpen */;
              updateCurrentTag({ isClosing: true });
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else if (regex_lib_1.letterRe.test(char)) {
              // tag name start (and no '/' read)
              state = 3 /* TagName */;
              updateCurrentTag({ isOpening: true });
          }
          else {
              // Any other
              resetToDataState();
          }
      }
      // After a '<x', '</x' sequence is read (where 'x' is a letter character),
      // this is to continue reading the tag name
      // https://www.w3.org/TR/html51/syntax.html#tag-name-state
      function stateTagName(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              updateCurrentTag({ name: captureTagName() });
              state = 4 /* BeforeAttributeName */;
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else if (char === '/') {
              updateCurrentTag({ name: captureTagName() });
              state = 12 /* SelfClosingStartTag */;
          }
          else if (char === '>') {
              updateCurrentTag({ name: captureTagName() });
              emitTagAndPreviousTextNode(); // resets to Data state as well
          }
          else if (!regex_lib_1.letterRe.test(char) && !regex_lib_1.digitRe.test(char) && char !== ':') {
              // Anything else that does not form an html tag. Note: the colon
              // character is accepted for XML namespaced tags
              resetToDataState();
          }
          else {
              // continue reading tag name
          }
      }
      // Called after the '/' is read from a '</' sequence
      // https://www.w3.org/TR/html51/syntax.html#end-tag-open-state
      function stateEndTagOpen(char) {
          if (char === '>') {
              // parse error. Encountered "</>". Skip it without treating as a tag
              resetToDataState();
          }
          else if (regex_lib_1.letterRe.test(char)) {
              state = 3 /* TagName */;
          }
          else {
              // some other non-tag-like character, don't treat this as a tag
              resetToDataState();
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#before-attribute-name-state
      function stateBeforeAttributeName(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              // stay in BeforeAttributeName state - continue reading chars
          }
          else if (char === '/') {
              state = 12 /* SelfClosingStartTag */;
          }
          else if (char === '>') {
              emitTagAndPreviousTextNode(); // resets to Data state as well
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else if (char === "=" || regex_lib_1.quoteRe.test(char) || regex_lib_1.controlCharsRe.test(char)) {
              // "Parse error" characters that, according to the spec, should be
              // appended to the attribute name, but we'll treat these characters
              // as not forming a real HTML tag
              resetToDataState();
          }
          else {
              // Any other char, start of a new attribute name
              state = 5 /* AttributeName */;
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#attribute-name-state
      function stateAttributeName(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              state = 6 /* AfterAttributeName */;
          }
          else if (char === '/') {
              state = 12 /* SelfClosingStartTag */;
          }
          else if (char === '=') {
              state = 7 /* BeforeAttributeValue */;
          }
          else if (char === '>') {
              emitTagAndPreviousTextNode(); // resets to Data state as well
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else if (regex_lib_1.quoteRe.test(char)) {
              // "Parse error" characters that, according to the spec, should be
              // appended to the attribute name, but we'll treat these characters
              // as not forming a real HTML tag
              resetToDataState();
          }
          else {
              // anything else: continue reading attribute name
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#after-attribute-name-state
      function stateAfterAttributeName(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              // ignore the character - continue reading
          }
          else if (char === '/') {
              state = 12 /* SelfClosingStartTag */;
          }
          else if (char === '=') {
              state = 7 /* BeforeAttributeValue */;
          }
          else if (char === '>') {
              emitTagAndPreviousTextNode();
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else if (regex_lib_1.quoteRe.test(char)) {
              // "Parse error" characters that, according to the spec, should be
              // appended to the attribute name, but we'll treat these characters
              // as not forming a real HTML tag
              resetToDataState();
          }
          else {
              // Any other character, start a new attribute in the current tag
              state = 5 /* AttributeName */;
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#before-attribute-value-state
      function stateBeforeAttributeValue(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              // ignore the character - continue reading
          }
          else if (char === "\"") {
              state = 8 /* AttributeValueDoubleQuoted */;
          }
          else if (char === "'") {
              state = 9 /* AttributeValueSingleQuoted */;
          }
          else if (/[>=`]/.test(char)) {
              // Invalid chars after an '=' for an attribute value, don't count
              // the current tag as an HTML tag
              resetToDataState();
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else {
              // Any other character, consider it an unquoted attribute value
              state = 10 /* AttributeValueUnquoted */;
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#attribute-value-double-quoted-state
      function stateAttributeValueDoubleQuoted(char) {
          if (char === "\"") {
              // end the current double-quoted attribute
              state = 11 /* AfterAttributeValueQuoted */;
          }
          else {
              // consume the character as part of the double-quoted attribute value
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#attribute-value-single-quoted-state
      function stateAttributeValueSingleQuoted(char) {
          if (char === "'") {
              // end the current single-quoted attribute
              state = 11 /* AfterAttributeValueQuoted */;
          }
          else {
              // consume the character as part of the single-quoted attribute value
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
      function stateAttributeValueUnquoted(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              state = 4 /* BeforeAttributeName */;
          }
          else if (char === '>') {
              emitTagAndPreviousTextNode();
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else {
              // Any other character, treat it as part of the attribute value
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#after-attribute-value-quoted-state
      function stateAfterAttributeValueQuoted(char) {
          if (regex_lib_1.whitespaceRe.test(char)) {
              state = 4 /* BeforeAttributeName */;
          }
          else if (char === '/') {
              state = 12 /* SelfClosingStartTag */;
          }
          else if (char === '>') {
              emitTagAndPreviousTextNode();
          }
          else if (char === '<') {
              // start of another tag (ignore the previous, incomplete one)
              startNewTag();
          }
          else {
              // Any other character, "parse error". Spec says to switch to the
              // BeforeAttributeState and re-consume the character, as it may be
              // the start of a new attribute name
              state = 4 /* BeforeAttributeName */;
              reconsumeCurrentCharacter();
          }
      }
      // A '/' has just been read in the current tag (presumably for '/>'), and
      // this handles the next character
      // https://www.w3.org/TR/html51/syntax.html#self-closing-start-tag-state
      function stateSelfClosingStartTag(char) {
          if (char === '>') {
              updateCurrentTag({ isClosing: true });
              emitTagAndPreviousTextNode(); // resets to Data state as well
          }
          else {
              state = 4 /* BeforeAttributeName */;
          }
      }
      // https://www.w3.org/TR/html51/syntax.html#markup-declaration-open-state
      // (HTML Comments or !DOCTYPE)
      //
      // Runs with charIdx pointing at the first character after '<!'. The main
      // loop always performs one more `charIdx++` after this handler returns,
      // so this handler only advances to the LAST character of the matched
      // sequence. Advancing past the whole sequence would silently skip the
      // character that follows it (e.g. the '>' of '<!DOCTYPE>').
      function stateMarkupDeclarationOpen(char) {
          if (html.slice(charIdx, charIdx + 2) === '--') {
              // html comment
              charIdx += 1; // "consume" the second '-'; the loop moves past it
              updateCurrentTag({ type: 'comment' });
              state = 14 /* CommentStart */;
          }
          else if (html.slice(charIdx, charIdx + 7).toUpperCase() === 'DOCTYPE') {
              charIdx += 6; // "consume" up to the final 'E'; the loop moves past it
              updateCurrentTag({ type: 'doctype' });
              state = 20 /* Doctype */;
          }
          else {
              // At this point, the spec specifies that the state machine should
              // enter the "bogus comment" state, in which case any character(s)
              // after the '<!' that were read should become an HTML comment up
              // until the first '>' that is read (or EOF). Instead, we'll assume
              // that a user just typed '<!' as part of text data
              resetToDataState();
          }
      }
      // Handles after the sequence '<!--' has been read
      // https://www.w3.org/TR/html51/syntax.html#comment-start-state
      function stateCommentStart(char) {
          if (char === '-') {
              // We've read the sequence '<!---' at this point (3 dashes)
              state = 15 /* CommentStartDash */;
          }
          else if (char === '>') {
              // At this point, we'll assume the comment wasn't a real comment
              // so we'll just emit it as data. We basically read the sequence
              // '<!-->'
              resetToDataState();
          }
          else {
              // Any other char, take it as part of the comment
              state = 16 /* Comment */;
          }
      }
      // We've read the sequence '<!---' at this point (3 dashes)
      // https://www.w3.org/TR/html51/syntax.html#comment-start-dash-state
      function stateCommentStartDash(char) {
          if (char === '-') {
              // We've read '<!----' (4 dashes) at this point
              state = 18 /* CommentEnd */;
          }
          else if (char === '>') {
              // At this point, we'll assume the comment wasn't a real comment
              // so we'll just emit it as data. We basically read the sequence
              // '<!--->'
              resetToDataState();
          }
          else {
              // Anything else, take it as a valid comment
              state = 16 /* Comment */;
          }
      }
      // Currently reading the comment's text (data)
      // https://www.w3.org/TR/html51/syntax.html#comment-state
      function stateComment(char) {
          if (char === '-') {
              state = 17 /* CommentEndDash */;
          }
          else {
              // Any other character, stay in the Comment state
          }
      }
      // When we've read the first dash inside a comment, it may signal the
      // end of the comment if we read another dash
      // https://www.w3.org/TR/html51/syntax.html#comment-end-dash-state
      function stateCommentEndDash(char) {
          if (char === '-') {
              state = 18 /* CommentEnd */;
          }
          else {
              // Wasn't a dash, must still be part of the comment
              state = 16 /* Comment */;
          }
      }
      // After we've read two dashes inside a comment, it may signal the end of
      // the comment if we then read a '>' char
      // https://www.w3.org/TR/html51/syntax.html#comment-end-state
      function stateCommentEnd(char) {
          if (char === '>') {
              emitTagAndPreviousTextNode();
          }
          else if (char === '!') {
              state = 19 /* CommentEndBang */;
          }
          else if (char === '-') {
              // A 3rd '-' has been read: stay in the CommentEnd state
          }
          else {
              // Anything else, switch back to the comment state since we didn't
              // read the full "end comment" sequence (i.e. '-->')
              state = 16 /* Comment */;
          }
      }
      // We've read the sequence '--!' inside of a comment
      // https://www.w3.org/TR/html51/syntax.html#comment-end-bang-state
      function stateCommentEndBang(char) {
          if (char === '-') {
              // We read the sequence '--!-' inside of a comment. The last dash
              // could signify that the comment is going to close
              state = 17 /* CommentEndDash */;
          }
          else if (char === '>') {
              // End of comment with the sequence '--!>'
              emitTagAndPreviousTextNode();
          }
          else {
              // The '--!' was not followed by a '>', continue reading the
              // comment's text
              state = 16 /* Comment */;
          }
      }
      /**
       * For DOCTYPES in particular, we don't care about the attributes. Just
       * advance to the '>' character and emit the tag, unless we find a '<'
       * character in which case we'll start a new tag.
       *
       * Example doctype tag:
       * <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
       *
       * Actual spec: https://www.w3.org/TR/html51/syntax.html#doctype-state
       */
      function stateDoctype(char) {
          if (char === '>') {
              emitTagAndPreviousTextNode();
          }
          else if (char === '<') {
              startNewTag();
          }
          else {
              // stay in the Doctype state
          }
      }
      /**
       * Resets the state back to the Data state, and removes the current tag.
       *
       * We'll generally run this function whenever a "parse error" is
       * encountered, where the current tag that is being read no longer looks
       * like a real HTML tag.
       */
      function resetToDataState() {
          state = 0 /* Data */;
          currentTag = noCurrentTag;
      }
      /**
       * Starts a new HTML tag at the current index, ignoring any previous HTML
       * tag that was being read.
       *
       * We'll generally run this function whenever we read a new '<' character,
       * including when we read a '<' character inside of an HTML tag that we were
       * previously reading.
       */
      function startNewTag() {
          state = 1 /* TagOpen */;
          currentTag = new CurrentTag({ idx: charIdx });
      }
      /**
       * Replaces `currentTag` with a fresh CurrentTag that copies the existing
       * tag's properties and overrides them with `changes`. A new object is
       * always created, so the shared `noCurrentTag` instance is never mutated.
       */
      function updateCurrentTag(changes) {
          currentTag = new CurrentTag((0, tslib_1.__assign)((0, tslib_1.__assign)({}, currentTag), changes));
      }
      /**
       * Once we've decided to emit an open tag, that means we can also emit the
       * text node before it.
       */
      function emitTagAndPreviousTextNode() {
          var textBeforeTag = html.slice(currentDataIdx, currentTag.idx);
          if (textBeforeTag) {
              // Only emit a text node when there is text. There is none when
              // the html tag was the first element in the html string, or when
              // two tags are next to each other
              onText(textBeforeTag, currentDataIdx);
          }
          if (currentTag.type === 'comment') {
              onComment(currentTag.idx);
          }
          else if (currentTag.type === 'doctype') {
              onDoctype(currentTag.idx);
          }
          else {
              if (currentTag.isOpening) {
                  onOpenTag(currentTag.name, currentTag.idx);
              }
              if (currentTag.isClosing) {
                  // note: self-closing tags will emit both opening and closing
                  onCloseTag(currentTag.name, currentTag.idx);
              }
          }
          // Since we just emitted a tag, reset to the data state for the next char
          resetToDataState();
          currentDataIdx = charIdx + 1;
      }
      /**
       * Emits the text from the current data start index up to (but not
       * including) the current character index.
       */
      function emitText() {
          var text = html.slice(currentDataIdx, charIdx);
          onText(text, currentDataIdx);
          currentDataIdx = charIdx + 1;
      }
      /**
       * Captures the tag name from the start of the tag to the current character
       * index, and converts it to lower case
       */
      function captureTagName() {
          // Skip '<' for an opening tag, or '</' for a closing tag
          var startIdx = currentTag.idx + (currentTag.isClosing ? 2 : 1);
          return html.slice(startIdx, charIdx).toLowerCase();
      }
      /**
       * Causes the main loop to re-consume the current character, such as after
       * encountering a "parse error" that changed state and needs to reconsume
       * the same character in that new state.
       */
      function reconsumeCurrentCharacter() {
          charIdx--;
      }
  }
  625. exports.parseHtml = parseHtml;
  /**
   * Immutable-style record describing the tag the parser is currently reading.
   *
   * Fields (all optional in `cfg`):
   * - `idx`: index of the tag's '<' character; -1 when not provided.
   * - `type`: kind of tag; falls back to 'tag' when missing or falsy.
   * - `name`: tag name; falls back to '' when missing or falsy.
   * - `isOpening` / `isClosing`: coerced to booleans.
   */
  var CurrentTag = /** @class */ (function () {
      function CurrentTag(cfg) {
          var settings = cfg === undefined ? {} : cfg;
          var hasIdx = settings.idx !== undefined;
          this.idx = hasIdx ? settings.idx : -1;
          this.type = settings.type ? settings.type : 'tag';
          this.name = settings.name ? settings.name : '';
          this.isOpening = Boolean(settings.isOpening);
          this.isClosing = Boolean(settings.isClosing);
      }
      return CurrentTag;
  }());
  637. //# sourceMappingURL=parse-html.js.map