/**
 * Simple HTML Parser
 *
 * Exposes two scanners:
 *   - parseTag:  walks an HTML string, hands every tag to a callback and
 *                escapes everything that is not a tag.
 *   - parseAttr: walks the attribute portion of a tag and hands every
 *                attribute (name, value) to a callback.
 *
 * @author Zongmin Lei<leizongmin@gmail.com>
 */

var _ = require("./util");

/**
 * Get the tag name of a single tag.
 *
 * The name is lower-cased, and one leading and one trailing "/" are removed,
 * so opening, closing and self-closing forms all yield the bare name.
 *
 * @param {String} html e.g. '<a href="#">'
 * @return {String} e.g. 'a'
 */
function getTagName(html) {
  // NOTE(review): `_.spaceIndex` is defined in ./util; it is treated here as
  // returning the index of the first whitespace character, or -1 if none.
  var i = _.spaceIndex(html);
  var tagName;
  if (i === -1) {
    // No whitespace: the name is everything between "<" and ">".
    tagName = html.slice(1, -1);
  } else {
    tagName = html.slice(1, i + 1);
  }
  tagName = _.trim(tagName).toLowerCase();
  if (tagName.slice(0, 1) === "/") tagName = tagName.slice(1);
  if (tagName.slice(-1) === "/") tagName = tagName.slice(0, -1);
  return tagName;
}

/**
 * Is this a closing tag?
 *
 * @param {String} html e.g. '</a>'
 * @return {Boolean} true when the text starts with "</"
 */
function isClosing(html) {
  return html.slice(0, 2) === "</";
}

/**
 * Parse input html and return the processed html.
 *
 * Text outside of tags is passed through `escapeHtml`; every complete tag
 * (`<` ... `>`) is replaced by whatever `onTag` returns for it.
 *
 * @param {String} html
 * @param {Function} onTag e.g.
 *   function (sourcePosition, position, tag, html, isClosing)
 *   - sourcePosition: index of the tag's "<" in the input
 *   - position:       length of the output built so far
 *   - tag:            tag name as returned by getTagName
 *   - html:           the full source text of the tag
 *   - isClosing:      whether the tag starts with "</"
 * @param {Function} escapeHtml called with a plain-text fragment, returns the
 *   text to append to the output
 * @return {String}
 */
function parseTag(html, onTag, escapeHtml) {
  "use strict";

  var rethtml = "";
  var lastPos = 0; // start of the input not yet copied to the output
  var tagStart = false; // index of the pending "<", or false when outside a tag
  var quoteStart = false; // the open quote character, or false when not quoted
  var currentPos = 0;
  var len = html.length;
  var currentTagName = "";
  var currentHtml = "";

  chariterator: for (currentPos = 0; currentPos < len; currentPos++) {
    var c = html.charAt(currentPos);
    if (tagStart === false) {
      if (c === "<") {
        tagStart = currentPos;
        continue;
      }
    } else {
      if (quoteStart === false) {
        if (c === "<") {
          // A second "<" before any ">": the previous "<" was not a tag.
          // Flush everything before this one as text and restart here.
          rethtml += escapeHtml(html.slice(lastPos, currentPos));
          tagStart = currentPos;
          lastPos = currentPos;
          continue;
        }
        if (c === ">") {
          rethtml += escapeHtml(html.slice(lastPos, tagStart));
          currentHtml = html.slice(tagStart, currentPos + 1);
          currentTagName = getTagName(currentHtml);
          rethtml += onTag(
            tagStart,
            rethtml.length,
            currentTagName,
            currentHtml,
            isClosing(currentHtml)
          );
          lastPos = currentPos + 1;
          tagStart = false;
          continue;
        }
        if (c === '"' || c === "'") {
          // A quote only opens a quoted value when it directly follows "="
          // (optionally separated by whitespace). Walk backwards to check.
          // The walk always stops: the "<" at tagStart is neither whitespace
          // nor "=".
          var i = 1;
          var ic = html.charAt(currentPos - i);

          while (ic.trim() === "" || ic === "=") {
            if (ic === "=") {
              quoteStart = c;
              continue chariterator;
            }
            ic = html.charAt(currentPos - ++i);
          }
        }
      } else {
        if (c === quoteStart) {
          quoteStart = false;
          continue;
        }
      }
    }
  }
  // Anything left over (trailing text or an unterminated tag) is plain text.
  if (lastPos < html.length) {
    rethtml += escapeHtml(html.slice(lastPos));
  }

  return rethtml;
}

// Characters removed from attribute names before they reach `onAttr`.
// NOTE(review): the character class also permits a literal backslash ("\\");
// confirm whether that is intentional.
var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9\\_:.-]/gim;

/**
 * Parse input attributes and return the processed attributes.
 *
 * Each attribute found is reported to `onAttr(name, value)`; every truthy
 * return value is collected and the results are joined with a single space.
 *
 * @param {String} html e.g. `href="#" target="_blank"`
 * @param {Function} onAttr e.g. `function (name, value)`; `value` is "" for
 *   attributes without a value
 * @return {String}
 */
function parseAttr(html, onAttr) {
  "use strict";

  var lastPos = 0; // start of the token currently being scanned
  var lastMarkPos = 0; // index of the quote opening the current value
  var retAttrs = [];
  var tmpName = false; // pending attribute name, or false when none
  var len = html.length;
  var whitespaceNormalized = false;

  // Clean up the name, then report the attribute and keep a truthy result.
  function addAttr(name, value) {
    name = _.trim(name);
    name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
    if (name.length < 1) return;
    var ret = onAttr(name, value || "");
    if (ret) retAttrs.push(ret);
  }

  // Analyze the input character by character.
  for (var i = 0; i < len; i++) {
    var c = html.charAt(i);
    var v, j;
    if (tmpName === false && c === "=") {
      // End of an attribute name; locate the quote that opens its value.
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      lastMarkPos =
        html.charAt(lastPos) === '"' || html.charAt(lastPos) === "'"
          ? lastPos
          : findNextQuotationMark(html, i + 1);
      continue;
    }
    if (tmpName !== false) {
      if (i === lastMarkPos) {
        // Quoted value: jump straight to the matching closing quote.
        j = html.indexOf(c, i + 1);
        if (j === -1) {
          // Unterminated quote: the remainder is handled after the loop.
          break;
        } else {
          v = _.trim(html.slice(lastMarkPos + 1, j));
          addAttr(tmpName, v);
          tmpName = false;
          i = j;
          lastPos = i + 1;
          continue;
        }
      }
    }
    if (/\s/.test(c)) {
      // Turn every whitespace character into a plain space so the helper
      // scanners below only have to look for " ". Once done, the string holds
      // no other whitespace, so repeating the replacement would be a no-op.
      if (!whitespaceNormalized) {
        html = html.replace(/\s/g, " ");
        whitespaceNormalized = true;
      }
      if (tmpName === false) {
        j = findNextEqual(html, i);
        if (j === -1) {
          // No "=" follows: this token is an attribute without a value.
          v = _.trim(html.slice(lastPos, i));
          addAttr(v);
          tmpName = false;
          lastPos = i + 1;
          continue;
        } else {
          // Skip the spaces between the name and its "=".
          i = j - 1;
          continue;
        }
      } else {
        j = findBeforeEqual(html, i - 1);
        if (j === -1) {
          // Whitespace ends an unquoted (or unusually quoted) value.
          v = _.trim(html.slice(lastPos, i));
          v = stripQuoteWrap(v);
          addAttr(tmpName, v);
          tmpName = false;
          lastPos = i + 1;
          continue;
        } else {
          // Only spaces since the "=": the value has not started yet.
          continue;
        }
      }
    }
  }

  // Flush whatever is left after the last separator.
  if (lastPos < html.length) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
    }
  }

  return _.trim(retAttrs.join(" "));
}

/**
 * Starting at `i`, skip spaces and report where the next "=" is.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from
 * @return {Number} index of "=", or -1 when the first non-space character is
 *   not "=" or the end of the string is reached
 */
function findNextEqual(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  return -1;
}

/**
 * Starting at `i`, skip spaces and report where the next quote character is.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from
 * @return {Number} index of the quote, or -1 when the first non-space
 *   character is not a quote or the end of the string is reached
 */
function findNextQuotationMark(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "'" || c === '"') return i;
    return -1;
  }
  return -1;
}

/**
 * Starting at `i` and moving backwards, skip spaces and report where the
 * preceding "=" is.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from
 * @return {Number} index of "=", or -1 when the first non-space character is
 *   not "=" or the start of the string is passed
 */
function findBeforeEqual(str, i) {
  // Index 0 must be inspected too, otherwise an "=" at the very start of the
  // string is missed.
  for (; i >= 0; i--) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  return -1;
}

/**
 * Does the text start and end with the same kind of quote character?
 *
 * @param {String} text
 * @return {Boolean}
 */
function isQuoteWrapString(text) {
  var first = text[0];
  var last = text[text.length - 1];
  return (first === '"' && last === '"') || (first === "'" && last === "'");
}

/**
 * Remove one pair of wrapping quotes, if present.
 *
 * @param {String} text
 * @return {String} the text without its first and last character when it is
 *   quote-wrapped, otherwise the text unchanged
 */
function stripQuoteWrap(text) {
  if (isQuoteWrapString(text)) {
    return text.slice(1, text.length - 1);
  } else {
    return text;
  }
}

exports.parseTag = parseTag;
exports.parseAttr = parseAttr;