parse-matches.js 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. "use strict";
  2. Object.defineProperty(exports, "__esModule", { value: true });
  3. exports.excludeUnbalancedTrailingBracesAndPunctuation = exports.parseMatches = void 0;
  4. var regex_lib_1 = require("../regex-lib");
  5. var url_match_1 = require("../match/url-match");
  6. var utils_1 = require("../utils");
  7. var uri_utils_1 = require("./uri-utils");
  8. var email_utils_1 = require("./email-utils");
  9. var email_match_1 = require("../match/email-match");
  10. var hashtag_utils_1 = require("./hashtag-utils");
  11. var hashtag_match_1 = require("../match/hashtag-match");
  12. var mention_utils_1 = require("./mention-utils");
  13. var mention_match_1 = require("../match/mention-match");
  14. var phone_number_utils_1 = require("./phone-number-utils");
  15. var phone_match_1 = require("../match/phone-match");
  16. // For debugging: search for and uncomment other "For debugging" lines
  17. // import CliTable from 'cli-table';
  18. /**
  19. * Parses URL, email, twitter, mention, and hashtag matches from the given
  20. * `text`.
  21. */
  22. function parseMatches(text, args) {
  23. var tagBuilder = args.tagBuilder;
  24. var stripPrefix = args.stripPrefix;
  25. var stripTrailingSlash = args.stripTrailingSlash;
  26. var decodePercentEncoding = args.decodePercentEncoding;
  27. var hashtagServiceName = args.hashtagServiceName;
  28. var mentionServiceName = args.mentionServiceName;
  29. var matches = [];
  30. var textLen = text.length;
  31. // An array of all active state machines. Empty array means we're in the
  32. // "no url" state
  33. var stateMachines = [];
  34. // For debugging: search for and uncomment other "For debugging" lines
  35. // const table = new CliTable({
  36. // head: ['charIdx', 'char', 'states', 'charIdx', 'startIdx', 'reached accept state'],
  37. // });
  38. var charIdx = 0;
  39. for (; charIdx < textLen; charIdx++) {
  40. var char = text.charAt(charIdx);
  41. if (stateMachines.length === 0) {
  42. stateNoMatch(char);
  43. }
  44. else {
  45. // Must loop through the state machines backwards for when one
  46. // is removed
  47. for (var stateIdx = stateMachines.length - 1; stateIdx >= 0; stateIdx--) {
  48. var stateMachine = stateMachines[stateIdx];
  49. switch (stateMachine.state) {
  50. // Protocol-relative URL states
  51. case 11 /* ProtocolRelativeSlash1 */:
  52. stateProtocolRelativeSlash1(stateMachine, char);
  53. break;
  54. case 12 /* ProtocolRelativeSlash2 */:
  55. stateProtocolRelativeSlash2(stateMachine, char);
  56. break;
  57. case 0 /* SchemeChar */:
  58. stateSchemeChar(stateMachine, char);
  59. break;
  60. case 1 /* SchemeHyphen */:
  61. stateSchemeHyphen(stateMachine, char);
  62. break;
  63. case 2 /* SchemeColon */:
  64. stateSchemeColon(stateMachine, char);
  65. break;
  66. case 3 /* SchemeSlash1 */:
  67. stateSchemeSlash1(stateMachine, char);
  68. break;
  69. case 4 /* SchemeSlash2 */:
  70. stateSchemeSlash2(stateMachine, char);
  71. break;
  72. case 5 /* DomainLabelChar */:
  73. stateDomainLabelChar(stateMachine, char);
  74. break;
  75. case 6 /* DomainHyphen */:
  76. stateDomainHyphen(stateMachine, char);
  77. break;
  78. case 7 /* DomainDot */:
  79. stateDomainDot(stateMachine, char);
  80. break;
  81. case 13 /* IpV4Digit */:
  82. stateIpV4Digit(stateMachine, char);
  83. break;
  84. case 14 /* IpV4Dot */:
  85. stateIPv4Dot(stateMachine, char);
  86. break;
  87. case 8 /* PortColon */:
  88. statePortColon(stateMachine, char);
  89. break;
  90. case 9 /* PortNumber */:
  91. statePortNumber(stateMachine, char);
  92. break;
  93. case 10 /* Path */:
  94. statePath(stateMachine, char);
  95. break;
  96. // Email States
  97. case 15 /* EmailMailto_M */:
  98. stateEmailMailto_M(stateMachine, char);
  99. break;
  100. case 16 /* EmailMailto_A */:
  101. stateEmailMailto_A(stateMachine, char);
  102. break;
  103. case 17 /* EmailMailto_I */:
  104. stateEmailMailto_I(stateMachine, char);
  105. break;
  106. case 18 /* EmailMailto_L */:
  107. stateEmailMailto_L(stateMachine, char);
  108. break;
  109. case 19 /* EmailMailto_T */:
  110. stateEmailMailto_T(stateMachine, char);
  111. break;
  112. case 20 /* EmailMailto_O */:
  113. stateEmailMailto_O(stateMachine, char);
  114. break;
  115. case 21 /* EmailMailto_Colon */:
  116. stateEmailMailtoColon(stateMachine, char);
  117. break;
  118. case 22 /* EmailLocalPart */:
  119. stateEmailLocalPart(stateMachine, char);
  120. break;
  121. case 23 /* EmailLocalPartDot */:
  122. stateEmailLocalPartDot(stateMachine, char);
  123. break;
  124. case 24 /* EmailAtSign */:
  125. stateEmailAtSign(stateMachine, char);
  126. break;
  127. case 25 /* EmailDomainChar */:
  128. stateEmailDomainChar(stateMachine, char);
  129. break;
  130. case 26 /* EmailDomainHyphen */:
  131. stateEmailDomainHyphen(stateMachine, char);
  132. break;
  133. case 27 /* EmailDomainDot */:
  134. stateEmailDomainDot(stateMachine, char);
  135. break;
  136. // Hashtag states
  137. case 28 /* HashtagHashChar */:
  138. stateHashtagHashChar(stateMachine, char);
  139. break;
  140. case 29 /* HashtagTextChar */:
  141. stateHashtagTextChar(stateMachine, char);
  142. break;
  143. // Mention states
  144. case 30 /* MentionAtChar */:
  145. stateMentionAtChar(stateMachine, char);
  146. break;
  147. case 31 /* MentionTextChar */:
  148. stateMentionTextChar(stateMachine, char);
  149. break;
  150. // Phone number states
  151. case 32 /* PhoneNumberOpenParen */:
  152. statePhoneNumberOpenParen(stateMachine, char);
  153. break;
  154. case 33 /* PhoneNumberAreaCodeDigit1 */:
  155. statePhoneNumberAreaCodeDigit1(stateMachine, char);
  156. break;
  157. case 34 /* PhoneNumberAreaCodeDigit2 */:
  158. statePhoneNumberAreaCodeDigit2(stateMachine, char);
  159. break;
  160. case 35 /* PhoneNumberAreaCodeDigit3 */:
  161. statePhoneNumberAreaCodeDigit3(stateMachine, char);
  162. break;
  163. case 36 /* PhoneNumberCloseParen */:
  164. statePhoneNumberCloseParen(stateMachine, char);
  165. break;
  166. case 37 /* PhoneNumberPlus */:
  167. statePhoneNumberPlus(stateMachine, char);
  168. break;
  169. case 38 /* PhoneNumberDigit */:
  170. statePhoneNumberDigit(stateMachine, char);
  171. break;
  172. case 39 /* PhoneNumberSeparator */:
  173. statePhoneNumberSeparator(stateMachine, char);
  174. break;
  175. case 40 /* PhoneNumberControlChar */:
  176. statePhoneNumberControlChar(stateMachine, char);
  177. break;
  178. case 41 /* PhoneNumberPoundChar */:
  179. statePhoneNumberPoundChar(stateMachine, char);
  180. break;
  181. default:
  182. (0, utils_1.assertNever)(stateMachine.state);
  183. }
  184. }
  185. }
  186. // For debugging: search for and uncomment other "For debugging" lines
  187. // table.push([
  188. // charIdx,
  189. // char,
  190. // stateMachines.map(machine => State[machine.state]).join('\n') || '(none)',
  191. // charIdx,
  192. // stateMachines.map(m => m.startIdx).join('\n'),
  193. // stateMachines.map(m => m.acceptStateReached).join('\n'),
  194. // ]);
  195. }
  196. // Capture any valid match at the end of the string
  197. // Note: this loop must happen in reverse because
  198. // captureMatchIfValidAndRemove() removes state machines from the array
  199. // and we'll end up skipping every other one if we remove while looping
  200. // forward
  201. for (var i = stateMachines.length - 1; i >= 0; i--) {
  202. stateMachines.forEach(function (stateMachine) { return captureMatchIfValidAndRemove(stateMachine); });
  203. }
  204. // For debugging: search for and uncomment other "For debugging" lines
  205. // console.log(`\nRead string:\n ${text}`);
  206. // console.log(table.toString());
  207. return matches;
  // Handles the "no match" state (i.e. when no state machines exist): looks at
  // `char` and starts every state machine that could begin a match at the
  // current `charIdx`. Characters that cannot start anything create no
  // machines, which keeps the parser in the "non-url" state.
  function stateNoMatch(char) {
    switch (char) {
      case '#':
        // Hash char, start a Hashtag match
        stateMachines.push(createHashtagStateMachine(charIdx, 28 /* HashtagHashChar */));
        return;
      case '@':
        // '@' char, start a Mention match
        stateMachines.push(createMentionStateMachine(charIdx, 30 /* MentionAtChar */));
        return;
      case '/':
        // A slash could begin a protocol-relative URL
        stateMachines.push(createTldUrlStateMachine(charIdx, 11 /* ProtocolRelativeSlash1 */));
        return;
      case '+':
        // A '+' char can start a Phone number
        stateMachines.push(createPhoneNumberStateMachine(charIdx, 37 /* PhoneNumberPlus */));
        return;
      case '(':
        stateMachines.push(createPhoneNumberStateMachine(charIdx, 32 /* PhoneNumberOpenParen */));
        return;
    }
    // The checks below are independent of one another: a single character
    // may start several candidate matches at once.
    if (regex_lib_1.digitRe.test(char)) {
      // A digit could start a phone number...
      stateMachines.push(createPhoneNumberStateMachine(charIdx, 38 /* PhoneNumberDigit */));
      // ...or an IP address
      stateMachines.push(createIpV4UrlStateMachine(charIdx, 13 /* IpV4Digit */));
    }
    if ((0, email_utils_1.isEmailLocalPartStartChar)(char)) {
      // Any email local part. An 'm' character in particular could start
      // a 'mailto:' match
      var emailStartState = 22 /* EmailLocalPart */;
      if (char.toLowerCase() === 'm') {
        emailStartState = 15 /* EmailMailto_M */;
      }
      stateMachines.push(createEmailStateMachine(charIdx, emailStartState));
    }
    if ((0, uri_utils_1.isSchemeStartChar)(char)) {
      // An uppercase or lowercase letter may start a scheme match
      stateMachines.push(createSchemeUrlStateMachine(charIdx, 0 /* SchemeChar */));
    }
    if (regex_lib_1.alphaNumericAndMarksRe.test(char)) {
      // A unicode alpha character or digit could start a domain name label
      // for a TLD match
      stateMachines.push(createTldUrlStateMachine(charIdx, 5 /* DomainLabelChar */));
    }
  }
  // Reads the characters of a potential URI scheme.
  // Implements ABNF: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  function stateSchemeChar(stateMachine, char) {
    switch (char) {
      case ':':
        stateMachine.state = 2 /* SchemeColon */;
        return;
      case '-':
        stateMachine.state = 1 /* SchemeHyphen */;
        return;
    }
    // Any other scheme character keeps us in the SchemeChar state; anything
    // that is not a scheme character means this is not a scheme.
    if (!(0, uri_utils_1.isSchemeChar)(char)) {
      (0, utils_1.remove)(stateMachines, stateMachine);
    }
  }
  // Handles the state after a '-' has been read inside a potential scheme.
  function stateSchemeHyphen(stateMachine, char) {
    if (char === '-') {
      // Stay in SchemeHyphen state
      // TODO: Should a colon following a dash be counted as the end of the scheme?
      return;
    }
    if (char === '/') {
      // Not a valid scheme match, but may be the start of a
      // protocol-relative match (such as //google.com)
      (0, utils_1.remove)(stateMachines, stateMachine);
      stateMachines.push(createTldUrlStateMachine(charIdx, 11 /* ProtocolRelativeSlash1 */));
      return;
    }
    if ((0, uri_utils_1.isSchemeChar)(char)) {
      stateMachine.state = 0 /* SchemeChar */;
      return;
    }
    // Any other character, not a scheme
    (0, utils_1.remove)(stateMachines, stateMachine);
  }
  // Handles the state after the ':' that ends a potential scheme.
  function stateSchemeColon(stateMachine, char) {
    if (char === '/') {
      stateMachine.state = 3 /* SchemeSlash1 */;
      return;
    }
    // A '.' means we've read something like 'hello:.' - don't capture. Any
    // other character that cannot start a domain label is rejected as well.
    if (char === '.' || !(0, uri_utils_1.isDomainLabelStartChar)(char)) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 5 /* DomainLabelChar */;
    // It's possible that we read an "introduction" piece of text, and the
    // character after the current colon actually starts an actual scheme.
    // An example of this is:
    //     "The link:http://google.com"
    // Hence, start a new machine to capture this match if so
    if ((0, uri_utils_1.isSchemeStartChar)(char)) {
      stateMachines.push(createSchemeUrlStateMachine(charIdx, 0 /* SchemeChar */));
    }
  }
  // Handles the state after the first '/' following a scheme's ':'.
  function stateSchemeSlash1(stateMachine, char) {
    if (char === '/') {
      stateMachine.state = 4 /* SchemeSlash2 */;
      return;
    }
    if ((0, uri_utils_1.isPathChar)(char)) {
      // A path character directly after the single slash: move to the
      // Path state and flag the accept state as reached
      stateMachine.state = 10 /* Path */;
      stateMachine.acceptStateReached = true;
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state after the second '/' following a scheme's ':'.
  function stateSchemeSlash2(stateMachine, char) {
    if (char === '/') {
      // 3rd slash, must be an absolute path (path-absolute in the ABNF),
      // such as in a file:///c:/windows/etc. See
      // https://tools.ietf.org/html/rfc3986#appendix-A
      stateMachine.state = 10 /* Path */;
      return;
    }
    if ((0, uri_utils_1.isDomainLabelStartChar)(char)) {
      // start of "authority" section - see
      // https://tools.ietf.org/html/rfc3986#appendix-A
      stateMachine.state = 5 /* DomainLabelChar */;
      stateMachine.acceptStateReached = true;
      return;
    }
    // not valid
    (0, utils_1.remove)(stateMachines, stateMachine);
  }
  // Handles the state after a single '/' was read from the NonUrl state.
  function stateProtocolRelativeSlash1(stateMachine, char) {
    if (char !== '/') {
      // Anything else, cannot be the start of a protocol-relative URL.
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 12 /* ProtocolRelativeSlash2 */;
  }
  // Handles the state after a second '/' was read, which could start a
  // protocol-relative URL.
  function stateProtocolRelativeSlash2(stateMachine, char) {
    var startsDomainLabel = (0, uri_utils_1.isDomainLabelStartChar)(char);
    if (!startsDomainLabel) {
      // Anything else, not a URL
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 5 /* DomainLabelChar */;
  }
  // Handles the state when we have read a domain label character.
  function stateDomainLabelChar(stateMachine, char) {
    switch (char) {
      case '.':
        stateMachine.state = 7 /* DomainDot */;
        return;
      case '-':
        stateMachine.state = 6 /* DomainHyphen */;
        return;
      case ':':
        // Beginning of a port number, end the domain name
        stateMachine.state = 8 /* PortColon */;
        return;
    }
    if ((0, uri_utils_1.isUrlSuffixStartChar)(char)) {
      // '/', '?', or '#'
      stateMachine.state = 10 /* Path */;
      return;
    }
    // Another domain label character keeps us in the DomainLabelChar state;
    // anything else ends the domain name.
    if (!(0, uri_utils_1.isDomainLabelChar)(char)) {
      captureMatchIfValidAndRemove(stateMachine);
    }
  }
  // Handles the state after a '-' has been read inside a domain label.
  function stateDomainHyphen(stateMachine, char) {
    if (char === '-') {
      // Remain in the DomainHyphen state
      return;
    }
    // It is not valid to have a '-.' in a domain label, so a '.' ends the
    // match attempt just like any other non-label-start character does.
    if (char !== '.' && (0, uri_utils_1.isDomainLabelStartChar)(char)) {
      stateMachine.state = 5 /* DomainLabelChar */;
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state after a '.' has been read inside a domain name.
  function stateDomainDot(stateMachine, char) {
    if (char !== '.' && (0, uri_utils_1.isDomainLabelStartChar)(char)) {
      stateMachine.state = 5 /* DomainLabelChar */;
      // after hitting a dot, and then another domain label, we've reached
      // an accept state
      stateMachine.acceptStateReached = true;
      return;
    }
    // Either a second '.' or some other character that ends the domain
    // name. Domain names cannot have multiple '.'s next to each other, but
    // it's possible we've already read a valid domain name and the '..'
    // sequence just forms an ellipsis at the end of a sentence.
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state when we have read a digit of a potential IPv4 address.
  function stateIpV4Digit(stateMachine, char) {
    if (char === '.') {
      stateMachine.state = 14 /* IpV4Dot */;
      return;
    }
    if (char === ':') {
      // Beginning of a port number
      stateMachine.state = 8 /* PortColon */;
      return;
    }
    if (regex_lib_1.digitRe.test(char)) {
      // stay in the IPv4 digit state
      return;
    }
    if ((0, uri_utils_1.isUrlSuffixStartChar)(char)) {
      stateMachine.state = 10 /* Path */;
      return;
    }
    if (regex_lib_1.alphaNumericAndMarksRe.test(char)) {
      // If we hit an alpha character, must not be an IPv4
      // Example of this: 1.2.3.4abc
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state after a '.' has been read in a potential IPv4 address.
  function stateIPv4Dot(stateMachine, char) {
    if (!regex_lib_1.digitRe.test(char)) {
      captureMatchIfValidAndRemove(stateMachine);
      return;
    }
    stateMachine.octetsEncountered++;
    // Once we have encountered 4 octets, it's *potentially* a valid IPv4
    // address. Our IPv4 regex will confirm the match later though to make
    // sure each octet is in the 0-255 range, and there's exactly 4 octets
    // (not 5 or more)
    if (stateMachine.octetsEncountered === 4) {
      stateMachine.acceptStateReached = true;
    }
    stateMachine.state = 13 /* IpV4Digit */;
  }
  // Handles the state after the ':' that may introduce a port number.
  function statePortColon(stateMachine, char) {
    var isDigit = regex_lib_1.digitRe.test(char);
    if (!isDigit) {
      captureMatchIfValidAndRemove(stateMachine);
      return;
    }
    stateMachine.state = 9 /* PortNumber */;
  }
  // Handles the state while reading the digits of a port number.
  function statePortNumber(stateMachine, char) {
    if (regex_lib_1.digitRe.test(char)) {
      // Stay in port number state
      return;
    }
    if ((0, uri_utils_1.isUrlSuffixStartChar)(char)) {
      // '/', '?', or '#'
      stateMachine.state = 10 /* Path */;
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state while reading path characters: stays in the Path state
  // for every path character and ends the match attempt on the first
  // character that is not one.
  function statePath(stateMachine, char) {
    if (!(0, uri_utils_1.isPathChar)(char)) {
      captureMatchIfValidAndRemove(stateMachine);
    }
  }
  // Handles the state after the 'm' of a possible 'mailto:' prefix. If the
  // prefix is broken, the character is handled as an email local part char.
  function stateEmailMailto_M(stateMachine, char) {
    var continuesPrefix = char.toLowerCase() === 'a';
    if (!continuesPrefix) {
      stateEmailLocalPart(stateMachine, char);
      return;
    }
    stateMachine.state = 16 /* EmailMailto_A */;
  }
  // Handles the state after the 'ma' of a possible 'mailto:' prefix. If the
  // prefix is broken, the character is handled as an email local part char.
  function stateEmailMailto_A(stateMachine, char) {
    var continuesPrefix = char.toLowerCase() === 'i';
    if (!continuesPrefix) {
      stateEmailLocalPart(stateMachine, char);
      return;
    }
    stateMachine.state = 17 /* EmailMailto_I */;
  }
  // Handles the state after the 'mai' of a possible 'mailto:' prefix. If the
  // prefix is broken, the character is handled as an email local part char.
  function stateEmailMailto_I(stateMachine, char) {
    var continuesPrefix = char.toLowerCase() === 'l';
    if (!continuesPrefix) {
      stateEmailLocalPart(stateMachine, char);
      return;
    }
    stateMachine.state = 18 /* EmailMailto_L */;
  }
  // Handles the state after the 'mail' of a possible 'mailto:' prefix. If the
  // prefix is broken, the character is handled as an email local part char.
  function stateEmailMailto_L(stateMachine, char) {
    var continuesPrefix = char.toLowerCase() === 't';
    if (!continuesPrefix) {
      stateEmailLocalPart(stateMachine, char);
      return;
    }
    stateMachine.state = 19 /* EmailMailto_T */;
  }
  // Handles the state after the 'mailt' of a possible 'mailto:' prefix. If
  // the prefix is broken, the character is handled as an email local part
  // char.
  function stateEmailMailto_T(stateMachine, char) {
    var continuesPrefix = char.toLowerCase() === 'o';
    if (!continuesPrefix) {
      stateEmailLocalPart(stateMachine, char);
      return;
    }
    stateMachine.state = 20 /* EmailMailto_O */;
  }
  /**
   * Handles the state after the 'mailto' of a possible 'mailto:' prefix.
   *
   * @param stateMachine The email state machine being advanced. Its `state`
   *   is set to EmailMailto_Colon when `char` is ':'.
   * @param char The character currently being read.
   */
  function stateEmailMailto_O(stateMachine, char) {
    // ':' has no upper/lower case forms, so it is compared directly rather
    // than being lower-cased first like the letters of the prefix.
    if (char === ':') {
      stateMachine.state = 21 /* EmailMailto_Colon */;
    }
    else {
      // The 'mailto' prefix is broken: treat the character as part of an
      // ordinary email local part instead.
      stateEmailLocalPart(stateMachine, char);
    }
  }
  // Handles the state after a complete 'mailto:' prefix has been read.
  function stateEmailMailtoColon(stateMachine, char) {
    var isLocalPartChar = (0, email_utils_1.isEmailLocalPartChar)(char);
    if (!isLocalPartChar) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 22 /* EmailLocalPart */;
  }
  // Handles the state when we're currently in the "local part" of an email
  // address (as opposed to the "domain part").
  function stateEmailLocalPart(stateMachine, char) {
    switch (char) {
      case '.':
        stateMachine.state = 23 /* EmailLocalPartDot */;
        return;
      case '@':
        stateMachine.state = 24 /* EmailAtSign */;
        return;
    }
    if ((0, email_utils_1.isEmailLocalPartChar)(char)) {
      // Stay in the "local part" of the email address. The state is set
      // explicitly because this function is also called from the 'mailto'
      // states (when the 'mailto' prefix itself has been broken).
      stateMachine.state = 22 /* EmailLocalPart */;
      return;
    }
    // not an email address character
    (0, utils_1.remove)(stateMachines, stateMachine);
  }
  // Handles the state where we've read a '.' inside the local part of an
  // email address.
  function stateEmailLocalPartDot(stateMachine, char) {
    // A second '.' in a row, or an '@' immediately after the dot, is not a
    // valid email address local part, so neither is tested as a local part
    // character.
    var continuesLocalPart = char !== '.' &&
      char !== '@' &&
      (0, email_utils_1.isEmailLocalPartChar)(char);
    if (continuesLocalPart) {
      stateMachine.state = 22 /* EmailLocalPart */;
      return;
    }
    // Anything else, not an email address
    (0, utils_1.remove)(stateMachines, stateMachine);
  }
  // Handles the state after the '@' of a potential email address.
  function stateEmailAtSign(stateMachine, char) {
    var startsDomainLabel = (0, uri_utils_1.isDomainLabelStartChar)(char);
    if (!startsDomainLabel) {
      // Anything else, not an email address
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 25 /* EmailDomainChar */;
  }
  // Handles the state while reading a character of an email address's
  // domain part.
  function stateEmailDomainChar(stateMachine, char) {
    switch (char) {
      case '.':
        stateMachine.state = 27 /* EmailDomainDot */;
        return;
      case '-':
        stateMachine.state = 26 /* EmailDomainHyphen */;
        return;
    }
    // Another domain label character keeps us in the EmailDomainChar state.
    // Anything else, we potentially matched if the criteria has been met.
    if (!(0, uri_utils_1.isDomainLabelChar)(char)) {
      captureMatchIfValidAndRemove(stateMachine);
    }
  }
  // Handles the state after a '-' has been read in an email address's domain
  // part.
  function stateEmailDomainHyphen(stateMachine, char) {
    // Not valid to have two hyphens ("--") or hyphen+dot ("-."), so those
    // characters are never tested as domain label characters.
    var continuesLabel = char !== '-' &&
      char !== '.' &&
      (0, uri_utils_1.isDomainLabelChar)(char);
    if (continuesLabel) {
      stateMachine.state = 25 /* EmailDomainChar */;
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state after a '.' has been read in an email address's domain
  // part.
  function stateEmailDomainDot(stateMachine, char) {
    // not valid to have two dots ("..") or dot+hyphen (".-"), so those
    // characters are never tested as domain label start characters.
    var startsLabel = char !== '.' &&
      char !== '-' &&
      (0, uri_utils_1.isDomainLabelStartChar)(char);
    if (!startsLabel) {
      captureMatchIfValidAndRemove(stateMachine);
      return;
    }
    stateMachine.state = 25 /* EmailDomainChar */;
    // After having read a '.' and then a valid domain character, we now
    // know that the domain part of the email is valid, and we have found at
    // least a partial EmailMatch (however, the email address may have
    // additional characters from this point)
    stateMachine.acceptStateReached = true;
  }
  // Handles the state when we've just encountered a '#' character.
  function stateHashtagHashChar(stateMachine, char) {
    var isTextChar = (0, hashtag_utils_1.isHashtagTextChar)(char);
    if (!isTextChar) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    // '#' char with valid hash text char following
    stateMachine.state = 29 /* HashtagTextChar */;
    stateMachine.acceptStateReached = true;
  }
  // Handles the state when we're currently in the hash tag's text chars:
  // keeps reading while the characters are hashtag text characters and ends
  // the match attempt on the first one that is not.
  function stateHashtagTextChar(stateMachine, char) {
    if (!(0, hashtag_utils_1.isHashtagTextChar)(char)) {
      captureMatchIfValidAndRemove(stateMachine);
    }
  }
  // Handles the state when we've just encountered a '@' character.
  function stateMentionAtChar(stateMachine, char) {
    var isTextChar = (0, mention_utils_1.isMentionTextChar)(char);
    if (!isTextChar) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    // '@' char with valid mention text char following
    stateMachine.state = 31 /* MentionTextChar */;
    stateMachine.acceptStateReached = true;
  }
  // Handles the state when we're currently in the mention's text chars.
  function stateMentionTextChar(stateMachine, char) {
    if ((0, mention_utils_1.isMentionTextChar)(char)) {
      // Continue reading characters in the MentionTextChar state
      return;
    }
    if (regex_lib_1.alphaNumericAndMarksRe.test(char)) {
      // Char is invalid for a mention text char, not a valid match. Note
      // that ascii alphanumeric chars are okay (they are accepted by the
      // previous check), but others are not
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
  }
  // Handles the state after a '+' that may start a phone number.
  function statePhoneNumberPlus(stateMachine, char) {
    if (regex_lib_1.digitRe.test(char)) {
      stateMachine.state = 38 /* PhoneNumberDigit */;
      return;
    }
    (0, utils_1.remove)(stateMachines, stateMachine);
    // This character may start a new match. Add states for it
    stateNoMatch(char);
  }
  // Handles the state after a '(' that may open a phone number's area code.
  function statePhoneNumberOpenParen(stateMachine, char) {
    var isDigit = regex_lib_1.digitRe.test(char);
    if (isDigit) {
      stateMachine.state = 33 /* PhoneNumberAreaCodeDigit1 */;
    }
    else {
      (0, utils_1.remove)(stateMachines, stateMachine);
    }
    // In both cases: it's also possible that the paren was just an open
    // brace for a piece of text. Start other machines
    stateNoMatch(char);
  }
  // Handles the state after the first digit inside a parenthesized area
  // code: only another digit keeps the machine alive.
  function statePhoneNumberAreaCodeDigit1(stateMachine, char) {
    var isDigit = regex_lib_1.digitRe.test(char);
    if (!isDigit) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 34 /* PhoneNumberAreaCodeDigit2 */;
  }
  // Handles the state after the second digit inside a parenthesized area
  // code: only another digit keeps the machine alive.
  function statePhoneNumberAreaCodeDigit2(stateMachine, char) {
    var isDigit = regex_lib_1.digitRe.test(char);
    if (!isDigit) {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 35 /* PhoneNumberAreaCodeDigit3 */;
  }
  // Handles the state after the third digit inside a parenthesized area
  // code: only the closing ')' keeps the machine alive.
  function statePhoneNumberAreaCodeDigit3(stateMachine, char) {
    if (char !== ')') {
      (0, utils_1.remove)(stateMachines, stateMachine);
      return;
    }
    stateMachine.state = 36 /* PhoneNumberCloseParen */;
  }
  // Handles the state after the ')' that closes a parenthesized area code.
  function statePhoneNumberCloseParen(stateMachine, char) {
    if (regex_lib_1.digitRe.test(char)) {
      stateMachine.state = 38 /* PhoneNumberDigit */;
      return;
    }
    if ((0, phone_number_utils_1.isPhoneNumberSeparatorChar)(char)) {
      stateMachine.state = 39 /* PhoneNumberSeparator */;
      return;
    }
    (0, utils_1.remove)(stateMachines, stateMachine);
  }
  // Handles the state when we have read a digit of a potential phone number.
  function statePhoneNumberDigit(stateMachine, char) {
    // For now, if we've reached any digits, we'll say that the machine has
    // reached its accept state. The phone regex will confirm the match
    // later.
    // Alternatively, we could count the number of digits to avoid invoking
    // the phone number regex
    stateMachine.acceptStateReached = true;
    if ((0, phone_number_utils_1.isPhoneNumberControlChar)(char)) {
      stateMachine.state = 40 /* PhoneNumberControlChar */;
      return;
    }
    if (char === '#') {
      stateMachine.state = 41 /* PhoneNumberPoundChar */;
      return;
    }
    if (regex_lib_1.digitRe.test(char)) {
      // Stay in the phone number digit state
      return;
    }
    if (char === '(') {
      stateMachine.state = 32 /* PhoneNumberOpenParen */;
      return;
    }
    if ((0, phone_number_utils_1.isPhoneNumberSeparatorChar)(char)) {
      stateMachine.state = 39 /* PhoneNumberSeparator */;
      return;
    }
    captureMatchIfValidAndRemove(stateMachine);
    // The transition from a digit character to a letter can be the start of
    // a new scheme URL match
    if ((0, uri_utils_1.isSchemeStartChar)(char)) {
      stateMachines.push(createSchemeUrlStateMachine(charIdx, 0 /* SchemeChar */));
    }
  }
  /**
   * Phone number transition for the "separator" state.
   *
   * - digit         -> state 38 (PhoneNumberDigit)
   * - '('           -> state 32 (PhoneNumberOpenParen)
   * - anything else -> the match is captured (if valid) and the machine
   *                    removed, then the character is handed to
   *                    `stateNoMatch` because it may begin a new match
   *
   * @param stateMachine The phone number state machine being advanced.
   * @param char The character currently being read.
   */
  function statePhoneNumberSeparator(stateMachine, char) {
      if (regex_lib_1.digitRe.test(char)) {
          stateMachine.state = 38 /* PhoneNumberDigit */;
          return;
      }
      if (char === '(') {
          stateMachine.state = 32 /* PhoneNumberOpenParen */;
          return;
      }
      captureMatchIfValidAndRemove(stateMachine);
      // Give the character a chance to start a new match
      stateNoMatch(char);
  }
  /**
   * Phone number transition for the "control char" state.
   *
   * The ";" character is "wait" in a phone number.
   * The "," character is "pause" in a phone number.
   *
   * - control char  -> stays in this state
   * - '#'           -> state 41 (PhoneNumberPoundChar)
   * - digit         -> state 38 (PhoneNumberDigit)
   * - anything else -> the match is captured (if valid) and the machine
   *                    removed
   *
   * @param stateMachine The phone number state machine being advanced.
   * @param char The character currently being read.
   */
  function statePhoneNumberControlChar(stateMachine, char) {
      if ((0, phone_number_utils_1.isPhoneNumberControlChar)(char)) {
          // Another control char: nothing to change
          return;
      }
      if (char === '#') {
          stateMachine.state = 41 /* PhoneNumberPoundChar */;
          return;
      }
      if (regex_lib_1.digitRe.test(char)) {
          stateMachine.state = 38 /* PhoneNumberDigit */;
          return;
      }
      captureMatchIfValidAndRemove(stateMachine);
  }
  /**
   * Phone number transition for the "pound char" state.
   *
   * The "#" character is "pound" in a phone number.
   *
   * - control char  -> state 40 (PhoneNumberControlChar)
   * - digit         -> the machine is removed without capturing a match
   * - anything else -> the match is captured (if valid) and the machine
   *                    removed
   *
   * @param stateMachine The phone number state machine being advanced.
   * @param char The character currently being read.
   */
  function statePhoneNumberPoundChar(stateMachine, char) {
      if ((0, phone_number_utils_1.isPhoneNumberControlChar)(char)) {
          stateMachine.state = 40 /* PhoneNumberControlChar */;
          return;
      }
      if (regex_lib_1.digitRe.test(char)) {
          // Some of the older tests treat a digit following a '#' sign as
          // making the match invalid. TODO: Revisit if this is true
          (0, utils_1.remove)(stateMachines, stateMachine);
          return;
      }
      captureMatchIfValidAndRemove(stateMachine);
  }
  /*
   * Removes the given state machine from `stateMachines` and, if it has
   * reached an accept state and its text passes the validity check for its
   * type (e.g. a full domain name for a TLD match), records the match in
   * `matches`.
   *
   * The candidate text runs from the machine's start index up to (but not
   * including) the current character index, with unbalanced trailing braces
   * and trailing punctuation stripped off.
   */
  function captureMatchIfValidAndRemove(stateMachine) {
      // The machine is always removed, and removed up front, because
      // several paths below return early
      (0, utils_1.remove)(stateMachines, stateMachine);
      // A machine which never reached an "accept" state can't be a match
      if (!stateMachine.acceptStateReached) {
          return;
      }
      var offset = stateMachine.startIdx;
      // Strip unbalanced closing braces (parens, square brackets, or curly
      // brackets) from the end of the text. This handles situations like:
      //   The link (google.com)
      // and
      //   Check out this link here (en.wikipedia.org/wiki/IANA_(disambiguation))
      //
      // Trailing punctuation chars such as '?', ',', ':', '.', etc. are
      // stripped as well.
      var candidate = excludeUnbalancedTrailingBracesAndPunctuation(text.slice(stateMachine.startIdx, charIdx));
      switch (stateMachine.type) {
          case 'url': {
              // A URL directly preceded by an '@' character would be part
              // of an email address, so don't match it as a URL
              if (text.charAt(stateMachine.startIdx - 1) === '@') {
                  return;
              }
              // 'www' matches are generalized as 'tld' matches by this
              // parser; the match type is passed along so the distinction
              // can be corrected for backward compatibility (users may turn
              // off 'www' matches)
              var urlMatchType = stateMachine.matchType;
              switch (urlMatchType) {
                  case 'scheme': {
                      // Many characters are accepted in a URL's scheme (like
                      // `fake://test.com`). When whitespace is missing in
                      // front of an obvious link (for example:
                      // `nowhitespacehttp://www.test.com`) the match should
                      // begin at the http:// part, so look for a common
                      // scheme and shift the start of the match to it.
                      var commonScheme = uri_utils_1.httpSchemeRe.exec(candidate);
                      if (commonScheme) {
                          offset = offset + commonScheme.index;
                          candidate = candidate.slice(commonScheme.index);
                      }
                      if (!(0, uri_utils_1.isValidSchemeUrl)(candidate)) {
                          return; // not a valid match
                      }
                      break;
                  }
                  case 'tld':
                      if (!(0, uri_utils_1.isValidTldMatch)(candidate)) {
                          return; // not a valid match
                      }
                      break;
                  case 'ipV4':
                      if (!(0, uri_utils_1.isValidIpV4Address)(candidate)) {
                          return; // not a valid match
                      }
                      break;
                  default:
                      (0, utils_1.assertNever)(urlMatchType);
              }
              matches.push(new url_match_1.UrlMatch({
                  tagBuilder: tagBuilder,
                  matchedText: candidate,
                  offset: offset,
                  urlMatchType: urlMatchType,
                  url: candidate,
                  protocolRelativeMatch: candidate.slice(0, 2) === '//',
                  // TODO: Do these settings need to be passed to the match,
                  // or should we handle them here in UrlMatcher?
                  stripPrefix: stripPrefix,
                  stripTrailingSlash: stripTrailingSlash,
                  decodePercentEncoding: decodePercentEncoding,
              }));
              break;
          }
          case 'email':
              // only emails which pass validation are added to the matches
              if ((0, email_utils_1.isValidEmail)(candidate)) {
                  matches.push(new email_match_1.EmailMatch({
                      tagBuilder: tagBuilder,
                      matchedText: candidate,
                      offset: offset,
                      email: candidate.replace(email_utils_1.mailtoSchemePrefixRe, ''),
                  }));
              }
              break;
          case 'hashtag':
              if ((0, hashtag_utils_1.isValidHashtag)(candidate)) {
                  matches.push(new hashtag_match_1.HashtagMatch({
                      tagBuilder: tagBuilder,
                      matchedText: candidate,
                      offset: offset,
                      serviceName: hashtagServiceName,
                      hashtag: candidate.slice(1),
                  }));
              }
              break;
          case 'mention':
              if ((0, mention_utils_1.isValidMention)(candidate, mentionServiceName)) {
                  matches.push(new mention_match_1.MentionMatch({
                      tagBuilder: tagBuilder,
                      matchedText: candidate,
                      offset: offset,
                      serviceName: mentionServiceName,
                      mention: candidate.slice(1), // drop the leading '@' character
                  }));
              }
              break;
          case 'phone':
              // drop trailing spaces which the state machine treated as
              // "separator" chars
              candidate = candidate.replace(/ +$/g, '');
              if ((0, phone_number_utils_1.isValidPhoneNumber)(candidate)) {
                  // keep only digits, commas, semicolons and '#' characters
                  var cleanNumber = candidate.replace(/[^0-9,;#]/g, '');
                  matches.push(new phone_match_1.PhoneMatch({
                      tagBuilder: tagBuilder,
                      matchedText: candidate,
                      offset: offset,
                      number: cleanNumber,
                      plusSign: candidate.charAt(0) === '+',
                  }));
              }
              break;
          default:
              (0, utils_1.assertNever)(stateMachine);
      }
  }
  955. }
  956. exports.parseMatches = parseMatches;
  // Matches any one of the supported opening braces
  var openBraceRe = /[\(\{\[]/;
  // Matches any one of the supported closing braces
  var closeBraceRe = /[\)\}\]]/;
  // Lookup from a closing brace to the opening brace which pairs with it
  var oppositeBrace = { ')': '(', '}': '{', ']': '[' };
  /**
   * Strips unmatched closing parentheses, square brackets and curly brackets,
   * along with punctuation which may not end a URL, from the end of a match.
   *
   * Closing braces are valid URL path characters, so a match such as
   * "wikipedia.com/something_(disambiguation)" must keep its final
   * parenthesis. When the whole URL is itself wrapped in braces, however, as
   * in:
   *
   *     "(wikipedia.com/something_(disambiguation))"
   *
   * the state machine hands over one closing parenthesis too many, and that
   * last one must *not* become part of the URL. Square brackets (e.g. PHP
   * array parameters) get the same treatment:
   *
   *     "[http://www.example.com/foo.php?bar[]=1&bar[]=2&bar[]=3]"
   *
   * where only the very last closing square bracket is dropped.
   *
   * The decision is made by tallying braces over the entire text: a trailing
   * closing brace is dropped only while its brace type has more closers than
   * openers.
   *
   * @param matchedText The full matched URL/email/hashtag/etc. from the state
   *   machine parser.
   * @return The matched text with the extraneous suffix characters removed.
   */
  function excludeUnbalancedTrailingBracesAndPunctuation(matchedText) {
      // Net tally per brace type: +1 for every opener, -1 for every closer.
      // A negative tally means there are surplus closing braces.
      var balance = { '(': 0, '{': 0, '[': 0 };
      var current;
      for (var pos = 0; pos < matchedText.length; pos++) {
          current = matchedText.charAt(pos);
          if (openBraceRe.test(current)) {
              balance[current] += 1;
          }
          else if (closeBraceRe.test(current)) {
              balance[oppositeBrace[current]] -= 1;
          }
      }
      // Number of leading characters to keep; shrinks as the suffix is
      // walked backwards
      var keepLength = matchedText.length;
      while (keepLength > 0) {
          current = matchedText.charAt(keepLength - 1);
          if (closeBraceRe.test(current)) {
              var opener = oppositeBrace[current];
              if (balance[opener] >= 0) {
                  break; // this closer has a partner, so it stays
              }
              balance[opener] += 1;
          }
          else if (!uri_utils_1.urlSuffixedCharsNotAllowedAtEndRe.test(current)) {
              break; // an ordinary character ends the walk back
          }
          // Drop the surplus closing brace, or the punctuation char like
          // '?', ',', ':', '.', etc.
          keepLength -= 1;
      }
      return matchedText.slice(0, keepLength);
  }
  1032. exports.excludeUnbalancedTrailingBracesAndPunctuation = excludeUnbalancedTrailingBracesAndPunctuation;
  /**
   * Builds the record for a URL state machine of match type 'scheme'.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createSchemeUrlStateMachine(startIdx, state) {
      var machine = { type: 'url' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      machine.matchType = 'scheme';
      return machine;
  }
  /**
   * Builds the record for a URL state machine of match type 'tld'.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createTldUrlStateMachine(startIdx, state) {
      var machine = { type: 'url' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      machine.matchType = 'tld';
      return machine;
  }
  /**
   * Builds the record for a URL state machine of match type 'ipV4'.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createIpV4UrlStateMachine(startIdx, state) {
      var machine = { type: 'url' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      machine.matchType = 'ipV4';
      // The count begins at 1 since this machine is created upon
      // encountering the first octet
      machine.octetsEncountered = 1;
      return machine;
  }
  /**
   * Builds the record for an email state machine.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createEmailStateMachine(startIdx, state) {
      var machine = { type: 'email' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      return machine;
  }
  /**
   * Builds the record for a hashtag state machine.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createHashtagStateMachine(startIdx, state) {
      var machine = { type: 'hashtag' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      return machine;
  }
  /**
   * Builds the record for a mention state machine.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createMentionStateMachine(startIdx, state) {
      var machine = { type: 'mention' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      return machine;
  }
  /**
   * Builds the record for a phone number state machine.
   *
   * @param startIdx Index at which the potential match begins.
   * @param state The machine's initial state.
   * @return A new machine record which has not yet reached an accept state.
   */
  function createPhoneNumberStateMachine(startIdx, state) {
      var machine = { type: 'phone' };
      machine.startIdx = startIdx;
      machine.state = state;
      machine.acceptStateReached = false;
      return machine;
  }
  1093. //# sourceMappingURL=parse-matches.js.map