  // uri-utils.js
  import { alphaNumericAndMarksRe, letterRe, digitRe } from '../regex-lib';
  import { tldRegex } from './tld-regex';
  /**
   * A regular expression that is simply the character class of the characters
   * that may be used in a domain name, minus the '-' or '.'
   *
   * This is an alias of `alphaNumericAndMarksRe` from '../regex-lib'; the exact
   * set of characters it accepts is defined there.
   */
  export var domainNameCharRegex = alphaNumericAndMarksRe;
  /**
   * The set of characters that will start a URL suffix (i.e. the path, query,
   * and hash part of the URL): '/', '?' and '#'.
   *
   * The expression is unanchored, so it matches if any one of these characters
   * appears anywhere in the tested string.
   */
  export var urlSuffixStartCharsRe = /[\/?#]/;
  /**
   * The set of special (non-alphanumeric) characters that are allowed in the
   * URL suffix (i.e. the path, query, and hash part of the URL) and which may
   * also form the ending character of the URL.
   *
   * The characters in {@link #urlSuffixNotAllowedAsLastCharRe} are additional
   * allowed URL suffix characters, but (generally) should not be the last
   * character of a URL.
   */
  export var urlSuffixAllowedSpecialCharsRe = /[-+&@#/%=~_()|'$*\[\]{}\u2713]/;
  /**
   * URL suffix characters (i.e. path, query, and hash part of the URL) that are
   * not allowed as the *last character* in the URL suffix as they would normally
   * form the end of a sentence.
   *
   * The {@link #urlSuffixAllowedSpecialCharsRe} contains additional allowed URL
   * suffix characters which are allowed as the last character.
   */
  export var urlSuffixNotAllowedAsLastCharRe = /[?!:,.;^]/;
  /**
   * Regular expression to match an http:// or https:// scheme
   * (case-insensitive, anywhere in the string).
   */
  export var httpSchemeRe = /https?:\/\//i;
  /**
   * Regular expression to match an http:// or https:// scheme as the prefix of
   * a string. Built from {@link #httpSchemeRe} with a leading '^' anchor.
   */
  export var httpSchemePrefixRe = new RegExp('^' + httpSchemeRe.source, 'i');
  /**
   * Regular expression that matches when the *final* character of a string is
   * one of the characters in {@link #urlSuffixNotAllowedAsLastCharRe}. Built
   * from that expression with a trailing '$' anchor.
   */
  export var urlSuffixedCharsNotAllowedAtEndRe = new RegExp(urlSuffixNotAllowedAsLastCharRe.source + '$');
  /**
   * A regular expression used to determine the schemes we should not autolink.
   *
   * Matches (case-insensitively) a string that begins with 'javascript:' or
   * 'vbscript:'.
   */
  export var invalidSchemeRe = /^(javascript|vbscript):/i;
  // A regular expression used to determine if the URL is a scheme match (such as
  // 'http://google.com', and as opposed to a "TLD match"). This regular
  // expression is used to parse out the host along with if the URL has an
  // authority component (i.e. '//')
  //
  // Capturing groups:
  //    1. '//' if the URL has an authority component, `undefined` otherwise
  //       (the group is optional and does not participate in the match)
  //    2. The host (if one exists). Ex: 'google.com'. This is every character
  //       after the scheme (and optional '//') up to the next ':' or '/', and
  //       may be the empty string.
  //
  // See https://www.rfc-editor.org/rfc/rfc3986#appendix-A for terminology
  export var schemeUrlRe = /^[A-Za-z][-.+A-Za-z0-9]*:(\/\/)?([^:/]*)/;
  // A regular expression used to determine if the URL is a TLD match (such as
  // 'google.com', and as opposed to a "scheme match"). This regular
  // expression is used to help parse out the TLD (top-level domain) of the host.
  //
  // The URL may optionally be prefixed with protocol-relative '//' chars.
  //
  // Capturing groups:
  //    1. The host: every character up to the first '/', '#', '?' or ':'
  //
  // See https://www.rfc-editor.org/rfc/rfc3986#appendix-A for terminology
  export var tldUrlHostRe = /^(?:\/\/)?([^/#?:]+)/;
  /**
   * Determines if the given character may start a scheme (ex: 'http').
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` matches `letterRe` (imported from
   *   '../regex-lib'), `false` otherwise.
   */
  export function isSchemeStartChar(char) {
      return letterRe.test(char);
  }
  /**
   * Determines if the given character is a valid character in a scheme (such as
   * 'http' or 'ssh+git'), but only after the start char (which is handled by
   * {@link isSchemeStartChar}).
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` matches `letterRe` or `digitRe`, or is
   *   one of '+', '-' or '.'; `false` otherwise.
   */
  export function isSchemeChar(char) {
      return (letterRe.test(char) ||
          digitRe.test(char) ||
          char === '+' ||
          char === '-' ||
          char === '.');
  }
  /**
   * Determines if the character can begin a domain label, which must be an
   * alphanumeric character and not an underscore or dash.
   *
   * A domain label is a segment of a hostname such as subdomain.google.com.
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` matches `alphaNumericAndMarksRe`
   *   (imported from '../regex-lib'), `false` otherwise.
   */
  export function isDomainLabelStartChar(char) {
      return alphaNumericAndMarksRe.test(char);
  }
  /**
   * Determines if the character may appear inside a domain label after its
   * first character. This is any character accepted by
   * {@link isDomainLabelStartChar}, plus the underscore ('_').
   *
   * Note that the dash ('-') is not accepted by this function.
   *
   * A domain label is a segment of a hostname such as subdomain.google.com.
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` is '_' or a domain label start
   *   character, `false` otherwise.
   */
  export function isDomainLabelChar(char) {
      return char === '_' || isDomainLabelStartChar(char);
  }
  /**
   * Determines if the character is a path character ("pchar") as defined by
   * https://tools.ietf.org/html/rfc3986#appendix-A
   *
   *     pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
   *
   *     unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
   *     pct-encoded   = "%" HEXDIG HEXDIG
   *     sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
   *                   / "*" / "+" / "," / ";" / "="
   *
   * Note that this implementation doesn't follow the spec exactly, but rather
   * follows URL path characters found out in the wild (spec might be out of
   * date?)
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` matches any of
   *   `alphaNumericAndMarksRe`, `urlSuffixAllowedSpecialCharsRe` or
   *   `urlSuffixNotAllowedAsLastCharRe`; `false` otherwise.
   */
  export function isPathChar(char) {
      return (alphaNumericAndMarksRe.test(char) ||
          urlSuffixAllowedSpecialCharsRe.test(char) ||
          urlSuffixNotAllowedAsLastCharRe.test(char));
  }
  /**
   * Determines if the character given may begin the "URL Suffix" section of a
   * URI (i.e. the path, query, or hash section). These are the '/', '?' and '#'
   * characters.
   *
   * See https://tools.ietf.org/html/rfc3986#appendix-A
   *
   * @param {string} char The single character to check.
   * @return {boolean} `true` if `char` matches `urlSuffixStartCharsRe`,
   *   `false` otherwise.
   */
  export function isUrlSuffixStartChar(char) {
      return urlSuffixStartCharsRe.test(char);
  }
  /**
   * Determines if the TLD read in the host is a known TLD (Top-Level Domain).
   *
   * Example: 'com' would be a known TLD (for a host of 'google.com'), but
   * 'local' would not (for a domain name of 'my-computer.local').
   *
   * @param {string} tld The candidate top-level domain, in any letter case.
   * @return {boolean} `true` if the lowercased `tld` matches `tldRegex`
   *   (imported from './tld-regex'), `false` otherwise.
   */
  export function isKnownTld(tld) {
      // The TLD is lowercased before being tested against the regex
      var lowerCasedTld = tld.toLowerCase();
      return tldRegex.test(lowerCasedTld);
  }
  /**
   * Determines if the given `url` is a valid scheme-prefixed URL.
   *
   * The checks are, in order:
   *
   * 1. 'javascript:' and 'vbscript:' URLs are always rejected.
   * 2. The URL must begin with a scheme followed by ':'.
   * 3. If the scheme is followed by an authority ('//'), the URL is accepted.
   * 4. Otherwise the text after the scheme must look like a hostname: it must
   *    contain at least one '.' and at least one letter.
   *
   * @param {string} url The candidate URL, beginning with its scheme.
   * @return {boolean} `true` if the URL should be treated as a valid scheme
   *   URL, `false` otherwise.
   */
  export function isValidSchemeUrl(url) {
      // If the scheme is 'javascript:' or 'vbscript:', these link
      // types can be dangerous. Don't link them.
      if (invalidSchemeRe.test(url)) {
          return false;
      }
      var schemeMatch = url.match(schemeUrlRe);
      if (!schemeMatch) {
          return false;
      }
      var isAuthorityMatch = !!schemeMatch[1];
      if (isAuthorityMatch) {
          // Any match that has an authority ('//' chars) after the scheme is
          // valid, such as 'http://anything'
          return true;
      }
      // If there's no authority ('//' chars), check that we have a hostname
      // that looks valid.
      //
      // The host group of the regex only stops at ':' or '/', so it may still
      // carry a query or hash (ex: 'something?v=1.0' for 'git:something?v=1.0').
      // Cut those off so that a '.' or a letter inside the query/hash cannot
      // make a non-host look like a host.
      var host = schemeMatch[2].split(/[?#]/, 1)[0];
      // The host must contain at least one '.' char and have a domain label
      // with at least one letter to be considered valid.
      //
      // Accept:
      //   - git:domain.com (scheme followed by a host)
      // Do not accept:
      //   - git:something ('something' doesn't look like a host)
      //   - version:1.0   ('1.0' doesn't look like a host)
      return host.indexOf('.') !== -1 && letterRe.test(host);
  }
  /**
   * Determines if the given `url` is a match with a valid TLD.
   *
   * The host is extracted from the beginning of `url` (after an optional
   * protocol-relative '//'), split into labels on '.', and its last label is
   * checked with {@link isKnownTld}.
   *
   * @param {string} url The candidate URL, ex: 'google.com/path'.
   * @return {boolean} `true` if the host has at least two labels and its last
   *   label is a known TLD, `false` otherwise.
   */
  export function isValidTldMatch(url) {
      // TLD URL such as 'google.com', we need to confirm that we have a valid
      // top-level domain
      var tldUrlHostMatch = url.match(tldUrlHostRe);
      if (!tldUrlHostMatch) {
          // At this point, if the URL didn't match our TLD re, it must be invalid
          // (highly unlikely to happen, but just in case)
          return false;
      }
      // Capturing group 1 is the host itself, without any leading '//'
      var host = tldUrlHostMatch[1];
      var hostLabels = host.split('.');
      if (hostLabels.length < 2) {
          // 0 or 1 host label, there's no TLD. Ex: 'localhost'
          return false;
      }
      var tld = hostLabels[hostLabels.length - 1];
      // TODO: Implement these conditions for TLD matcher:
      // (
      //     this.longestDomainLabelLength <= 63 &&
      //     this.domainNameLength <= 255
      // );
      return isKnownTld(tld);
  }
  // Regular expression to confirm a valid IPv4 address (ex: '192.168.0.1'):
  // exactly four dot-separated octets, each in the range 0-255, spanning the
  // whole string.
  var ipV4Re = /^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/;
  // Regular expression used to split the IPv4 address itself from any
  // port/path/query/hash
  var ipV4PartRe = /[:/?#]/;
  /**
   * Determines if the given URL is a valid IPv4-prefixed URL.
   *
   * Only the part of `url` before the first ':', '/', '?' or '#' is checked, so
   * any port, path, query or hash that follows the address is ignored.
   *
   * @param {string} url The candidate URL, ex: '192.168.0.1:8080/path'.
   * @return {boolean} `true` if the leading part of `url` is a valid IPv4
   *   address, `false` otherwise.
   */
  export function isValidIpV4Address(url) {
      // Grab just the IP address (only the first split result is needed)
      var ipV4Part = url.split(ipV4PartRe, 1)[0];
      return ipV4Re.test(ipV4Part);
  }
  //# sourceMappingURL=uri-utils.js.map