parse: fix support for literal URLs

Related-to GH-479. Closes GH-478. Closes GH-481.
remarkjs · Mar 29, 2020 · 512cd7a · 512cd7a
1 parent 7be53be
commit 512cd7a
Show file tree

Hide file tree

Showing 12 changed files with 5,097 additions and 420 deletions.
diff --git a/packages/remark-parse/lib/locate/url.js b/packages/remark-parse/lib/locate/url.js
@@ -2,22 +2,25 @@
 
 module.exports = locate
 
-var protocols = ['https://', 'http://', 'mailto:']
+var values = ['www.', 'http://', 'https://']
 
 function locate(value, fromIndex) {
-  var length = protocols.length
-  var index = -1
   var min = -1
+  var index
+  var length
   var position
 
   if (!this.options.gfm) {
-    return -1
+    return min
   }
 
+  length = values.length
+  index = -1
+
   while (++index < length) {
-    position = value.indexOf(protocols[index], fromIndex)
+    position = value.indexOf(values[index], fromIndex)
 
-    if (position !== -1 && (position < min || min === -1)) {
+    if (position !== -1 && (min === -1 || position < min)) {
       min = position
     }
   }

diff --git a/packages/remark-parse/lib/tokenize/url.js b/packages/remark-parse/lib/tokenize/url.js
@@ -1,153 +1,210 @@
 'use strict'
 
+var ccount = require('ccount')
 var decode = require('parse-entities')
+var decimal = require('is-decimal')
+var alphabetical = require('is-alphabetical')
 var whitespace = require('is-whitespace-character')
 var locate = require('../locate/url')
 
 module.exports = url
 url.locator = locate
 url.notInLink = true
 
-var quotationMark = '"'
-var apostrophe = "'"
-var leftParenthesis = '('
-var rightParenthesis = ')'
-var comma = ','
-var dot = '.'
-var colon = ':'
-var semicolon = ';'
-var lessThan = '<'
-var atSign = '@'
-var leftSquareBracket = '['
-var rightSquareBracket = ']'
-
-var http = 'http://'
-var https = 'https://'
-var mailto = 'mailto:'
-
-var protocols = [http, https, mailto]
-
-var protocolsLength = protocols.length
+var exclamationMark = 33 // '!'
+var ampersand = 38 // '&'
+var rightParenthesis = 41 // ')'
+var asterisk = 42 // '*'
+var comma = 44 // ','
+var dash = 45 // '-'
+var dot = 46 // '.'
+var colon = 58 // ':'
+var semicolon = 59 // ';'
+var questionMark = 63 // '?'
+var lessThan = 60 // '<'
+var underscore = 95 // '_'
+var tilde = 126 // '~'
+
+var leftParenthesisCharacter = '('
+var rightParenthesisCharacter = ')'
 
 function url(eat, value, silent) {
   var self = this
-  var subvalue
-  var content
-  var character
+  var gfm = self.options.gfm
+  var tokenizers = self.inlineTokenizers
+  var length = value.length
+  var previousDot = -1
+  var protocolless = false
+  var dots
+  var lastTwoPartsStart
+  var start
   var index
-  var position
-  var protocol
-  var match
-  var length
-  var queue
-  var parenCount
-  var nextCharacter
-  var tokenizers
+  var pathStart
+  var path
+  var code
+  var end
+  var leftCount
+  var rightCount
+  var content
+  var children
+  var url
   var exit
 
-  if (!self.options.gfm) {
+  if (!gfm) {
+    return
+  }
+
+  // `WWW.` doesn’t work.
+  if (value.slice(0, 4) === 'www.') {
+    protocolless = true
+    index = 4
+  } else if (value.slice(0, 7).toLowerCase() === 'http://') {
+    index = 7
+  } else if (value.slice(0, 8).toLowerCase() === 'https://') {
+    index = 8
+  } else {
     return
   }
 
-  subvalue = ''
-  index = -1
+  // Act as if the starting boundary is a dot.
+  previousDot = index - 1
 
-  while (++index < protocolsLength) {
-    protocol = protocols[index]
-    match = value.slice(0, protocol.length)
+  // Parse a valid domain.
+  start = index
+  dots = []
 
-    if (match.toLowerCase() === protocol) {
-      subvalue = match
-      break
+  while (index < length) {
+    code = value.charCodeAt(index)
+
+    if (code === dot) {
+      // Dots may not appear after each other.
+      if (previousDot === index - 1) {
+        break
+      }
+
+      dots.push(index)
+      previousDot = index
+      index++
+      continue
     }
+
+    if (
+      decimal(code) ||
+      alphabetical(code) ||
+      code === dash ||
+      code === underscore
+    ) {
+      index++
+      continue
+    }
+
+    break
+  }
+
+  // Ignore a final dot:
+  if (code === dot) {
+    dots.pop()
+    index--
   }
 
-  if (!subvalue) {
+  // If there are not dots, exit.
+  if (dots[0] === undefined) {
     return
   }
 
-  index = subvalue.length
-  length = value.length
-  queue = ''
-  parenCount = 0
+  // If there is an underscore in the last two domain parts, exit:
+  // `www.example.c_m` and `www.ex_ample.com` are not OK, but
+  // `www.sub_domain.example.com` is.
+  lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1
 
+  if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) {
+    return
+  }
+
+  /* istanbul ignore if - never used (yet) */
+  if (silent) {
+    return true
+  }
+
+  end = index
+  pathStart = index
+
+  // Parse a path.
   while (index < length) {
-    character = value.charAt(index)
+    code = value.charCodeAt(index)
 
-    if (whitespace(character) || character === lessThan) {
+    if (whitespace(code) || code === lessThan) {
       break
     }
 
+    index++
+
     if (
-      character === dot ||
-      character === comma ||
-      character === colon ||
-      character === semicolon ||
-      character === quotationMark ||
-      character === apostrophe ||
-      character === rightParenthesis ||
-      character === rightSquareBracket
+      code === exclamationMark ||
+      code === asterisk ||
+      code === comma ||
+      code === dot ||
+      code === colon ||
+      code === questionMark ||
+      code === underscore ||
+      code === tilde
     ) {
-      nextCharacter = value.charAt(index + 1)
-
-      if (!nextCharacter || whitespace(nextCharacter)) {
-        break
-      }
+      // Empty
+    } else {
+      end = index
     }
+  }
 
-    if (character === leftParenthesis || character === leftSquareBracket) {
-      parenCount++
-    }
+  index = end
 
-    if (character === rightParenthesis || character === rightSquareBracket) {
-      parenCount--
+  // If the path ends in a closing paren, and the count of closing parens is
+  // higher than the opening count, then remove the supefluous closing parens.
+  if (value.charCodeAt(index - 1) === rightParenthesis) {
+    path = value.slice(pathStart, index)
+    leftCount = ccount(path, leftParenthesisCharacter)
+    rightCount = ccount(path, rightParenthesisCharacter)
 
-      if (parenCount < 0) {
-        break
-      }
+    while (rightCount > leftCount) {
+      index = pathStart + path.lastIndexOf(rightParenthesisCharacter)
+      path = value.slice(pathStart, index)
+      rightCount--
     }
-
-    queue += character
-    index++
   }
 
-  if (!queue) {
-    return
-  }
+  if (value.charCodeAt(index - 1) === semicolon) {
+    // GitHub doesn’t document this, but final semicolons aren’t paret of the
+    // URL either.
+    index--
 
-  subvalue += queue
-  content = subvalue
+    // // If the path ends in what looks like an entity, it’s not part of the path.
+    if (alphabetical(value.charCodeAt(index - 1))) {
+      end = index - 2
 
-  if (protocol === mailto) {
-    position = queue.indexOf(atSign)
+      while (alphabetical(value.charCodeAt(end))) {
+        end--
+      }
 
-    if (position === -1 || position === length - 1) {
-      return
+      if (value.charCodeAt(end) === ampersand) {
+        index = end
+      }
     }
-
-    content = content.slice(mailto.length)
   }
 
-  /* istanbul ignore if - never used (yet) */
-  if (silent) {
-    return true
+  content = value.slice(0, index)
+  url = decode(content, {nonTerminated: false})
+
+  if (protocolless) {
+    url = 'http://' + url
   }
 
   exit = self.enterLink()
 
   // Temporarily remove all tokenizers except text in url.
-  tokenizers = self.inlineTokenizers
   self.inlineTokenizers = {text: tokenizers.text}
-
-  content = self.tokenizeInline(content, eat.now())
-
+  children = self.tokenizeInline(content, eat.now())
   self.inlineTokenizers = tokenizers
+
   exit()
 
-  return eat(subvalue)({
-    type: 'link',
-    title: null,
-    url: decode(subvalue, {nonTerminated: false}),
-    children: content
-  })
+  return eat(content)({type: 'link', title: null, url: url, children: children})
 }
diff --git a/packages/remark-parse/package.json b/packages/remark-parse/package.json
@@ -37,6 +37,7 @@
     "types/index.d.ts"
   ],
   "dependencies": {
+    "ccount": "^1.0.0",
     "collapse-white-space": "^1.0.2",
     "is-alphabetical": "^1.0.0",
     "is-decimal": "^1.0.0",