From cbc2cee2c84a87cc4695785181d042519ed140fe Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sat, 5 Oct 2019 13:00:09 +0200
Subject: [PATCH 1/2] Improve URL_PATTERN and handling in tokenizer (#4374)

* Move prefix and suffix detection for URL_PATTERN

Move prefix and suffix detection for `URL_PATTERN` into the tokenizer.
Remove associated lookahead and lookbehind from `URL_PATTERN`.

Fix tokenization for Hungarian given new modified handling of prefixes
and suffixes.

* Match a wider range of URI schemes
---
 spacy/lang/hu/punctuation.py       |  6 +++---
 spacy/lang/tokenizer_exceptions.py | 14 ++++----------
 spacy/tests/tokenizer/test_urls.py |  6 +++++-
 spacy/tokenizer.pyx                |  6 +++---
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index 4198dcd88..bc043486f 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
-    [r"\+"]
-    + LIST_PUNCT
+    LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 4d5ff4423..57771cca4 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 # A few minor mods to this regex to account for use cases represented in test_urls
 URL_PATTERN = (
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[\w])"
-    # protocol identifier
-    r"(?:(?:https?|ftp|mailto)://)?"
-    # user:pass authentication
+    # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
+    r"(?:(?:[\w\+\-\.]{2,})://)?"
+    # mailto:user or user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
@@ -43,11 +41,7 @@ URL_PATTERN = (
     # port number
     r"(?::\d{2,5})?"
     # resource path
-    r"(?:/\S*)?"
-    # query parameters
-    r"\??(:?\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[\w/])"
+    r"(?:[/?#]\S*)?"
     r"$"
 ).strip()
 
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 59c2b3204..bf59ae4d7 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -12,6 +12,7 @@ URLS_BASIC = [
 
 URLS_FULL = URLS_BASIC + [
     "mailto:foo-bar@baz-co.com",
+    "mailto:foo-bar@baz-co.com?subject=hi",
     "www.google.com?q=google",
     "http://foo.com/blah_(wikipedia)#cite-1",
 ]
@@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
     "http://a.b-c.de",
     "http://223.255.255.254",
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
+    "ssh://login@server.com:12345/repository.git",
+    "svn+ssh://user@ssh.yourdomain.com/path",
+    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
     "http:// shouldfail.com",
     ":// should fail",
     "http://foo.bar/foo(bar)baz quux",
-    "ftps://foo.bar/",
     "http://-error-.invalid/",
     "http://a.b-.co",
     "http://0.0.0.0",
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 81a62d28a..cdfa55dcb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -227,7 +227,9 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string):
+            if self.token_match and self.token_match(string) \
+                    and not self.find_prefix(string) \
+                    and not self.find_suffix(string):
                 break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
@@ -243,8 +245,6 @@
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     has_special[0] = 1
                     break
-            if self.token_match and self.token_match(string):
-                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]

From 573e543e4aadc83e30a1f4069f3624899945e66e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 6 Oct 2019 13:30:01 +0200
Subject: [PATCH 2/2] Alphanumeric -> alphabetic [ci skip]

see ines/spacy-course#38
---
 spacy/lexeme.pyx                          | 2 +-
 spacy/matcher/_schemas.py                 | 2 +-
 website/docs/usage/rule-based-matching.md | 2 +-
 website/docs/usage/spacy-101.md           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5b88e8fcc..5c981bc25 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -375,7 +375,7 @@ cdef class Lexeme:
             Lexeme.c_set_flag(self.c, IS_STOP, x)
 
     property is_alpha:
-        """RETURNS (bool): Whether the lexeme consists of alphanumeric
+        """RETURNS (bool): Whether the lexeme consists of alphabetic
             characters. Equivalent to `lexeme.text.isalpha()`.
         """
         def __get__(self):
diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
index 471e2b7b5..1b10f0dd5 100644
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -111,7 +111,7 @@ TOKEN_PATTERN_SCHEMA = {
             "$ref": "#/definitions/integer_value",
         },
         "IS_ALPHA": {
-            "title": "Token consists of alphanumeric characters",
+            "title": "Token consists of alphabetic characters",
             "$ref": "#/definitions/boolean_value",
         },
         "IS_ASCII": {
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 9c3a43f1d..fe8e4e2d2 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -163,7 +163,7 @@ rule-based matching are:
 | `TEXT` 2.1 | unicode | The exact verbatim text of a token. |
 | `LOWER` | unicode | The lowercase form of the token text. |
 |  `LENGTH` | int | The length of the token text. |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphanumeric characters, ASCII characters, digits. |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
 |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 379535cf4..da56f2397 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -573,7 +573,7 @@ apple = doc[0]
 print("Fine-grained POS tag", apple.pos_, apple.pos)
 print("Coarse-grained POS tag", apple.tag_, apple.tag)
 print("Word shape", apple.shape_, apple.shape)
-print("Alphanumeric characters?", apple.is_alpha)
+print("Alphabetic characters?", apple.is_alpha)
 print("Punctuation mark?", apple.is_punct)
 
 billion = doc[10]