From cbc2cee2c84a87cc4695785181d042519ed140fe Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sat, 5 Oct 2019 13:00:09 +0200
Subject: [PATCH 1/2] Improve URL_PATTERN and handling in tokenizer (#4374)

* Move prefix and suffix detection for URL_PATTERN

Move prefix and suffix detection for `URL_PATTERN` into the tokenizer.
Remove associated lookahead and lookbehind from `URL_PATTERN`.

Fix tokenization for Hungarian given new modified handling of prefixes
and suffixes.

* Match a wider range of URI schemes
---
 spacy/lang/hu/punctuation.py       |  6 +++---
 spacy/lang/tokenizer_exceptions.py | 14 ++++----------
 spacy/tests/tokenizer/test_urls.py |  6 +++++-
 spacy/tokenizer.pyx                |  6 +++---
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index 4198dcd88..bc043486f 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -10,10 +10,10 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
-    [r"\+"]
-    + LIST_PUNCT
+    LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -29,7 +29,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 4d5ff4423..57771cca4 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -10,11 +10,9 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE
 # A few minor mods to this regex to account for use cases represented in test_urls
 URL_PATTERN = (
     r"^"
-    # in order to support the prefix tokenization (see prefix test cases in test_urls).
-    r"(?=[\w])"
-    # protocol identifier
-    r"(?:(?:https?|ftp|mailto)://)?"
-    # user:pass authentication
+    # protocol identifier (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
+    r"(?:(?:[\w\+\-\.]{2,})://)?"
+    # mailto:user or user:pass authentication
     r"(?:\S+(?::\S*)?@)?"
     r"(?:"
     # IP address exclusion
@@ -43,11 +41,7 @@ URL_PATTERN = (
     # port number
     r"(?::\d{2,5})?"
     # resource path
-    r"(?:/\S*)?"
-    # query parameters
-    r"\??(:?\S*)?"
-    # in order to support the suffix tokenization (see suffix test cases in test_urls),
-    r"(?<=[\w/])"
+    r"(?:[/?#]\S*)?"
     r"$"
 ).strip()
 
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 59c2b3204..bf59ae4d7 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -12,6 +12,7 @@ URLS_BASIC = [
 
 URLS_FULL = URLS_BASIC + [
     "mailto:foo-bar@baz-co.com",
+    "mailto:foo-bar@baz-co.com?subject=hi",
     "www.google.com?q=google",
     "http://foo.com/blah_(wikipedia)#cite-1",
 ]
@@ -45,6 +46,10 @@ URLS_SHOULD_MATCH = [
     "http://a.b-c.de",
     "http://223.255.255.254",
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
+    "ssh://login@server.com:12345/repository.git",
+    "svn+ssh://user@ssh.yourdomain.com/path",
+    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -81,7 +86,6 @@ URLS_SHOULD_NOT_MATCH = [
     "http:// shouldfail.com",
     ":// should fail",
     "http://foo.bar/foo(bar)baz quux",
-    "ftps://foo.bar/",
     "http://-error-.invalid/",
     "http://a.b-.co",
     "http://0.0.0.0",
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 81a62d28a..cdfa55dcb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -227,7 +227,9 @@ cdef class Tokenizer:
         cdef unicode minus_suf
         cdef size_t last_size = 0
         while string and len(string) != last_size:
-            if self.token_match and self.token_match(string):
+            if self.token_match and self.token_match(string) \
+                    and not self.find_prefix(string) \
+                    and not self.find_suffix(string):
                 break
             if self._specials.get(hash_string(string)) != NULL:
                 has_special[0] = 1
@@ -243,8 +245,6 @@
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     has_special[0] = 1
                     break
-            if self.token_match and self.token_match(string):
-                break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
                 suffix = string[-suf_len:]

From 573e543e4aadc83e30a1f4069f3624899945e66e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 6 Oct 2019 13:30:01 +0200
Subject: [PATCH 2/2] Alphanumeric -> alphabetic [ci skip]

see ines/spacy-course#38
---
 spacy/lexeme.pyx                          | 2 +-
 spacy/matcher/_schemas.py                 | 2 +-
 website/docs/usage/rule-based-matching.md | 2 +-
 website/docs/usage/spacy-101.md           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5b88e8fcc..5c981bc25 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -375,7 +375,7 @@ cdef class Lexeme:
             Lexeme.c_set_flag(self.c, IS_STOP, x)
 
     property is_alpha:
-        """RETURNS (bool): Whether the lexeme consists of alphanumeric
+        """RETURNS (bool): Whether the lexeme consists of alphabetic
             characters. Equivalent to `lexeme.text.isalpha()`.
         """
         def __get__(self):
diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
index 471e2b7b5..1b10f0dd5 100644
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -111,7 +111,7 @@ TOKEN_PATTERN_SCHEMA = {
             "$ref": "#/definitions/integer_value",
         },
         "IS_ALPHA": {
-            "title": "Token consists of alphanumeric characters",
+            "title": "Token consists of alphabetic characters",
             "$ref": "#/definitions/boolean_value",
         },
         "IS_ASCII": {
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 9c3a43f1d..fe8e4e2d2 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -163,7 +163,7 @@ rule-based matching are:
 | `TEXT` 2.1 | unicode | The exact verbatim text of a token. |
 | `LOWER` | unicode | The lowercase form of the token text. |
 |  `LENGTH` | int | The length of the token text. |
-|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphanumeric characters, ASCII characters, digits. |
+|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
 |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 379535cf4..da56f2397 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -573,7 +573,7 @@ apple = doc[0]
 print("Fine-grained POS tag", apple.pos_, apple.pos)
 print("Coarse-grained POS tag", apple.tag_, apple.tag)
 print("Word shape", apple.shape_, apple.shape)
-print("Alphanumeric characters?", apple.is_alpha)
+print("Alphabetic characters?", apple.is_alpha)
 print("Punctuation mark?", apple.is_punct)
 
 billion = doc[10]