From 1139247532d42ccc16e2e1c548924d83d7615637 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 12:09:41 +0100 Subject: [PATCH 1/4] Revert changes to token_match priority from #4374 * Revert changes to priority of `token_match` so that it has priority over all other tokenizer patterns * Add lookahead and potentially slow lookbehind back to the default URL pattern * Expand character classes in URL pattern to improve matching around lookaheads and lookbehinds related to #4882 * Revert changes to Hungarian tokenizer * Revert (xfail) several URL tests to their status before #4374 * Update `tokenizer.explain()` and docs accordingly --- spacy/lang/hu/punctuation.py | 6 +++--- spacy/lang/tokenizer_exceptions.py | 6 +++++- spacy/tests/tokenizer/test_urls.py | 8 ++++++-- spacy/tokenizer.pyx | 14 +++++++++---- website/docs/usage/linguistic-features.md | 24 +++++++++++++---------- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 2c0fc9cf7..42dbc7bac 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -13,6 +13,8 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" + # in order to support the prefix tokenization (see prefix test cases in test_urls). + r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -54,6 +56,8 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
+ # in order to support the suffix tokenization (see suffix test cases in test_urls), + r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..2d82e213c 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,8 +56,12 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - "http://foo.com/blah_blah_(wikipedia)", - "http://foo.com/blah_blah_(wikipedia)_(again)", + pytest.param( + "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() + ), + pytest.param( + "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() + ), "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4da081259..6f7e44061 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -239,6 +239,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -455,6 +457,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -475,12 +481,12 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 685619c88..60a6699a9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -740,6 +740,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -754,12 +758,12 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -780,14 +784,14 @@ def tokenizer_pseudo_code(self, special_cases, 
prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we +2. Look for a token match. If there is a match, stop processing and keep this token. +3. Check whether we have an explicitly defined rule for this substring. If we do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, + so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. +6. If we can't consume a prefix or a suffix, look for a special case. 7. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. 8. Once we can't consume any more of the string, handle it as a single token. From 0c31f03ec5525cd33224a880b6d678c69019727d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 9 Mar 2020 13:41:01 +0100 Subject: [PATCH 2/4] Update docs [ci skip] --- website/docs/usage/linguistic-features.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 60a6699a9..0ceae4c4f 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -838,8 +838,6 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its From 565e0eef73fab8c394339239cc48e4a83e068dfd Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 5 May 2020 10:35:33 +0200 Subject: [PATCH 3/4] Add tokenizer option for token match with affixes To fix the slow tokenizer URL (#4374) and allow `token_match` to take priority over prefixes and suffixes by default, introduce a new tokenizer option for a token match pattern that's applied after prefixes and suffixes but before infixes. 
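As a rough sketch of how the option described above could be used once this patch is applied (the digit pattern, the sample text and the reuse of `URL_PATTERN` are illustrative assumptions, not part of the patch):

    import re
    from spacy.lang.en import English
    from spacy.lang.tokenizer_exceptions import URL_PATTERN

    nlp = English()
    # token_match keeps priority over prefixes, suffixes and infixes
    # (hypothetical pattern for number ranges like "11-22"):
    nlp.tokenizer.token_match = re.compile(r"^\d+-\d+$").match
    # token_match_with_affixes is only consulted after prefixes and
    # suffixes have been split off, so the URL pattern no longer needs
    # lookarounds to cope with surrounding punctuation:
    nlp.tokenizer.token_match_with_affixes = re.compile("(?u)" + URL_PATTERN).match

    doc = nlp('See "https://example.com/about" and pages 11-22.')
    print([t.text for t in doc])
    # expected, roughly: ['See', '"', 'https://example.com/about', '"',
    #                     'and', 'pages', '11-22', '.']

Here the URL pattern only has to match the bare string left after the quotes are removed, which is what lets it drop the slow lookbehind reintroduced in the first patch of this series. The same sketch works after the final patch by writing `url_match` instead of `token_match_with_affixes`.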
--- spacy/lang/fr/tokenizer_exceptions.py | 4 --- spacy/lang/hu/tokenizer_exceptions.py | 3 +-- spacy/lang/tokenizer_exceptions.py | 7 ++--- spacy/language.py | 5 +++- spacy/tests/tokenizer/test_urls.py | 12 +++------ spacy/tokenizer.pxd | 1 + spacy/tokenizer.pyx | 37 +++++++++++++++++++++++---- website/docs/api/tokenizer.md | 3 ++- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index cb1702300..465626d39 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY -from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index f1eabd9aa..6a9a5363f 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -13,8 +13,6 @@ from ..symbols import ORTH, POS, TAG, LEMMA, SPACE URL_PATTERN = ( # fmt: off r"^" - # in order to support the prefix tokenization (see prefix test cases in test_urls). - r"(?=[" + ALPHA + "\w])" # protocol identifier (mods: make optional and expand schemes) # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml) r"(?:(?:[\w\+\-\.]{2,})://)?" @@ -56,13 +54,12 @@ URL_PATTERN = ( r"(?::\d{2,5})?" # resource path r"(?:[/?#]\S*)?" 
- # in order to support the suffix tokenization (see suffix test cases in test_urls), - r"(?<=[" + ALPHA + "\w/])" r"$" # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index e89f80f08..d4f6c78ec 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -31,7 +31,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop @@ -86,6 +86,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + token_match_with_affixes = cls.token_match_with_affixes prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -103,10 +104,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + token_match_with_affixes=token_match_with_affixes, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2d82e213c..2f76111e5 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -56,12 +56,8 @@ URLS_SHOULD_MATCH = [ pytest.param( "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail() ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail() - ), - pytest.param( - "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() - ), + "http://foo.com/blah_blah_(wikipedia)", + "http://foo.com/blah_blah_(wikipedia)_(again)", "http://www.foo.co.uk", "http://www.foo.co.uk/", "http://www.foo.co.uk/blah/blah", @@ -126,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.token_match_with_affixes(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.token_match_with_affixes(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..70d49bb39 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _token_match_with_affixes cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 16a2cf27b..cf0421158 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + 
suffix_search=None, infix_finditer=None, token_match=None, + token_match_with_affixes=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. + token_match_with_affixes (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.token_match_with_affixes = token_match_with_affixes self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property token_match_with_affixes: + def __get__(self): + return self._token_match_with_affixes + + def __set__(self, token_match_with_affixes): + self._token_match_with_affixes = token_match_with_affixes + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.token_match_with_affixes) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -297,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.token_match_with_affixes and \ + self.token_match_with_affixes(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -450,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + token_match_with_affixes = self.token_match_with_affixes + if token_match_with_affixes is None: + token_match_with_affixes = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -485,6 +505,9 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif token_match_with_affixes(substring): + tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -549,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -570,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -585,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): + self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..f73e851f7 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -41,7 +41,8 @@ the | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. 
| -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} From e4a1b5dab1f2de60fa0ddbb3e80282b0749635da Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 22 May 2020 12:41:03 +0200 Subject: [PATCH 4/4] Rename to url_match Rename to `url_match` and update docs. --- spacy/lang/tokenizer_exceptions.py | 2 +- spacy/language.py | 8 ++--- spacy/tests/tokenizer/test_urls.py | 4 +-- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 40 +++++++++++------------ website/docs/api/tokenizer.md | 16 ++++----- website/docs/usage/linguistic-features.md | 23 ++++++++----- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 6a9a5363f..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -59,7 +59,7 @@ URL_PATTERN = ( ).strip() TOKEN_MATCH = None -TOKEN_MATCH_WITH_AFFIXES = re.compile("(?u)" + URL_PATTERN).match +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index 2c7f4e2b5..53a788f2a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH, TOKEN_MATCH_WITH_AFFIXES +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,7 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match - token_match_with_affixes = cls.token_match_with_affixes + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -107,12 +107,12 @@ class BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, - token_match_with_affixes=token_match_with_affixes, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH - token_match_with_affixes = TOKEN_MATCH_WITH_AFFIXES + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 2f76111e5..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match_with_affixes(url) is None + assert en_tokenizer.url_match(url) is None 
@pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 70d49bb39..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,7 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match - cdef object _token_match_with_affixes + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index cf0421158..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -31,7 +31,7 @@ cdef class Tokenizer: """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None, - token_match_with_affixes=None): + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -44,7 +44,7 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. - token_match_with_affixes (callable): A boolean function matching strings to be + url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. @@ -58,7 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match - self.token_match_with_affixes = token_match_with_affixes + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -74,12 +74,12 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() - property token_match_with_affixes: + property url_match: def __get__(self): - return self._token_match_with_affixes + return self._url_match - def __set__(self, token_match_with_affixes): - self._token_match_with_affixes = token_match_with_affixes + def __set__(self, url_match): + self._url_match = url_match self._flush_cache() property prefix_search: @@ -125,7 +125,7 @@ cdef class Tokenizer: self.suffix_search, self.infix_finditer, self.token_match, - self.token_match_with_affixes) + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -311,8 +311,8 @@ cdef class Tokenizer: if cache_hit: pass elif (self.token_match and self.token_match(string)) or \ - (self.token_match_with_affixes and \ - self.token_match_with_affixes(string)): + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. 
# See Issue #859 @@ -467,9 +467,9 @@ cdef class Tokenizer: token_match = self.token_match if token_match is None: token_match = re.compile("a^").match - token_match_with_affixes = self.token_match_with_affixes - if token_match_with_affixes is None: - token_match_with_affixes = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -505,8 +505,8 @@ cdef class Tokenizer: if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' - elif token_match_with_affixes(substring): - tokens.append(("TOKEN_MATCH_WITH_AFFIXES", substring)) + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) substring = '' elif substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) @@ -572,7 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("token_match_with_affixes", lambda: _get_regex_pattern(self.token_match_with_affixes)), + ("url_match", lambda: _get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -594,12 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), - ("token_match_with_affixes", lambda b: data.setdefault("token_match_with_affixes", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "token_match_with_affixes"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -610,8 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match - if "token_match_with_affixes" in data and isinstance(data["token_match_with_affixes"], basestring_): - self.token_match_with_affixes = re.compile(data["token_match_with_affixes"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index f73e851f7..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,15 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | 
----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `token_match_with_affixes` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 91ca1267b..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -759,6 +759,9 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' elif substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -782,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Look for a token match. If there is a match, stop processing and keep this token. -3. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that the token match and special cases always get priority. +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. 5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -6. If we can't consume a prefix or a suffix, look for a special case. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. 
If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -836,6 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its
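A minimal sketch of the renamed option in use (the custom pattern and sample text are assumptions for illustration; only the `url_match` attribute and its position in the algorithm come from this patch):

    import re
    from spacy.lang.en import English

    nlp = English()
    # Hypothetical stricter pattern: only keep http(s) URLs unsplit.
    # Like the default URL pattern, it is applied only after prefixes
    # and suffixes have been removed from the substring.
    nlp.tokenizer.url_match = re.compile(r"^https?://\S+$").match

    doc = nlp('Docs live at "https://spacy.io/api/tokenizer".')
    print([t.text for t in doc])
    # expected, roughly: ['Docs', 'live', 'at', '"',
    #                     'https://spacy.io/api/tokenizer', '"', '.']

Because `url_match` is consulted only after affix handling, it never has to account for surrounding punctuation itself, while `token_match` remains the hook for strings that must never be split at all.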