diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4eb4c1568..933607bdf 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .punctuation import ELISION, HYPHENS -from ..tokenizer_exceptions import URL_PATTERN from ..char_classes import ALPHA_LOWER, ALPHA from ...symbols import ORTH, LEMMA @@ -455,9 +454,6 @@ _regular_exp += [ for hc in _hyphen_combination ] -# URLs -_regular_exp.append(URL_PATTERN) - TOKENIZER_EXCEPTIONS = _exc TOKEN_MATCH = re.compile( diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..a010bb7ae 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -10,7 +10,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") -_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -21,7 +20,8 @@ _prefixes = ( ) _suffixes = ( - LIST_PUNCT + [r"\+"] + + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + [_concat_icons] @@ -29,7 +29,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=_units), + r"(?<=[0-9])(?:{u})".format(u=UNITS), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..d328baa22 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from ..punctuation import ALPHA_LOWER, CURRENCY -from ..tokenizer_exceptions import URL_PATTERN from ...symbols import ORTH @@ -649,4 +648,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format( TOKENIZER_EXCEPTIONS = _exc -TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match +TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 29ce75442..67349916b 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .char_classes import ALPHA_LOWER +from .char_classes import ALPHA_LOWER, ALPHA from ..symbols import ORTH, POS, TAG, LEMMA, SPACE @@ -58,7 +58,8 @@ URL_PATTERN = ( # fmt: on ).strip() -TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match +TOKEN_MATCH = None +URL_MATCH = re.compile("(?u)" + URL_PATTERN).match BASE_EXCEPTIONS = {} diff --git a/spacy/language.py b/spacy/language.py index d5bd879e9..53a788f2a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -28,7 +28,7 @@ from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG, NORM from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES -from .lang.tokenizer_exceptions import TOKEN_MATCH +from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH from .lang.norm_exceptions import BASE_NORMS from .lang.tag_map import TAG_MAP from .tokens import Doc @@ -89,6 +89,7 @@ class BaseDefaults(object): def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match + url_match = cls.url_match prefix_search = ( util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None ) @@ -106,10 +107,12 @@ class 
BaseDefaults(object): suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match, + url_match=url_match, ) pipe_names = ["tagger", "parser", "ner"] token_match = TOKEN_MATCH + url_match = URL_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 58e9d73f3..65ba93d66 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -122,12 +122,12 @@ SUFFIXES = ['"', ":", ">"] @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is not None + assert en_tokenizer.url_match(url) is not None @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): - assert en_tokenizer.token_match(url) is None + assert en_tokenizer.url_match(url) is None @pytest.mark.parametrize("url", URLS_BASIC) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index dadbad7bd..694ea49cc 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -17,6 +17,7 @@ cdef class Tokenizer: cpdef readonly Vocab vocab cdef object _token_match + cdef object _url_match cdef object _prefix_search cdef object _suffix_search cdef object _infix_finditer diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 69d6285e1..154a42c4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,8 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, - suffix_search=None, infix_finditer=None, token_match=None): + suffix_search=None, infix_finditer=None, token_match=None, + url_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -43,6 +44,8 @@ cdef class Tokenizer: `re.compile(string).finditer` to find infixes. token_match (callable): A boolean function matching strings to be recognised as tokens. + url_match (callable): A boolean function matching strings to be + recognised as tokens after considering prefixes and suffixes. RETURNS (Tokenizer): The newly constructed object. 
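For context, a minimal sketch of how the `url_match` argument described above can be passed to a custom `Tokenizer` with the v2.x API shown in this diff; the `simple_url` regex, the sample text, and the reuse of the default affix rules are illustrative assumptions rather than anything shipped in spaCy (the built-in defaults use `URL_MATCH` from `spacy.lang.tokenizer_exceptions`).

```python
import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# Illustrative stand-in for spaCy's much more thorough URL_PATTERN.
simple_url = re.compile(r"^https?://\S+$")

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=nlp.tokenizer.prefix_search,
    suffix_search=nlp.tokenizer.suffix_search,
    infix_finditer=nlp.tokenizer.infix_finditer,
    token_match=None,            # checked before prefixes/suffixes are split off
    url_match=simple_url.match,  # checked only after prefixes/suffixes are split off
)

print([t.text for t in nlp("(See https://example.com.)")])
# Expected under these assumptions: ['(', 'See', 'https://example.com', '.', ')']
```

The split between the two hooks is the point of the change: `token_match` short-circuits affix handling entirely, while `url_match` lets surrounding punctuation be stripped first, so a quoted or parenthesized URL still comes out as a single URL token.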
EXAMPLE: @@ -55,6 +58,7 @@ cdef class Tokenizer: self._cache = PreshMap() self._specials = PreshMap() self.token_match = token_match + self.url_match = url_match self.prefix_search = prefix_search self.suffix_search = suffix_search self.infix_finditer = infix_finditer @@ -70,6 +74,14 @@ cdef class Tokenizer: self._token_match = token_match self._flush_cache() + property url_match: + def __get__(self): + return self._url_match + + def __set__(self, url_match): + self._url_match = url_match + self._flush_cache() + property prefix_search: def __get__(self): return self._prefix_search @@ -108,11 +120,12 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, - self._rules, + self.rules, self.prefix_search, self.suffix_search, self.infix_finditer, - self.token_match) + self.token_match, + self.url_match) return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): @@ -240,6 +253,8 @@ cdef class Tokenizer: cdef unicode minus_suf cdef size_t last_size = 0 while string and len(string) != last_size: + if self.token_match and self.token_match(string): + break if self._specials.get(hash_string(string)) != NULL: has_special[0] = 1 break @@ -295,7 +310,9 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif (self.token_match and self.token_match(string)) or \ + (self.url_match and \ + self.url_match(string)): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. # See Issue #859 @@ -448,6 +465,11 @@ cdef class Tokenizer: suffix_search = self.suffix_search infix_finditer = self.infix_finditer token_match = self.token_match + if token_match is None: + token_match = re.compile("a^").match + url_match = self.url_match + if url_match is None: + url_match = re.compile("a^").match special_cases = {} for orth, special_tokens in self.rules.items(): special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens] @@ -456,6 +478,10 @@ cdef class Tokenizer: suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(("TOKEN_MATCH", substring)) + substring = '' + break if substring in special_cases: tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) substring = '' @@ -476,12 +502,15 @@ cdef class Tokenizer: break suffixes.append(("SUFFIX", substring[split:])) substring = substring[:split] - if substring in special_cases: - tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(("TOKEN_MATCH", substring)) substring = '' + elif url_match(substring): + tokens.append(("URL_MATCH", substring)) + substring = '' + elif substring in special_cases: + tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring])) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -543,6 +572,7 @@ cdef class Tokenizer: ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), ("token_match", lambda: _get_regex_pattern(self.token_match)), + ("url_match", lambda: 
_get_regex_pattern(self.url_match)), ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) )) exclude = util.get_serialization_exclude(serializers, exclude, kwargs) @@ -564,11 +594,12 @@ cdef class Tokenizer: ("suffix_search", lambda b: data.setdefault("suffix_search", b)), ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), ("token_match", lambda b: data.setdefault("token_match", b)), + ("url_match", lambda b: data.setdefault("url_match", b)), ("exceptions", lambda b: data.setdefault("rules", b)) )) exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match"]: + for key in ["prefix_search", "suffix_search", "infix_finditer", "token_match", "url_match"]: if key in data: data[key] = unescape_unicode(data[key]) if "prefix_search" in data and isinstance(data["prefix_search"], basestring_): @@ -579,6 +610,8 @@ cdef class Tokenizer: self.infix_finditer = re.compile(data["infix_finditer"]).finditer if "token_match" in data and isinstance(data["token_match"], basestring_): self.token_match = re.compile(data["token_match"]).match + if "url_match" in data and isinstance(data["url_match"], basestring_): + self.url_match = re.compile(data["url_match"]).match if "rules" in data and isinstance(data["rules"], dict): # make sure to hard reset the cache to remove data from the default exceptions self._rules = {} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 7462af739..6f8badfe8 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -35,14 +35,15 @@ the > ``` | Name | Type | Description | -| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. 
| +| **RETURNS** | `Tokenizer` | The newly constructed object. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index d17e5a661..bcc943436 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -738,6 +738,10 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, suffixes = [] while substring: while prefix_search(substring) or suffix_search(substring): + if token_match(substring): + tokens.append(substring) + substring = '' + break if substring in special_cases: tokens.extend(special_cases[substring]) substring = '' @@ -752,12 +756,15 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, split = suffix_search(substring).start() suffixes.append(substring[split:]) substring = substring[:split] - if substring in special_cases: - tokens.extend(special_cases[substring]) - substring = '' - elif token_match(substring): + if token_match(substring): tokens.append(substring) substring = '' + elif url_match(substring): + tokens.append(substring) + substring = '' + elif substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' elif list(infix_finditer(substring)): infixes = infix_finditer(substring) offset = 0 @@ -778,17 +785,19 @@ def tokenizer_pseudo_code(self, special_cases, prefix_search, suffix_search, The algorithm can be summarized as follows: 1. Iterate over whitespace-separated substrings. -2. Check whether we have an explicitly defined rule for this substring. If we - do, use it. -3. Otherwise, try to consume one prefix. If we consumed a prefix, go back to #2, - so that special cases always get priority. -4. If we didn't consume a prefix, try to consume a suffix and then go back to +2. Look for a token match. If there is a match, stop processing and keep this + token. +3. Check whether we have an explicitly defined special case for this substring. + If we do, use it. +4. Otherwise, try to consume one prefix. If we consumed a prefix, go back to + #2, so that the token match and special cases always get priority. +5. If we didn't consume a prefix, try to consume a suffix and then go back to #2. -5. If we can't consume a prefix or a suffix, look for a special case. -6. Next, look for a token match. -7. Look for "infixes" — stuff like hyphens etc. and split the substring into +6. If we can't consume a prefix or a suffix, look for a URL match. +7. If there's no URL match, then look for a special case. +8. Look for "infixes" — stuff like hyphens etc. and split the substring into tokens on all infixes. -8. Once we can't consume any more of the string, handle it as a single token. +9. Once we can't consume any more of the string, handle it as a single token. #### Debugging the tokenizer {#tokenizer-debug new="2.2.3"} @@ -832,8 +841,8 @@ domain. There are five things you would need to define: hyphens etc. 5. An optional boolean function `token_match` matching strings that should never be split, overriding the infix rules. Useful for things like URLs or numbers. - Note that prefixes and suffixes will be split off before `token_match` is - applied. +6. An optional boolean function `url_match`, which is similar to `token_match` + except prefixes and suffixes are removed before applying the match. You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its