From 216efaf5f53960f80519cfa2c343a1f9efdbf72e Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 31 Aug 2020 09:42:06 +0200
Subject: [PATCH] Restrict tokenizer exceptions to ORTH and NORM

---
 spacy/errors.py                         |  3 +++
 spacy/tests/tokenizer/test_tokenizer.py |  8 +++++++-
 spacy/tokenizer.pyx                     | 12 +++++++++---
 website/docs/usage/v3.md                | 14 ++++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 38c89c479..e53aaef07 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -645,6 +645,9 @@ class Errors:
              "Required tables '{tables}', found '{found}'. If you are not "
              "providing custom lookups, make sure you have the package "
              "spacy-lookups-data installed.")
+    E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
+             "'{chunk}'. Tokenizer exceptions are only allowed to specify "
+             "`ORTH` and `NORM`.")
 
 
 @add_codes
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index b89c0627f..ff31ae8a9 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -105,7 +105,13 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens):
     assert doc[1].text == tokens[1]["orth"]
 
 
-@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])])
+@pytest.mark.parametrize(
+    "text,tokens",
+    [
+        ("lorem", [{"orth": "lo"}, {"orth": "re"}]),
+        ("lorem", [{"orth": "lo", "tag": "A"}, {"orth": "rem"}]),
+    ],
+)
 def test_tokenizer_validate_special_case(tokenizer, text, tokens):
     with pytest.raises(ValueError):
         tokenizer.add_special_case(text, tokens)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 9fda1800b..12c634e61 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -17,7 +17,7 @@ from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 
 from .attrs import intify_attrs
-from .symbols import ORTH
+from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
@@ -584,9 +584,11 @@ cdef class Tokenizer:
             self.add_special_case(chunk, substrings)
 
     def _validate_special_case(self, chunk, substrings):
-        """Check whether the `ORTH` fields match the string.
+        """Check whether the `ORTH` fields match the string. Check that
+        additional features beyond `ORTH` and `NORM` are not set by the
+        exception.
 
-        string (str): The string to specially tokenize.
+        chunk (str): The string to specially tokenize.
         substrings (iterable): A sequence of dicts, where each dict
             describes a token and its attributes.
         """
@@ -594,6 +596,10 @@ cdef class Tokenizer:
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
+        for substring in attrs:
+            for attr in substring:
+                if attr not in (ORTH, NORM):
+                    raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
 
     def add_special_case(self, unicode string, substrings):
         """Add a special-case tokenization rule.
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index d5fea9fee..20b7a139b 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -566,6 +566,20 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
+### Migrating attributes in tokenizer exceptions {#migrating-tokenizer-exceptions}
+
+Tokenizer exceptions are now only allowed to set `ORTH` and `NORM` values as
+part of the token attributes. Exceptions for other attributes such as `TAG` and
+`LEMMA` should be moved to an [`AttributeRuler`](/api/attributeruler) component:
+
+```diff
+nlp = spacy.blank("en")
+- nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't", "LEMMA": "not"}])
++ nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't"}])
++ ruler = nlp.add_pipe("attribute_ruler")
++ ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})
+```
+
 ### Migrating tag maps and morph rules {#migrating-training-mappings-exceptions}
 
 Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
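
To make the new restriction concrete, here is a minimal end-to-end sketch (not part of the commit) of how the validation and the `AttributeRuler` migration fit together, assuming spaCy v3.0+ with this patch applied and `pytest` available:

```python
import pytest
import spacy

nlp = spacy.blank("en")

# Still allowed: special cases that only set ORTH and NORM.
nlp.tokenizer.add_special_case(
    "don't", [{"ORTH": "do"}, {"ORTH": "n't", "NORM": "not"}]
)

# Rejected after this change: any other attribute in a special case
# now raises a ValueError carrying the new E1005 message.
with pytest.raises(ValueError):
    nlp.tokenizer.add_special_case(
        "can't", [{"ORTH": "ca"}, {"ORTH": "n't", "LEMMA": "not"}]
    )

# Migration path: set the extra attributes with an AttributeRuler instead.
ruler = nlp.add_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})

doc = nlp("I don't know")
assert [t.text for t in doc] == ["I", "do", "n't", "know"]
assert doc[2].lemma_ == "not"  # set by the attribute_ruler, not the tokenizer
```

With this split, the tokenizer decides only how text is segmented (`ORTH`) and normalized (`NORM`), while all other token attributes are assigned by pipeline components, so special cases no longer bypass them.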