Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-10 19:57:17 +03:00)
Restrict tokenizer exceptions to ORTH and NORM
parent 9341cbc013
commit 216efaf5f5
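
In practice, the change means that `Tokenizer.add_special_case` now rejects any attribute other than `ORTH` and `NORM`. A minimal sketch of the new behavior (hypothetical snippet, assuming a spaCy build that includes this commit; the example strings are illustrative only):

```python
import spacy

nlp = spacy.blank("en")

# ORTH and NORM are still allowed in special-case rules:
nlp.tokenizer.add_special_case("dont", [{"ORTH": "do"}, {"ORTH": "nt", "NORM": "not"}])

# Any other attribute (here TAG) is now rejected via the new E1005 error:
try:
    nlp.tokenizer.add_special_case("lorem", [{"ORTH": "lo", "TAG": "A"}, {"ORTH": "rem"}])
except ValueError as err:
    print(err)  # expected to mention being unable to set attribute 'TAG' for 'lorem'
```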
@@ -645,6 +645,9 @@ class Errors:
             "Required tables '{tables}', found '{found}'. If you are not "
             "providing custom lookups, make sure you have the package "
             "spacy-lookups-data installed.")
+    E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
+             "'{chunk}'. Tokenizer exceptions are only allowed to specify "
+             "`ORTH` and `NORM`.")
 
 
 @add_codes
@@ -105,7 +105,13 @@ def test_tokenizer_add_special_case(tokenizer, text, tokens):
     assert doc[1].text == tokens[1]["orth"]
 
 
-@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "re"}])])
+@pytest.mark.parametrize(
+    "text,tokens",
+    [
+        ("lorem", [{"orth": "lo"}, {"orth": "re"}]),
+        ("lorem", [{"orth": "lo", "tag": "A"}, {"orth": "rem"}]),
+    ],
+)
 def test_tokenizer_validate_special_case(tokenizer, text, tokens):
     with pytest.raises(ValueError):
         tokenizer.add_special_case(text, tokens)
@@ -17,7 +17,7 @@ from .strings cimport hash_string
 from .lexeme cimport EMPTY_LEXEME
 
 from .attrs import intify_attrs
-from .symbols import ORTH
+from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
 from .util import registry
@@ -584,9 +584,11 @@ cdef class Tokenizer:
             self.add_special_case(chunk, substrings)
 
     def _validate_special_case(self, chunk, substrings):
-        """Check whether the `ORTH` fields match the string.
+        """Check whether the `ORTH` fields match the string. Check that
+        additional features beyond `ORTH` and `NORM` are not set by the
+        exception.
 
-        string (str): The string to specially tokenize.
+        chunk (str): The string to specially tokenize.
         substrings (iterable): A sequence of dicts, where each dict describes
             a token and its attributes.
         """
@@ -594,6 +596,10 @@ cdef class Tokenizer:
         orth = "".join([spec[ORTH] for spec in attrs])
         if chunk != orth:
             raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
+        for substring in attrs:
+            for attr in substring:
+                if attr not in (ORTH, NORM):
+                    raise ValueError(Errors.E1005.format(attr=self.vocab.strings[attr], chunk=chunk))
 
     def add_special_case(self, unicode string, substrings):
         """Add a special-case tokenization rule.
@@ -566,6 +566,20 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
 + matcher.add("HEALTH", patterns, on_match=on_match)
 ```
 
+### Migrating attributes in tokenizer exceptions {#migrating-tokenizer-exceptions}
+
+Tokenizer exceptions are now only allowed to set `ORTH` and `NORM` values as
+part of the token attributes. Exceptions for other attributes such as `TAG` and
+`LEMMA` should be moved to an [`AttributeRuler`](/api/attributeruler) component:
+
+```diff
+ nlp = spacy.blank("en")
+- nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't", "LEMMA": "not"}])
++ nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't"}])
++ ruler = nlp.add_pipe("attribute_ruler")
++ ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})
+```
+
 ### Migrating tag maps and morph rules {#migrating-training-mappings-exceptions}
 
 Instead of defining a `tag_map` and `morph_rules` in the language data, spaCy
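
For reference, a runnable version of the migrated snippet from the docs hunk above might look like the following. This is a sketch only, assuming spaCy v3 with the built-in `attribute_ruler` factory; the lemma of "do" stays empty because the blank pipeline has no lemmatizer.

```python
import spacy

nlp = spacy.blank("en")
# Keep only ORTH in the tokenizer exception...
nlp.tokenizer.add_special_case("don't", [{"ORTH": "do"}, {"ORTH": "n't"}])
# ...and move the LEMMA override to an AttributeRuler component.
ruler = nlp.add_pipe("attribute_ruler")
ruler.add(patterns=[[{"ORTH": "n't"}]], attrs={"LEMMA": "not"})

doc = nlp("don't")
print([(t.text, t.lemma_) for t in doc])  # "n't" should now carry the lemma "not"
```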