diff --git a/spacy/errors.py b/spacy/errors.py
index 8a2d95b10..45f3fea76 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -327,6 +327,17 @@ class Errors(object):
             "performance.")
     E117 = ("The newly split tokens must match the text of the original token. "
             "New orths: {new}. Old text: {old}.")
+    E118 = ("The custom extension attribute '{attr}' is not registered on the "
+            "Token object so it can't be set during retokenization. To "
+            "register an attribute, use the Token.set_extension classmethod.")
+    E119 = ("Can't set custom extension attribute '{attr}' during retokenization "
+            "because it's not writable. This usually means it was registered "
+            "with a getter function (and no setter) or as a method extension, "
+            "so the value is computed dynamically. To overwrite a custom "
+            "attribute manually, it should be registered with a default value "
+            "or with a getter AND setter.")
+    E120 = ("Can't set custom extension attributes during retokenization. "
+            "Expected dict mapping attribute names to values, but got: {value}")


 @add_codes
diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 8c1b2a25a..59547d1b3 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals

 import pytest
 from spacy.attrs import LEMMA
 from spacy.vocab import Vocab
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token

 from ..util import get_doc
@@ -259,3 +259,36 @@ def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
     assert len(list(sent1.root.subtree)) == init_len - 1
+
+
+def test_doc_retokenize_merge_extension_attrs(en_vocab):
+    Token.set_extension("a", default=False, force=True)
+    Token.set_extension("b", default="nothing", force=True)
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    # Test regular merging
+    with doc.retokenize() as retokenizer:
+        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
+        retokenizer.merge(doc[0:2], attrs=attrs)
+    assert doc[0].lemma_ == "hello world"
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    # Test bulk merging
+    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
+        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    assert doc[1]._.a == None
+    assert doc[1]._.b == "2"
+
+
+@pytest.mark.parametrize("underscore_attrs", [{"a": "x"}, {"b": "x"}, {"c": "x"}, [1]])
+def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
+    Token.set_extension("a", getter=lambda x: x, force=True)
+    Token.set_extension("b", method=lambda x: x, force=True)
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    attrs = {"_": underscore_attrs}
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[0:2], attrs=attrs)
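The tests above cover both the happy path and the failure modes for `merge`. As a quick illustration of how the new error codes surface to users, here is a minimal sketch (assuming a spaCy build that includes this patch; the exact message prefixes like `[E118]` come from the `Errors` wrapper):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("hello world !")

# E120: the "_" value must be a dict mapping attribute names to values
with doc.retokenize() as retokenizer:
    try:
        retokenizer.merge(doc[0:2], attrs={"_": ["not", "a", "dict"]})
    except ValueError as err:
        print(err)

# E118: the extension attribute was never registered on Token
with doc.retokenize() as retokenizer:
    try:
        retokenizer.merge(doc[0:2], attrs={"_": {"unregistered": True}})
    except ValueError as err:
        print(err)
```

Note that validation happens eagerly inside `retokenizer.merge`, not when the `with` block exits, which is why the errors can be caught around the individual calls.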
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index b93a781f7..2fc102cf6 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import pytest
 from spacy.vocab import Vocab
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token

 from ..util import get_doc

@@ -125,3 +125,43 @@ def test_doc_retokenize_split_orths_mismatch(en_vocab):
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
+
+
+def test_doc_retokenize_split_extension_attrs(en_vocab):
+    Token.set_extension("a", default=False, force=True)
+    Token.set_extension("b", default="nothing", force=True)
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    with doc.retokenize() as retokenizer:
+        heads = [(doc[0], 1), doc[1]]
+        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
+        attrs = {"lemma": ["los", "angeles"], "_": underscore}
+        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+    assert doc[0].lemma_ == "los"
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    assert doc[1].lemma_ == "angeles"
+    assert doc[1]._.a == False
+    assert doc[1]._.b == "2"
+
+
+@pytest.mark.parametrize(
+    "underscore_attrs",
+    [
+        [{"a": "x"}, {}],  # Overwriting getter without setter
+        [{"b": "x"}, {}],  # Overwriting method
+        [{"c": "x"}, {}],  # Overwriting nonexistent attribute
+        [{"a": "x"}, {"x": "x"}],  # Combination
+        [{"a": "x", "x": "x"}, {"x": "x"}],  # Combination
+        {"x": "x"},  # Not a list of dicts
+    ],
+)
+def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
+    Token.set_extension("x", default=False, force=True)
+    Token.set_extension("a", getter=lambda x: x, force=True)
+    Token.set_extension("b", method=lambda x: x, force=True)
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    attrs = {"_": underscore_attrs}
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            heads = [(doc[0], 1), doc[1]]
+            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
diff --git a/spacy/tests/regression/test_issue1971.py b/spacy/tests/regression/test_issue1971.py
index 5288e4f1d..858cb393d 100644
--- a/spacy/tests/regression/test_issue1971.py
+++ b/spacy/tests/regression/test_issue1971.py
@@ -36,8 +36,8 @@ def test_issue_1971_2(en_vocab):

 def test_issue_1971_3(en_vocab):
     """Test that pattern matches correctly for multiple extension attributes."""
-    Token.set_extension("a", default=1)
-    Token.set_extension("b", default=2)
+    Token.set_extension("a", default=1, force=True)
+    Token.set_extension("b", default=2, force=True)
     doc = Doc(en_vocab, words=["hello", "world"])
     matcher = Matcher(en_vocab)
     matcher.add("A", None, [{"_": {"a": 1}}])
@@ -51,8 +51,8 @@ def test_issue_1971_4(en_vocab):
     """Test that pattern matches correctly with multiple extension attribute
     values on a single token.
     """
-    Token.set_extension("ext_a", default="str_a")
-    Token.set_extension("ext_b", default="str_b")
+    Token.set_extension("ext_a", default="str_a", force=True)
+    Token.set_extension("ext_b", default="str_b", force=True)
     matcher = Matcher(en_vocab)
     doc = Doc(en_vocab, words=["this", "is", "text"])
     pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
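The only change to the regression tests is the added `force=True`, which keeps repeated registrations of the same extension name from clashing when test modules share a process. A minimal sketch of the behavior it works around (the duplicate-registration error is E090 in spaCy 2.x; the attribute name here is just an example):

```python
from spacy.tokens import Token

Token.set_extension("shared_attr", default=1)
try:
    # Registering the same name again without force=True raises E090
    Token.set_extension("shared_attr", default=2)
except ValueError as err:
    print(err)
# With force=True, the existing registration is overwritten instead
Token.set_extension("shared_attr", default=2, force=True)
```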
""" - Token.set_extension("ext_a", default="str_a") - Token.set_extension("ext_b", default="str_b") + Token.set_extension("ext_a", default="str_a", force=True) + Token.set_extension("ext_b", default="str_b", force=True) matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["this", "is", "text"]) pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index fd621922f..9dc107185 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -17,6 +17,8 @@ from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC from ..attrs cimport TAG + +from .underscore import is_writable_attr from ..attrs import intify_attrs from ..util import SimpleFrozenDict from ..errors import Errors @@ -43,8 +45,14 @@ cdef class Retokenizer: if token.i in self.tokens_to_merge: raise ValueError(Errors.E102.format(token=repr(token))) self.tokens_to_merge.add(token.i) - - attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if "_" in attrs: # Extension attributes + extensions = attrs["_"] + _validate_extensions(extensions) + attrs = {key: value for key, value in attrs.items() if key != "_"} + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + attrs["_"] = extensions + else: + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) self.merges.append((span, attrs)) def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): @@ -53,7 +61,15 @@ cdef class Retokenizer: """ if ''.join(orths) != token.text: raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text)) - attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if "_" in attrs: # Extension attributes + extensions = attrs["_"] + for extension in extensions: + _validate_extensions(extension) + attrs = {key: value for key, value in attrs.items() if key != "_"} + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + attrs["_"] = extensions + else: + attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) head_offsets = [] for head in heads: if isinstance(head, Token): @@ -131,7 +147,10 @@ def _merge(Doc doc, int start, int end, attributes): cdef TokenC* token = &doc.c[start] token.spacy = doc.c[end-1].spacy for attr_name, attr_value in attributes.items(): - if attr_name == TAG: + if attr_name == "_": # Set extension attributes + for ext_attr_key, ext_attr_value in attr_value.items(): + doc[start]._.set(ext_attr_key, ext_attr_value) + elif attr_name == TAG: doc.vocab.morphology.assign_tag(token, attr_value) else: Token.set_struct_attr(token, attr_name, attr_value) @@ -183,6 +202,7 @@ def _merge(Doc doc, int start, int end, attributes): # Return the merged Python object return doc[start] + def _bulk_merge(Doc doc, merges): """Retokenize the document, such that the spans described in 'merges' are merged into a single token. 
diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py
index 6b8b9a349..4e2057e4a 100644
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@@ -75,3 +75,18 @@ def get_ext_args(**kwargs):
     if method is not None and not hasattr(method, "__call__"):
         raise ValueError(Errors.E091.format(name="method", value=repr(method)))
     return (default, method, getter, setter)
+
+
+def is_writable_attr(ext):
+    """Check if an extension attribute is writable.
+    ext (tuple): The (default, method, getter, setter) tuple available via
+        {Doc,Span,Token}.get_extension.
+    RETURNS (bool): Whether the attribute is writable.
+    """
+    default, method, getter, setter = ext
+    # The attribute is writable if it has a setter (getter + setter), if it
+    # has a default value that can be overwritten, or if it was registered
+    # with default=None (in which case all four values are None).
+    if setter is not None or default is not None or all(e is None for e in ext):
+        return True
+    return False
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 87b7b373e..107be4e5e 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -403,6 +403,8 @@ invalidated, although they may accidentally continue to work.
 ### Retokenizer.merge {#retokenizer.merge tag="method"}

 Mark a span for merging. The `attrs` will be applied to the resulting token.
+Writable custom extension attributes can be provided under the `"_"` key, as a
+dictionary mapping attribute names to values.

 > #### Example
 >
@@ -428,7 +430,8 @@ subtoken index. For example, `(doc[3], 1)` will attach the subtoken to the
 second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
 newly created subtokens, without having to keep track of the changing token
 indices. If the specified head token will be split within the retokenizer block
-and no subtoken index is specified, it will default to `0`.
+and no subtoken index is specified, it will default to `0`. Attributes to set
+on subtokens can be provided as a list of values, one per subtoken.

 > #### Example
 >
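To make the writability rule concrete, the truth table below runs `is_writable_attr` over the common registration patterns; the tuples use the `(default, method, getter, setter)` order returned by `get_ext_args`, and the import path assumes the patched module:

```python
from spacy.tokens.underscore import is_writable_attr

# (default, method, getter, setter)
print(is_writable_attr(("x", None, None, None)))                       # default only -> True
print(is_writable_attr((None, None, None, None)))                      # default=None -> True
print(is_writable_attr((None, None, lambda t: 1, lambda t, v: None)))  # getter + setter -> True
print(is_writable_attr((None, None, lambda t: 1, None)))               # getter only -> False
print(is_writable_attr((None, lambda t: 1, None, None)))               # method only -> False
```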
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index ac419f42f..176b558e4 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1083,6 +1083,55 @@ with doc.retokenize() as retokenizer:

 </Infobox>

+### Overwriting custom extension attributes {#retokenization-extensions}
+
+If you've registered custom
+[extension attributes](/usage/processing-pipelines#custom-components-attributes),
+you can overwrite them during retokenization by providing a dictionary of
+attribute names mapped to new values as the `"_"` key in the `attrs`. For
+merging, you need to provide one dictionary of attributes for the resulting
+merged token. For splitting, you need to provide a list of dictionaries with
+custom attributes, one per split subtoken.
+
+<Infobox title="Important note" variant="warning">
+
+To set extension attributes during retokenization, the attributes need to be
+**registered** using the [`Token.set_extension`](/api/token#set_extension)
+method and they need to be **writable**. This means that they should either
+have a default value that can be overwritten, or a getter _and_ setter. Method
+extensions or extensions with only a getter are computed dynamically, so their
+values can't be overwritten. For more details, see the
+[extension attribute docs](/usage/processing-pipelines#custom-components-attributes).
+
+</Infobox>
+
+> #### ✏️ Things to try
+>
+> 1. Add another custom extension – maybe `"music_style"`? – and overwrite it.
+> 2. Change the extension attribute to use only a `getter` function. You should
+>    see that spaCy raises an error, because the attribute is not writable
+>    anymore.
+> 3. Rewrite the code to split a token with `retokenizer.split`. Remember that
+>    you need to provide a list of extension attribute values as the `"_"`
+>    property, one for each split subtoken.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.tokens import Token
+
+# Register a custom token attribute, token._.is_musician
+Token.set_extension("is_musician", default=False)
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("I like David Bowie")
+print("Before:", [(token.text, token._.is_musician) for token in doc])
+
+with doc.retokenize() as retokenizer:
+    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
+print("After:", [(token.text, token._.is_musician) for token in doc])
+```
+
 ## Sentence Segmentation {#sbd}

 A [`Doc`](/api/doc) object's sentences are available via the `Doc.sents`
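For the third "things to try" item, one possible rewrite using `retokenizer.split` might look like this. It's a hypothetical sketch: it uses a blank pipeline and a pre-fused `"DavidBowie"` token, because the orths passed to `split` must join back to the original token text exactly (a merged `"David Bowie"` token contains a space and would fail the E117 check):

```python
import spacy
from spacy.tokens import Doc, Token

Token.set_extension("is_musician", default=False, force=True)

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["I", "like", "DavidBowie"])
print("Before:", [(token.text, token._.is_musician) for token in doc])

with doc.retokenize() as retokenizer:
    # "David" attaches to the second subtoken, "Bowie" attaches to "like"
    heads = [(doc[2], 1), doc[1]]
    # One dict of extension values per subtoken, as the "_" key
    underscore = [{"is_musician": True}, {"is_musician": True}]
    retokenizer.split(doc[2], ["David", "Bowie"], heads, attrs={"_": underscore})
print("After:", [(token.text, token._.is_musician) for token in doc])
```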