Merge branch 'develop' into spacy.io

2025-10-24 20:51:30 +03:00 · 2019-02-24 18:36:27 +01:00 · 2019-02-24 18:36:27 +01:00 · a6709a2f29
commit a6709a2f29
parent 948ca2bb3e df19e2bff6
8 changed files with 202 additions and 13 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -327,6 +327,17 @@ class Errors(object):
            "performance.")
    E117 = ("The newly split tokens must match the text of the original token. "
            "New orths: {new}. Old text: {old}.")
+    E118 = ("The custom extension attribute '{attr}' is not registered on the "
+            "Token object so it can't be set during retokenization. To "
+            "register an attribute, use the Token.set_extension classmethod.")
+    E119 = ("Can't set custom extension attribute '{attr}' during retokenization "
+            "because it's not writable. This usually means it was registered "
+            "with a getter function (and no setter) or as a method extension, "
+            "so the value is computed dynamically. To overwrite a custom "
+            "attribute manually, it should be registered with a default value "
+            "or with a getter AND setter.")
+    E120 = ("Can't set custom extension attributes during retokenization. "
+            "Expected dict mapping attribute names to values, but got: {value}")


@add_codes
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest
 from spacy.attrs import LEMMA
 from spacy.vocab import Vocab
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token

 from ..util import get_doc

@ -259,3 +259,36 @@ def test_doc_retokenize_spans_subtree_size_check(en_tokenizer):
        attrs = {"lemma": "none", "ent_type": "none"}
        retokenizer.merge(doc[0:2], attrs=attrs)
    assert len(list(sent1.root.subtree)) == init_len - 1
+
+
+def test_doc_retokenize_merge_extension_attrs(en_vocab):
+    Token.set_extension("a", default=False, force=True)
+    Token.set_extension("b", default="nothing", force=True)
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    # Test regular merging
+    with doc.retokenize() as retokenizer:
+        attrs = {"lemma": "hello world", "_": {"a": True, "b": "1"}}
+        retokenizer.merge(doc[0:2], attrs=attrs)
+    assert doc[0].lemma_ == "hello world"
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    # Test bulk merging
+    doc = Doc(en_vocab, words=["hello", "world", "!", "!"])
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[0:2], attrs={"_": {"a": True, "b": "1"}})
+        retokenizer.merge(doc[2:4], attrs={"_": {"a": None, "b": "2"}})
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    assert doc[1]._.a == None
+    assert doc[1]._.b == "2"
+
+
+@pytest.mark.parametrize("underscore_attrs", [{"a": "x"}, {"b": "x"}, {"c": "x"}, [1]])
+def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
+    Token.set_extension("a", getter=lambda x: x, force=True)
+    Token.set_extension("b", method=lambda x: x, force=True)
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    attrs = {"_": underscore_attrs}
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[0:2], attrs=attrs)
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import pytest
 from spacy.vocab import Vocab
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token

 from ..util import get_doc

@ -125,3 +125,43 @@ def test_doc_retokenize_split_orths_mismatch(en_vocab):
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
+
+
+def test_doc_retokenize_split_extension_attrs(en_vocab):
+    Token.set_extension("a", default=False, force=True)
+    Token.set_extension("b", default="nothing", force=True)
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    with doc.retokenize() as retokenizer:
+        heads = [(doc[0], 1), doc[1]]
+        underscore = [{"a": True, "b": "1"}, {"b": "2"}]
+        attrs = {"lemma": ["los", "angeles"], "_": underscore}
+        retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
+    assert doc[0].lemma_ == "los"
+    assert doc[0]._.a == True
+    assert doc[0]._.b == "1"
+    assert doc[1].lemma_ == "angeles"
+    assert doc[1]._.a == False
+    assert doc[1]._.b == "2"
+
+
+@pytest.mark.parametrize(
+    "underscore_attrs",
+    [
+        [{"a": "x"}, {}],  # Overwriting getter without setter
+        [{"b": "x"}, {}],  # Overwriting method
+        [{"c": "x"}, {}],  # Overwriting nonexistent attribute
+        [{"a": "x"}, {"x": "x"}],  # Combination
+        [{"a": "x", "x": "x"}, {"x": "x"}],  # Combination
+        {"x": "x"},  # Not a list of dicts
+    ],
+)
+def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
+    Token.set_extension("x", default=False, force=True)
+    Token.set_extension("a", getter=lambda x: x, force=True)
+    Token.set_extension("b", method=lambda x: x, force=True)
+    doc = Doc(en_vocab, words=["LosAngeles", "start"])
+    attrs = {"_": underscore_attrs}
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            heads = [(doc[0], 1), doc[1]]
+            retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
--- a/spacy/tests/regression/test_issue1971.py
+++ b/spacy/tests/regression/test_issue1971.py
@ -36,8 +36,8 @@ def test_issue_1971_2(en_vocab):

 def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
-    Token.set_extension("a", default=1)
-    Token.set_extension("b", default=2)
+    Token.set_extension("a", default=1, force=True)
+    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
@ -51,8 +51,8 @@ def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
-    Token.set_extension("ext_a", default="str_a")
-    Token.set_extension("ext_b", default="str_b")
+    Token.set_extension("ext_a", default="str_a", force=True)
+    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -17,6 +17,8 @@ from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
 from ..attrs cimport TAG
+
+from .underscore import is_writable_attr
 from ..attrs import intify_attrs
 from ..util import SimpleFrozenDict
 from ..errors import Errors
@ -43,8 +45,14 @@ cdef class Retokenizer:
            if token.i in self.tokens_to_merge:
                raise ValueError(Errors.E102.format(token=repr(token)))
            self.tokens_to_merge.add(token.i)
-
-        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+        if "_" in attrs:  # Extension attributes
+            extensions = attrs["_"]
+            _validate_extensions(extensions)
+            attrs = {key: value for key, value in attrs.items() if key != "_"}
+            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            attrs["_"] = extensions
+        else:
+            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        self.merges.append((span, attrs))

    def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
@ -53,7 +61,15 @@ cdef class Retokenizer:
        """
        if ''.join(orths) != token.text:
            raise ValueError(Errors.E117.format(new=''.join(orths), old=token.text))
-        attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+        if "_" in attrs:  # Extension attributes
+            extensions = attrs["_"]
+            for extension in extensions:
+                _validate_extensions(extension)
+            attrs = {key: value for key, value in attrs.items() if key != "_"}
+            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            attrs["_"] = extensions
+        else:
+            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
        head_offsets = []
        for head in heads:
            if isinstance(head, Token):
@ -131,7 +147,10 @@ def _merge(Doc doc, int start, int end, attributes):
    cdef TokenC* token = &doc.c[start]
    token.spacy = doc.c[end-1].spacy
    for attr_name, attr_value in attributes.items():
-        if attr_name == TAG:
+        if attr_name == "_":  # Set extension attributes
+            for ext_attr_key, ext_attr_value in attr_value.items():
+                doc[start]._.set(ext_attr_key, ext_attr_value)
+        elif attr_name == TAG:
            doc.vocab.morphology.assign_tag(token, attr_value)
        else:
            Token.set_struct_attr(token, attr_name, attr_value)
@ -183,6 +202,7 @@ def _merge(Doc doc, int start, int end, attributes):
    # Return the merged Python object
    return doc[start]

+
 def _bulk_merge(Doc doc, merges):
    """Retokenize the document, such that the spans described in 'merges'
     are merged into a single token. This method assumes that the merges
@ -213,7 +233,10 @@ def _bulk_merge(Doc doc, merges):
        tokens[merge_index] = token
        # Assign attributes
        for attr_name, attr_value in attributes.items():
-            if attr_name == TAG:
+            if attr_name == "_":  # Set extension attributes
+                for ext_attr_key, ext_attr_value in attr_value.items():
+                    doc[start]._.set(ext_attr_key, ext_attr_value)
+            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, attr_value)
            else:
                Token.set_struct_attr(token, attr_name, attr_value)
@ -379,7 +402,10 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
    for attr_name, attr_values in attrs.items():
        for i, attr_value in enumerate(attr_values):
            token = &doc.c[token_index + i]
-            if attr_name == TAG:
+            if attr_name == "_":
+                for ext_attr_key, ext_attr_value in attr_value.items():
+                    doc[token_index + i]._.set(ext_attr_key, ext_attr_value)
+            elif attr_name == TAG:
                doc.vocab.morphology.assign_tag(token, get_string_id(attr_value))
            else:
                Token.set_struct_attr(token, attr_name, get_string_id(attr_value))
@ -391,3 +417,15 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
        doc.c[i].head -= i
    # set children from head
    set_children_from_heads(doc.c, doc.length)
+
+
+def _validate_extensions(extensions):
+    if not isinstance(extensions, dict):
+        raise ValueError(Errors.E120.format(value=repr(extensions)))
+    for key, value in extensions.items():
+        # Get the extension and make sure it's available and writable
+        extension = Token.get_extension(key)
+        if not extension:  # Extension attribute doesn't exist
+            raise ValueError(Errors.E118.format(attr=key))
+        if not is_writable_attr(extension):
+            raise ValueError(Errors.E119.format(attr=key))
--- a/spacy/tokens/underscore.py
+++ b/spacy/tokens/underscore.py
@ -75,3 +75,18 @@ def get_ext_args(**kwargs):
    if method is not None and not hasattr(method, "__call__"):
        raise ValueError(Errors.E091.format(name="method", value=repr(method)))
    return (default, method, getter, setter)
+
+
+def is_writable_attr(ext):
+    """Check if an extension attribute is writable.
+    ext (tuple): The (default, getter, setter, method) tuple available  via
+        {Doc,Span,Token}.get_extension.
+    RETURNS (bool): Whether the attribute is writable.
+    """
+    default, method, getter, setter = ext
+    # Extension is writable if it has a setter (getter + setter), if it has a
+    # default value (or, if its default value is none, none of the other values
+    # should be set).
+    if setter is not None or default is not None or all(e is None for e in ext):
+        return True
+    return False
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -403,6 +403,8 @@ invalidated, although they may accidentally continue to work.
 ### Retokenizer.merge {#retokenizer.merge tag="method"}

 Mark a span for merging. The `attrs` will be applied to the resulting token.
+Writable custom extension attributes can be provided as a dictionary mapping
+attribute names to values as the `"_"` key.

 > #### Example
 >
@ -428,7 +430,8 @@ subtoken index. For example, `(doc[3], 1)` will attach the subtoken to the
 second subtoken of `doc[3]`. This mechanism allows attaching subtokens to other
 newly created subtokens, without having to keep track of the changing token
 indices. If the specified head token will be split within the retokenizer block
-and no subtoken index is specified, it will default to `0`.
+and no subtoken index is specified, it will default to `0`. Attributes to set on
+subtokens can be provided as a list of values.

 > #### Example
 >
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -1083,6 +1083,55 @@ with doc.retokenize() as retokenizer:

 </Infobox>

+### Overwriting custom extension attributes {#retokenization-extensions}
+
+If you've registered custom
+[extension attributes](/usage/processing-pipelines##custom-components-attributes),
+you can overwrite them during tokenization by providing a dictionary of
+attribute names mapped to new values as the `"_"` key in the `attrs`. For
+merging, you need to provide one dictionary of attributes for the resulting
+merged token. For splitting, you need to provide a list of dictionaries with
+custom attributes, one per split subtoken.
+
+<Infobox title="Important note" variant="warning">
+
+To set extension attributes during retokenization, the attributes need to be
+**registered** using the [`Token.set_extension`](/api/token#set_extension)
+method and they need to be **writable**. This means that they should either have
+a default value that can be overwritten, or a getter _and_ setter. Method
+extensions or extensions with only a getter are computed dynamically, so their
+values can't be overwritten. For more details, see the
+[extension attribute docs](/usage/processing-pipelines/#custom-components-attributes).
+
+</Infobox>
+
+> #### ✏️ Things to try
+>
+> 1. Add another custom extension – maybe `"music_style"`? – and overwrite it.
+> 2. Change the extension attribute to use only a `getter` function. You should
+>    see that spaCy raises an error, because the attribute is not writable
+>    anymore.
+> 3. Rewrite the code to split a token with `retokenizer.split`. Remember that
+>    you need to provide a list of extension attribute values as the `"_"`
+>    property, one for each split subtoken.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.tokens import Token
+
+# Register a custom token attribute, token._.is_musician
+Token.set_extension("is_musician", default=False)
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("I like David Bowie")
+print("Before:", [(token.text, token._.is_musician) for token in doc])
+
+with doc.retokenize() as retokenizer:
+    retokenizer.merge(doc[2:4], attrs={"_": {"is_musician": True}})
+print("After:", [(token.text, token._.is_musician) for token in doc])
+```
+
 ## Sentence Segmentation {#sbd}

 A [`Doc`](/api/doc) object's sentences are available via the `Doc.sents`