diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 6d79c56e7..8f47157fa 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -106,3 +106,37 @@ def test_underscore_raises_for_invalid(invalid_kwargs): def test_underscore_accepts_valid(valid_kwargs): valid_kwargs["force"] = True Doc.set_extension("test", **valid_kwargs) + + +def test_underscore_mutable_defaults_list(en_vocab): + """Test that mutable default arguments are handled correctly (see #2581).""" + Doc.set_extension("mutable", default=[]) + doc1 = Doc(en_vocab, words=["one"]) + doc2 = Doc(en_vocab, words=["two"]) + doc1._.mutable.append("foo") + assert len(doc1._.mutable) == 1 + assert doc1._.mutable[0] == "foo" + assert len(doc2._.mutable) == 0 + doc1._.mutable = ["bar", "baz"] + doc1._.mutable.append("foo") + assert len(doc1._.mutable) == 3 + assert len(doc2._.mutable) == 0 + + +def test_underscore_mutable_defaults_dict(en_vocab): + """Test that mutable default arguments are handled correctly (see #2581).""" + Token.set_extension("mutable", default={}) + token1 = Doc(en_vocab, words=["one"])[0] + token2 = Doc(en_vocab, words=["two"])[0] + token1._.mutable["foo"] = "bar" + assert len(token1._.mutable) == 1 + assert token1._.mutable["foo"] == "bar" + assert len(token2._.mutable) == 0 + token1._.mutable["foo"] = "baz" + assert len(token1._.mutable) == 1 + assert token1._.mutable["foo"] == "baz" + token1._.mutable["x"] = [] + token1._.mutable["x"].append("y") + assert len(token1._.mutable) == 2 + assert token1._.mutable["x"] == ["y"] + assert len(token2._.mutable) == 0 diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index 4e2057e4a..ef1d78717 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -2,11 +2,13 @@ from __future__ import unicode_literals import functools +import copy from ..errors import Errors class Underscore(object): + mutable_types = (dict, list, set) doc_extensions = {} span_extensions = {} token_extensions = {} @@ -32,7 +34,15 @@ class Underscore(object): elif method is not None: return functools.partial(method, self._obj) else: - return self._doc.user_data.get(self._get_key(name), default) + key = self._get_key(name) + if key in self._doc.user_data: + return self._doc.user_data[key] + elif isinstance(default, self.mutable_types): + # Handle mutable default arguments (see #2581) + new_default = copy.copy(default) + self.__setattr__(name, new_default) + return new_default + return default def __setattr__(self, name, value): if name not in self._extensions: diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index ab780485f..264774b7c 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -458,9 +458,7 @@ There are three main types of extensions, which can be defined using the 1. **Attribute extensions.** Set a default value for an attribute, which can be overwritten manually at any time. Attribute extensions work like "normal" variables and are the quickest way to store arbitrary information on a `Doc`, - `Span` or `Token`. Attribute defaults behaves just like argument defaults - [in Python functions](http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments), - and should not be used for mutable values like dictionaries or lists. + `Span` or `Token`. ```python Doc.set_extension("hello", default=True) @@ -527,25 +525,6 @@ Once you've registered your custom attribute, you can also use the built-in especially useful it you want to pass in a string instead of calling `doc._.my_attr`. - - -When using **mutable values** like dictionaries or lists as the `default` -argument, keep in mind that they behave just like mutable default arguments -[in Python functions](http://docs.python-guide.org/en/latest/writing/gotchas/#mutable-default-arguments). -This can easily cause unintended results, like the same value being set on _all_ -objects instead of only one particular instance. In most cases, it's better to -use **getters and setters**, and only set the `default` for boolean or string -values. - -```diff -+ Doc.set_extension('fruits', getter=get_fruits, setter=set_fruits) - -- Doc.set_extension('fruits', default={}) -- doc._.fruits['apple'] = u'🍎' # all docs now have {'apple': u'🍎'} -``` - - - ### Example: Pipeline component for GPE entities and country meta data via a REST API {#component-example3} This example shows the implementation of a pipeline component that fetches