Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-05-16 23:16:14 +02:00
commit 3cbbc4afcb
7 changed files with 52 additions and 5 deletions

View File

@ -383,6 +383,8 @@ class Errors(object):
E133 = ("The sum of prior probabilities for alias '{alias}' should not exceed 1, " E133 = ("The sum of prior probabilities for alias '{alias}' should not exceed 1, "
"but found {sum}.") "but found {sum}.")
E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.") E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
@add_codes @add_codes

View File

@ -333,6 +333,11 @@ class Language(object):
""" """
if name not in self.pipe_names: if name not in self.pipe_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
if not hasattr(component, "__call__"):
msg = Errors.E003.format(component=repr(component), name=name)
if isinstance(component, basestring_) and component in self.factories:
msg += Errors.E135.format(name=name)
raise ValueError(msg)
self.pipeline[self.pipe_names.index(name)] = (name, component) self.pipeline[self.pipe_names.index(name)] = (name, component)
def rename_pipe(self, old_name, new_name): def rename_pipe(self, old_name, new_name):

View File

@ -140,3 +140,28 @@ def test_underscore_mutable_defaults_dict(en_vocab):
assert len(token1._.mutable) == 2 assert len(token1._.mutable) == 2
assert token1._.mutable["x"] == ["y"] assert token1._.mutable["x"] == ["y"]
assert len(token2._.mutable) == 0 assert len(token2._.mutable) == 0
def test_underscore_dir(en_vocab):
    """Test that dir() correctly returns extension attributes. This enables
    things like tab-completion for the attributes in doc._.

    The extension is registered on Doc only, so it must show up in
    dir(doc._) but not in the underscore namespaces of tokens or spans.
    """
    # The extension is registered on the Doc class rather than on one
    # instance. force=True overwrites any earlier registration of the same
    # name, so the test can run again in the same process.
    Doc.set_extension("test_dir", default=None, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert "_" in dir(doc)
    assert "test_dir" in dir(doc._)
    # doc[0] and doc[0:2] are indexed/sliced out of the doc; the Doc-level
    # extension must not be listed for them.
    assert "test_dir" not in dir(doc[0]._)
    assert "test_dir" not in dir(doc[0:2]._)
def test_underscore_docstring(en_vocab):
    """Test that docstrings are available for extension methods, even though
    they're partials."""

    def test_method(doc, arg1=1, arg2=2):
        """I am a docstring"""
        return (arg1, arg2)

    # The extension is registered on the Doc class rather than on one
    # instance. force=True overwrites any earlier registration of the same
    # name, so the test can run again in the same process.
    Doc.set_extension("test_docstrings", method=test_method, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    assert test_method.__doc__ == "I am a docstring"
    # Only the text after the last ". " is compared, so any explanatory
    # prefix placed in front of the original docstring is ignored.
    # maxsplit=1 makes "split once, from the right" explicit.
    assert doc._.test_docstrings.__doc__.rsplit(". ", 1)[-1] == "I am a docstring"

View File

@ -52,11 +52,13 @@ def test_get_pipe(nlp, name):
assert nlp.get_pipe(name) == new_pipe assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize("name,replacement", [("my_component", lambda doc: doc)]) @pytest.mark.parametrize("name,replacement,not_callable", [("my_component", lambda doc: doc, {})])
def test_replace_pipe(nlp, name, replacement): def test_replace_pipe(nlp, name, replacement, not_callable):
with pytest.raises(ValueError): with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe) nlp.replace_pipe(name, new_pipe)
nlp.add_pipe(new_pipe, name=name) nlp.add_pipe(new_pipe, name=name)
with pytest.raises(ValueError):
nlp.replace_pipe(name, not_callable)
nlp.replace_pipe(name, replacement) nlp.replace_pipe(name, replacement)
assert nlp.get_pipe(name) != new_pipe assert nlp.get_pipe(name) != new_pipe
assert nlp.get_pipe(name) == replacement assert nlp.get_pipe(name) == replacement

View File

@ -25,6 +25,11 @@ class Underscore(object):
object.__setattr__(self, "_start", start) object.__setattr__(self, "_start", start)
object.__setattr__(self, "_end", end) object.__setattr__(self, "_end", end)
def __dir__(self):
    """Return the attribute names reported by dir() for this object.

    The list starts with the fixed names "set", "get" and "has", followed
    by every key of self._extensions. Listing the extension names here is
    a hack that lets autocompletion pick up custom extensions.
    """
    names = ["set", "get", "has"]
    names.extend(self._extensions.keys())
    return names
def __getattr__(self, name): def __getattr__(self, name):
if name not in self._extensions: if name not in self._extensions:
raise AttributeError(Errors.E046.format(name=name)) raise AttributeError(Errors.E046.format(name=name))
@ -32,7 +37,16 @@ class Underscore(object):
if getter is not None: if getter is not None:
return getter(self._obj) return getter(self._obj)
elif method is not None: elif method is not None:
return functools.partial(method, self._obj) method_partial = functools.partial(method, self._obj)
# Hack to port over docstrings of the original function
# See https://stackoverflow.com/q/27362727/6400719
method_docstring = method.__doc__ or ""
method_docstring_prefix = (
"This method is a partial function and its first argument "
"(the object it's called on) will be filled automatically. "
)
method_partial.__doc__ = method_docstring_prefix + method_docstring
return method_partial
else: else:
key = self._get_key(name) key = self._get_key(name)
if key in self._doc.user_data: if key in self._doc.user_data:

View File

@ -128,7 +128,6 @@ The L2 norm of the lexeme's vector representation.
| `text` | unicode | Verbatim text content. | | `text` | unicode | Verbatim text content. |
| `orth` | int | ID of the verbatim text content. | | `orth` | int | ID of the verbatim text content. |
| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `lex_id` | int | ID of the lexeme's lexical type. |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. | | `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
| `flags` | int | Container of the lexeme's binary flags. | | `flags` | int | Container of the lexeme's binary flags. |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |

View File

@ -468,7 +468,7 @@ The L2 norm of the token's vector representation.
| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | | `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
| `idx` | int | The character offset of the token within the parent document. | | `idx` | int | The character offset of the token within the parent document. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
| `lex_id` | int | Sequential ID of the token's lexical type. | | `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | | `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `cluster` | int | Brown cluster ID. | | `cluster` | int | Brown cluster ID. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |