Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-10-05 11:58:12 +02:00
commit f0aea5b198
4 changed files with 34 additions and 2 deletions

View File

@ -9,7 +9,7 @@ from ..vocab cimport Vocab
cdef class PhraseMatcher: cdef class PhraseMatcher:
cdef Vocab vocab cdef readonly Vocab vocab
cdef attr_id_t attr cdef attr_id_t attr
cdef object _callbacks cdef object _callbacks
cdef object _docs cdef object _docs

View File

@ -0,0 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab
def test_issue4373():
    """Regression test for issue #4373.

    Both Matcher and PhraseMatcher must expose the Vocab they were
    constructed with through a readable ``vocab`` attribute.
    """
    # Check Matcher first, then PhraseMatcher, each built on a fresh Vocab.
    for matcher_cls in (Matcher, PhraseMatcher):
        matcher = matcher_cls(Vocab())
        assert isinstance(matcher.vocab, Vocab)

View File

@ -46,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Type | Description | | Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. | | **RETURNS** | `DocBin` | The newly constructed object. |
## DocBin.\_\_len\_\_ {#len tag="method"} ## DocBin.\_\_len\_\_ {#len tag="method"}

View File

@ -92,6 +92,25 @@ doc_bin = DocBin().from_bytes(bytes_data)
docs = list(doc_bin.get_docs(nlp.vocab)) docs = list(doc_bin.get_docs(nlp.vocab))
``` ```
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
well, which includes the values of
[extension attributes](/processing-pipelines#custom-components-attributes) (if
they're serializable with msgpack).
<Infobox title="Important note on serializing extension attributes" variant="warning">
Including the `Doc.user_data` and extension attributes will only serialize the
**values** of the attributes. To restore the values and access them via the
`doc._.` property, you need to register the global attribute on the `Doc` again.
```python
docs = list(doc_bin.get_docs(nlp.vocab))
Doc.set_extension("my_custom_attr", default=None)
print([doc._.my_custom_attr for doc in docs])
```
</Infobox>
### Using Pickle {#pickle} ### Using Pickle {#pickle}
> #### Example > #### Example