mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 07:44:12 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
f0aea5b198
|
@ -9,7 +9,7 @@ from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
|
||||||
cdef class PhraseMatcher:
|
cdef class PhraseMatcher:
|
||||||
cdef Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
cdef attr_id_t attr
|
cdef attr_id_t attr
|
||||||
cdef object _callbacks
|
cdef object _callbacks
|
||||||
cdef object _docs
|
cdef object _docs
|
||||||
|
|
13
spacy/tests/regression/test_issue4373.py
Normal file
13
spacy/tests/regression/test_issue4373.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.matcher import Matcher, PhraseMatcher
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4373():
|
||||||
|
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||||
|
matcher = Matcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
matcher = PhraseMatcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
|
@ -46,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
|
||||||
| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. |
|
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
||||||
| **RETURNS** | `DocBin` | The newly constructed object. |
|
| **RETURNS** | `DocBin` | The newly constructed object. |
|
||||||
|
|
||||||
## DocBin.\_\_len\_\_ {#len tag="method"}
|
## DocBin.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
|
@ -92,6 +92,25 @@ doc_bin = DocBin().from_bytes(bytes_data)
|
||||||
docs = list(doc_bin.get_docs(nlp.vocab))
|
docs = list(doc_bin.get_docs(nlp.vocab))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
|
||||||
|
well, which includes the values of
|
||||||
|
[extension attributes](/processing-pipelines#custom-components-attributes) (if
|
||||||
|
they're serializable with msgpack).
|
||||||
|
|
||||||
|
<Infobox title="Important note on serializing extension attributes" variant="warning">
|
||||||
|
|
||||||
|
Including the `Doc.user_data` and extension attributes will only serialize the
|
||||||
|
**values** of the attributes. To restore the values and access them via the
|
||||||
|
`doc._.` property, you need to register the global attribute on the `Doc` again.
|
||||||
|
|
||||||
|
```python
|
||||||
|
docs = list(doc_bin.get_docs(nlp.vocab))
|
||||||
|
Doc.set_extension("my_custom_attr", default=None)
|
||||||
|
print([doc._.my_custom_attr for doc in docs])
|
||||||
|
```
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
### Using Pickle {#pickle}
|
### Using Pickle {#pickle}
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
Loading…
Reference in New Issue
Block a user