From 4e7259c6cf44540160a4e4fd5a62c7ff7308cc44 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 3 Oct 2019 14:48:45 +0200 Subject: [PATCH] Bugfix initializing DocBin with attributes (#4368) * docbin init fix + documentation fix + unit tests * newline * try with zlib instead of gzip (python 2 incompatibilities) --- bin/ud/ud_train.py | 3 +- spacy/attrs.pyx | 38 +++++++++++++-------- spacy/tests/regression/test_issue4367.py | 11 ++++++ spacy/tests/serialize/test_serialize_doc.py | 20 ++++++++++- spacy/tokens/_serialize.py | 10 +++--- website/docs/usage/saving-loading.md | 2 +- 6 files changed, 61 insertions(+), 23 deletions(-) create mode 100644 spacy/tests/regression/test_issue4367.py diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index a66bb619e..5d4f20d6e 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -12,6 +12,7 @@ import json import spacy import spacy.util +from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc from spacy.gold import GoldParse from spacy.util import compounding, minibatch, minibatch_by_words @@ -25,8 +26,6 @@ import itertools import random import numpy.random -import conll17_ud_eval - from spacy import lang from spacy.lang import zh from spacy.lang import ja diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 40236630a..6d1c18eb9 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -142,18 +142,28 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): elif key.upper() in stringy_attrs: stringy_attrs.pop(key.upper()) for name, value in stringy_attrs.items(): - if isinstance(name, int): - int_key = name - elif name in IDS: - int_key = IDS[name] - elif name.upper() in IDS: - int_key = IDS[name.upper()] - else: - continue - if strings_map is not None and isinstance(value, basestring): - if hasattr(strings_map, 'add'): - value = strings_map.add(value) - else: - value = strings_map[value] - inty_attrs[int_key] = value + int_key = intify_attr(name) + if int_key is not None: + if 
strings_map is not None and isinstance(value, basestring): + if hasattr(strings_map, 'add'): + value = strings_map.add(value) + else: + value = strings_map[value] + inty_attrs[int_key] = value return inty_attrs + + +def intify_attr(name): + """ + Normalize an attribute name, converting it to int. + + name (string): Attribute string name. Can also be int (will then be left unchanged) + RETURNS (int): int representation of the attribute, or None if it couldn't be converted. + """ + if isinstance(name, int): + return name + elif name in IDS: + return IDS[name] + elif name.upper() in IDS: + return IDS[name.upper()] + return None diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py new file mode 100644 index 000000000..6c9e54cdb --- /dev/null +++ b/spacy/tests/regression/test_issue4367.py @@ -0,0 +1,11 @@ +# coding: utf8 +from __future__ import unicode_literals + +from spacy.tokens import DocBin + + +def test_issue4367(): + """Test that docbin init goes well""" + doc_bin_1 = DocBin() + doc_bin_2 = DocBin(attrs=["LEMMA"]) + doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index b109ca0b2..de91a50b6 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,8 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import spacy + import pytest -from spacy.tokens import Doc + +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin from spacy.compat import path2str from ..util import make_tempdir @@ -57,3 +61,17 @@ def test_serialize_doc_exclude(en_vocab): doc.to_bytes(user_data=False) with pytest.raises(ValueError): Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False) + + +def test_serialize_doc_bin(): + doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True) + texts = ["Some text", "Lots of texts...", 
"..."] + nlp = English() + for doc in nlp.pipe(texts): + doc_bin.add(doc) + bytes_data = doc_bin.to_bytes() + + # Deserialize later, e.g. in a new process + nlp = spacy.blank("en") + doc_bin = DocBin().from_bytes(bytes_data) + docs = list(doc_bin.get_docs(nlp.vocab)) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 634d7450a..67ad9a21a 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import numpy -import gzip +import zlib import srsly from thinc.neural.ops import NumpyOps from ..compat import copy_reg from ..tokens import Doc -from ..attrs import SPACY, ORTH, intify_attrs +from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors @@ -53,7 +53,7 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#init """ attrs = attrs or [] - attrs = sorted(intify_attrs(attrs)) + attrs = sorted([intify_attr(attr) for attr in attrs]) self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.tokens = [] @@ -142,7 +142,7 @@ class DocBin(object): } if self.store_user_data: msg["user_data"] = self.user_data - return gzip.compress(srsly.msgpack_dumps(msg)) + return zlib.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, bytes_data): """Deserialize the DocBin's annotations from a bytestring. 
@@ -152,7 +152,7 @@ class DocBin(object): DOCS: https://spacy.io/api/docbin#from_bytes """ - msg = srsly.msgpack_loads(gzip.decompress(bytes_data)) + msg = srsly.msgpack_loads(zlib.decompress(bytes_data)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.fromstring(msg["lengths"], dtype="int32") diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index fe2f4868f..c7578a8df 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -84,7 +84,7 @@ texts = ["Some text", "Lots of texts...", "..."] nlp = spacy.load("en_core_web_sm") for doc in nlp.pipe(texts): doc_bin.add(doc) -bytes_data = docbin.to_bytes() +bytes_data = doc_bin.to_bytes() # Deserialize later, e.g. in a new process nlp = spacy.blank("en")