Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-10-29 15:07:54 +03:00 · 2017-06-02 04:26:39 -05:00 · 2017-06-02 04:26:39 -05:00 · ed6f575e06
commit ed6f575e06
parent c650bc481c acd65c00f6
4 changed files with 142 additions and 4 deletions
--- a/spacy/tests/serialize/test_serialize_stringstore.py
+++ b/spacy/tests/serialize/test_serialize_stringstore.py
@ -0,0 +1,46 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from ..util import make_tempdir
 from ...strings import StringStore
 import pytest
 test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_stringstore_roundtrip_bytes(strings1,strings2):
    """Round-trip a StringStore through to_bytes() / from_bytes().

    Checks that two stores serialize identically exactly when they were
    built from equal string lists, that reloading a store from its own
    bytes leaves its serialization unchanged, and that a fresh store loaded
    from those bytes iterates over the original strings.
    """
    store_a = StringStore(strings=strings1)
    store_b = StringStore(strings=strings2)
    bytes_a = store_a.to_bytes()
    bytes_b = store_b.to_bytes()
    # Different inputs must give different payloads; equal inputs, equal ones.
    if strings1 != strings2:
        assert bytes_a != bytes_b
    else:
        assert bytes_a == bytes_b
    # Loading a store's own payload back into it must be a no-op.
    store_a = store_a.from_bytes(bytes_a)
    assert store_a.to_bytes() == bytes_a
    # A brand-new store populated from the payload must match the source.
    restored = StringStore().from_bytes(bytes_a)
    assert restored.to_bytes() == bytes_a
    assert list(restored) == strings1
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_stringstore_roundtrip_disk(strings1,strings2):
    """Round-trip two StringStores through to_disk() / from_disk().

    Each store is written to its own path inside a temporary directory and
    read back into a fresh StringStore. The reloaded stores must iterate
    like their sources, and must match each other only when the input
    string lists were equal.
    """
    store_a = StringStore(strings=strings1)
    store_b = StringStore(strings=strings2)
    with make_tempdir() as tmp_dir:
        path_a = tmp_dir / 'strings1'
        path_b = tmp_dir / 'strings2'
        store_a.to_disk(path_a)
        store_b.to_disk(path_b)
        loaded_a = StringStore().from_disk(path_a)
        loaded_b = StringStore().from_disk(path_b)
        # Every reloaded store reproduces the contents of its source.
        assert list(loaded_a) == list(store_a)
        assert list(loaded_b) == list(store_b)
        # The reloaded stores agree with each other iff the inputs did.
        if strings1 != strings2:
            assert list(loaded_a) != list(loaded_b)
        else:
            assert list(loaded_a) == list(loaded_b)
--- a/spacy/tests/serialize/test_serialize_vocab.py
+++ b/spacy/tests/serialize/test_serialize_vocab.py
@ -0,0 +1,73 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from ..util import make_tempdir
 from ...vocab import Vocab
 import pytest
 test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
 test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')]
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_vocab_roundtrip_bytes(strings1,strings2):
    """Round-trip a Vocab through to_bytes() / from_bytes().

    Checks that two vocabs serialize identically exactly when they were
    built from equal string lists, that reloading a vocab from its own
    bytes leaves its serialization unchanged, and that a fresh vocab loaded
    from those bytes holds one lexeme per original string.
    """
    vocab_a = Vocab(strings=strings1)
    vocab_b = Vocab(strings=strings2)
    bytes_a = vocab_a.to_bytes()
    bytes_b = vocab_b.to_bytes()
    # Different inputs must give different payloads; equal inputs, equal ones.
    if strings1 != strings2:
        assert bytes_a != bytes_b
    else:
        assert bytes_a == bytes_b
    # Loading a vocab's own payload back into it must be a no-op.
    vocab_a = vocab_a.from_bytes(bytes_a)
    assert vocab_a.to_bytes() == bytes_a
    # A brand-new vocab populated from the payload must match the source.
    restored = Vocab().from_bytes(bytes_a)
    assert restored.to_bytes() == bytes_a
    assert len(restored) == len(strings1)
    # Lexeme order is not compared, only the set of texts.
    restored_texts = sorted(lexeme.text for lexeme in restored)
    assert restored_texts == sorted(strings1)
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_vocab_roundtrip_disk(strings1,strings2):
    """Round-trip two Vocabs through to_disk() / from_disk().

    Each vocab is written to its own path inside a temporary directory and
    read back into a fresh Vocab. The reloaded vocabs must iterate like
    their sources, and must match each other only when the input string
    lists were equal.
    """
    vocab_a = Vocab(strings=strings1)
    vocab_b = Vocab(strings=strings2)
    with make_tempdir() as tmp_dir:
        path_a = tmp_dir / 'vocab1'
        path_b = tmp_dir / 'vocab2'
        vocab_a.to_disk(path_a)
        vocab_b.to_disk(path_b)
        loaded_a = Vocab().from_disk(path_a)
        loaded_b = Vocab().from_disk(path_b)
        # Every reloaded vocab reproduces the contents of its source.
        assert list(loaded_a) == list(vocab_a)
        assert list(loaded_b) == list(vocab_b)
        # The reloaded vocabs agree with each other iff the inputs did.
        if strings1 != strings2:
            assert list(loaded_a) != list(loaded_b)
        else:
            assert list(loaded_a) == list(loaded_b)
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    """Check that a custom `norm_` value survives to_bytes() / from_bytes().

    The first string's lexeme gets `norm_` set to `lex_attr` in the source
    vocab. A second, initially empty vocab must not report that value until
    it has been loaded from the source vocab's bytes.
    """
    word = strings[0]
    source = Vocab(strings=strings)
    target = Vocab()
    source[word].norm_ = lex_attr
    assert source[word].norm_ == lex_attr
    # Before loading, the target vocab must not already carry the value.
    assert target[word].norm_ != lex_attr
    payload = source.to_bytes()
    target = target.from_bytes(payload)
    assert target[word].norm_ == lex_attr
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    """Check that a custom `norm_` value survives to_disk() / from_disk().

    The first string's lexeme gets `norm_` set to `lex_attr` in the source
    vocab. A second, initially empty vocab must not report that value until
    it has been loaded from the directory the source vocab was saved to.
    """
    word = strings[0]
    source = Vocab(strings=strings)
    target = Vocab()
    source[word].norm_ = lex_attr
    assert source[word].norm_ == lex_attr
    # Before loading, the target vocab must not already carry the value.
    assert target[word].norm_ != lex_attr
    with make_tempdir() as tmp_dir:
        vocab_path = tmp_dir / 'vocab'
        source.to_disk(vocab_path)
        target = target.from_disk(vocab_path)
    # Checked after the temporary directory has been left, as in the original.
    assert target[word].norm_ == lex_attr
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -3,9 +3,14 @@ from __future__ import unicode_literals
 from ..tokens import Doc
 from ..attrs import ORTH, POS, HEAD, DEP
 from ..compat import path2str
 import pytest
 import numpy
 import tempfile
 import shutil
 import contextlib
 from pathlib import Path
 MODELS = {}
@ -19,6 +24,20 @@ def load_test_model(model):
    return MODELS[model]
@contextlib.contextmanager
def make_tempfile(mode='r'):
    """Context manager yielding a temporary file that is closed on exit.

    mode (str): Mode passed through to `tempfile.TemporaryFile`.
    YIELDS: The open temporary file object.

    The file is closed even when the body of the `with` block raises
    (for example a failing assertion), so the handle is never leaked.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Runs on normal exit and when an exception is thrown into the
        # generator at the yield point.
        f.close()
@contextlib.contextmanager
def make_tempdir():
    """Context manager yielding a temporary directory removed on exit.

    YIELDS (Path): Path of a directory freshly created by `tempfile.mkdtemp`.

    The directory tree is deleted even when the body of the `with` block
    raises (for example a failing assertion), so failed tests do not leave
    temporary directories behind.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Runs on normal exit and when an exception is thrown into the
        # generator at the yield point.
        shutil.rmtree(path2str(d))
 def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [''] * len(words)
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -54,17 +54,16 @@ cdef class Vocab:
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        self.length = 0
        if strings:
            for string in strings:
-                self.strings.add(string)
+                _ = self[string]
        for name in tag_map.keys():
            if name:
                self.strings.add(name)
        self.lex_attr_getters = lex_attr_getters
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
        self.length = 1
    property lang:
        def __get__(self):
            langfunc = None
@ -329,7 +328,8 @@ cdef class Vocab:
            ('strings', lambda b: self.strings.from_bytes(b)),
            ('lexemes', lambda b: self.lexemes_from_bytes(b)),
        ))
-        return util.from_bytes(bytes_data, setters, exclude)
+        util.from_bytes(bytes_data, setters, exclude)
        return self
    def lexemes_to_bytes(self):
        cdef hash_t key