Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-06-02 04:26:39 -05:00
commit ed6f575e06
4 changed files with 142 additions and 4 deletions

View File

@ -0,0 +1,46 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import make_tempdir
from ...strings import StringStore
import pytest
# Parametrize cases for the roundtrip tests below: each tuple is a
# (strings1, strings2) pair of string lists, one list per StringStore.
test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    """Check that a StringStore survives a to_bytes/from_bytes roundtrip.

    Two stores are built from ``strings1`` and ``strings2``. Their serialized
    bytes must be equal exactly when the input lists are equal. The first
    store is then reloaded from its own bytes, both in place and into a fresh
    StringStore, and must serialize back to the same bytes each time.
    """
    store_a = StringStore(strings=strings1)
    store_b = StringStore(strings=strings2)
    bytes_a = store_a.to_bytes()
    bytes_b = store_b.to_bytes()

    # Serialized forms agree if and only if the inputs agree.
    inputs_match = strings1 == strings2
    bytes_match = bytes_a == bytes_b
    assert bytes_match == inputs_match

    # Reloading a store from its own bytes must not change what it serializes to.
    store_a = store_a.from_bytes(bytes_a)
    assert store_a.to_bytes() == bytes_a

    # Loading the same bytes into an empty store must reproduce the contents.
    fresh_store = StringStore().from_bytes(bytes_a)
    assert fresh_store.to_bytes() == bytes_a
    assert list(fresh_store) == strings1
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    """Check that a StringStore survives a to_disk/from_disk roundtrip.

    Each store is written to its own path inside a temporary directory and
    read back into a fresh StringStore. The reloaded contents must match the
    originals, and the two reloaded stores must be equal exactly when the
    input lists are equal.
    """
    store_a = StringStore(strings=strings1)
    store_b = StringStore(strings=strings2)
    with make_tempdir() as tmp_dir:
        path_a = tmp_dir / 'strings1'
        path_b = tmp_dir / 'strings2'
        store_a.to_disk(path_a)
        store_b.to_disk(path_b)
        loaded_a = StringStore().from_disk(path_a)
        loaded_b = StringStore().from_disk(path_b)

        # Each reloaded store lists the same strings as its source.
        assert list(loaded_a) == list(store_a)
        assert list(loaded_b) == list(store_b)

        # Reloaded stores agree if and only if the inputs agree.
        if strings1 == strings2:
            assert list(loaded_a) == list(loaded_b)
        else:
            assert list(loaded_a) != list(loaded_b)

View File

@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import make_tempdir
from ...vocab import Vocab
import pytest
# Parametrize cases for the Vocab roundtrip tests: each tuple is a
# (strings1, strings2) pair of string lists, one list per Vocab.
test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
# Parametrize cases for the lexeme-attribute tests: each tuple is
# (strings, lex_attr); lex_attr is set as norm_ on the lexeme for strings[0].
test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')]
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    """Check that a Vocab survives a to_bytes/from_bytes roundtrip.

    Two vocabs are built from ``strings1`` and ``strings2``. Their serialized
    bytes must be equal exactly when the input lists are equal. The first
    vocab is then reloaded from its own bytes, both in place and into a fresh
    Vocab, and must serialize back to the same bytes and hold the same
    lexeme texts.
    """
    vocab_a = Vocab(strings=strings1)
    vocab_b = Vocab(strings=strings2)
    bytes_a = vocab_a.to_bytes()
    bytes_b = vocab_b.to_bytes()

    # Serialized forms agree if and only if the inputs agree.
    inputs_match = strings1 == strings2
    bytes_match = bytes_a == bytes_b
    assert bytes_match == inputs_match

    # Reloading a vocab from its own bytes must not change what it serializes to.
    vocab_a = vocab_a.from_bytes(bytes_a)
    assert vocab_a.to_bytes() == bytes_a

    # Loading the same bytes into an empty vocab must reproduce the lexemes.
    fresh_vocab = Vocab().from_bytes(bytes_a)
    assert fresh_vocab.to_bytes() == bytes_a
    assert len(fresh_vocab) == len(strings1)
    lexeme_texts = [lexeme.text for lexeme in fresh_vocab]
    assert sorted(lexeme_texts) == sorted(strings1)
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    """Check that a Vocab survives a to_disk/from_disk roundtrip.

    Each vocab is written to its own path inside a temporary directory and
    read back into a fresh Vocab. The reloaded contents must match the
    originals, and the two reloaded vocabs must be equal exactly when the
    input lists are equal.
    """
    vocab_a = Vocab(strings=strings1)
    vocab_b = Vocab(strings=strings2)
    with make_tempdir() as tmp_dir:
        path_a = tmp_dir / 'vocab1'
        path_b = tmp_dir / 'vocab2'
        vocab_a.to_disk(path_a)
        vocab_b.to_disk(path_b)
        loaded_a = Vocab().from_disk(path_a)
        loaded_b = Vocab().from_disk(path_b)

        # Each reloaded vocab iterates to the same entries as its source.
        assert list(loaded_a) == list(vocab_a)
        assert list(loaded_b) == list(vocab_b)

        # Reloaded vocabs agree if and only if the inputs agree.
        if strings1 == strings2:
            assert list(loaded_a) == list(loaded_b)
        else:
            assert list(loaded_a) != list(loaded_b)
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    """Check that a lexeme's ``norm_`` is carried through to_bytes/from_bytes.

    ``lex_attr`` is set as the norm of the first string's lexeme in one vocab.
    A second, initially empty vocab must not have that norm until it is
    loaded from the first vocab's bytes.
    """
    word = strings[0]
    source_vocab = Vocab(strings=strings)
    target_vocab = Vocab()

    source_vocab[word].norm_ = lex_attr
    assert source_vocab[word].norm_ == lex_attr
    # Before loading, the target vocab's lexeme has a different norm.
    assert target_vocab[word].norm_ != lex_attr

    serialized = source_vocab.to_bytes()
    target_vocab = target_vocab.from_bytes(serialized)
    assert target_vocab[word].norm_ == lex_attr
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    """Check that a lexeme's ``norm_`` is carried through to_disk/from_disk.

    ``lex_attr`` is set as the norm of the first string's lexeme in one vocab.
    A second, initially empty vocab must not have that norm until it is
    loaded from the directory the first vocab was written to.
    """
    word = strings[0]
    source_vocab = Vocab(strings=strings)
    target_vocab = Vocab()

    source_vocab[word].norm_ = lex_attr
    assert source_vocab[word].norm_ == lex_attr
    # Before loading, the target vocab's lexeme has a different norm.
    assert target_vocab[word].norm_ != lex_attr

    with make_tempdir() as tmp_dir:
        vocab_path = tmp_dir / 'vocab'
        source_vocab.to_disk(vocab_path)
        target_vocab = target_vocab.from_disk(vocab_path)
        assert target_vocab[word].norm_ == lex_attr

View File

@ -3,9 +3,14 @@ from __future__ import unicode_literals
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ORTH, POS, HEAD, DEP from ..attrs import ORTH, POS, HEAD, DEP
from ..compat import path2str
import pytest import pytest
import numpy import numpy
import tempfile
import shutil
import contextlib
from pathlib import Path
MODELS = {} MODELS = {}
@ -19,6 +24,20 @@ def load_test_model(model):
return MODELS[model] return MODELS[model]
@contextlib.contextmanager
def make_tempfile(mode='r'):
    """Context manager yielding a temporary file that is closed on exit.

    mode (str): Mode passed to ``tempfile.TemporaryFile``.
    YIELDS: The open temporary file object.

    The file is closed even if the body of the ``with`` block raises, so a
    failing test does not leave the file handle open.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Runs on normal exit and when an exception is thrown in at the yield.
        f.close()
@contextlib.contextmanager
def make_tempdir():
    """Context manager yielding a temporary directory that is removed on exit.

    YIELDS (Path): Path of a directory freshly created with ``tempfile.mkdtemp``.

    The directory tree is removed even if the body of the ``with`` block
    raises (e.g. a failing assertion in a test), so temporary directories do
    not accumulate after test failures.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Runs on normal exit and when an exception is thrown in at the yield.
        shutil.rmtree(path2str(d))
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
"""Create Doc object from given vocab, words and annotations.""" """Create Doc object from given vocab, words and annotations."""
pos = pos or [''] * len(words) pos = pos or [''] * len(words)

View File

@ -54,17 +54,16 @@ cdef class Vocab:
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
self.length = 0
if strings: if strings:
for string in strings: for string in strings:
self.strings.add(string) _ = self[string]
for name in tag_map.keys(): for name in tag_map.keys():
if name: if name:
self.strings.add(name) self.strings.add(name)
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.length = 1
property lang: property lang:
def __get__(self): def __get__(self):
langfunc = None langfunc = None
@ -329,7 +328,8 @@ cdef class Vocab:
('strings', lambda b: self.strings.from_bytes(b)), ('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)),
)) ))
return util.from_bytes(bytes_data, setters, exclude) util.from_bytes(bytes_data, setters, exclude)
return self
def lexemes_to_bytes(self): def lexemes_to_bytes(self):
cdef hash_t key cdef hash_t key