diff --git a/.appveyor.yml b/.appveyor.yml
index 8f0a21967..4dcd75e9c 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -8,16 +8,24 @@ environment:
   matrix:
 
     # Python 2.7.10 is the latest version and is not pre-installed.
 
     - PYTHON: "C:\\Python27.10-x64"
       PYTHON_VERSION: "2.7.10"
       PYTHON_ARCH: "64"
 
+    - PYTHON: "C:\\Python27.10-x32"
+      PYTHON_VERSION: "2.7.10"
+      PYTHON_ARCH: "32"
+
+    # The lastest Python 3.4.
     - PYTHON: "C:\\Python34-x64"
       PYTHON_VERSION: "3.4.x" # currently 3.4.3
       PYTHON_ARCH: "64"
 
+    #- PYTHON: "C:\\Python34-x32"
+    #  PYTHON_VERSION: "3.4.x" # currently 3.4.3
+    #  PYTHON_ARCH: "32"
+
+
 install:
   # Install Python (from the official .msi of http://python.org) and pip when
   # not already installed.
@@ -30,10 +38,11 @@ install:
   - "SET PYTHONPATH=%CD%;%PYTHONPATH%"
 
   # Filesystem root
-  # - ps: "ls \"C:/\""
+  #- ps: "ls \"C:/\""
+  #- SET
 
   # Installed SDKs
-  # - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
+  #- ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
 
   # Checking stdint.h
   #- ps: "ls \"C:/projects/spacy/include/\""
diff --git a/README.md b/README.md
index ad384fd2b..8eb39ba01 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,6 @@
+
+
+
 spaCy: Industrial-strength NLP
 ==============================
 
@@ -49,3 +52,6 @@ Difficult to support:
 
 * PyPy 2.7
 * PyPy 3.4
+
+
+
diff --git a/bin/init_model.py b/bin/init_model.py
index eb07f6494..991b5dd58 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -191,7 +191,8 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
         else:
             lexeme.cluster = 0
     vocab.dump(str(dst_dir / 'lexemes.bin'))
-    vocab.strings.dump(str(dst_dir / 'strings.txt'))
+    with (dst_dir / 'strings.json').open('w') as file_:
+        vocab.strings.dump(file_)
     with (dst_dir / 'oov_prob').open('w') as file_:
         file_.write('%f' % oov_prob)
 
diff --git a/setup.py b/setup.py
index 090b80f96..48e72ff99 100644
--- a/setup.py
+++ b/setup.py
@@ -129,11 +129,13 @@ def cython_setup(mod_names, language, includes):
         version=VERSION,
         url="http://honnibal.github.io/spaCy/",
         package_data={"spacy": ["*.pxd"],
+                      "spacy.tokens": ["*.pxd"],
+                      "spacy.serialize": ["*.pxd"],
                       "spacy.en": ["*.pxd", "data/pos/*",
                                    "data/wordnet/*", "data/tokenizer/*",
                                    "data/vocab/tag_map.json",
                                    "data/vocab/lexemes.bin",
-                                   "data/vocab/strings.txt"],
+                                   "data/vocab/strings.json"],
                       "spacy.syntax": ["*.pxd"]},
         ext_modules=exts,
         cmdclass={'build_ext': build_ext_cython_subclass},
@@ -175,7 +177,7 @@ def run_setup(exts):
     headers_workaround.install_headers('numpy')
 
 
-VERSION = '0.96'
+VERSION = '0.97'
 def main(modules, is_pypy):
     language = "cpp"
     includes = ['.', path.join(sys.prefix, 'include')]
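Note on the serialization change driving the init_model.py and setup.py edits above: the vocabulary's string table is no longer written to the custom strings.txt format but dumped as a single JSON array through an open file handle, and the packaged data file is renamed to strings.json accordingly. A minimal sketch of the new build-side call (the dump_strings helper and the plain string path are illustrative, not part of the patch):

# Illustrative helper (not in the patch): write the vocab's string table the
# new way, as one JSON list, via an open file object rather than a path.
import io

def dump_strings(vocab, dst_dir):
    with io.open(dst_dir + '/strings.json', 'w', encoding='utf8') as file_:
        # StringStore.dump() now expects a file handle; the output looks like
        # ["", "the", "dog", ...] with the empty string at index 0.
        vocab.strings.dump(file_)

diff --git a/spacy/en/download.py b/spacy/en/download.py
index b95288422..748e0542d 100644
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -7,13 +7,15 @@ import plac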
 
 from . import uget
 
+
 try:
     FileExistsError
 except NameError:
     FileExistsError = Exception
 
+
 # TODO: Read this from the same source as the setup
-VERSION = '0.9.5'
+VERSION = '0.9.6'
 
 AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
diff --git a/spacy/language.py b/spacy/language.py
index 65425bc45..691b3e97e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1,5 +1,6 @@
 from os import path
 from warnings import warn
+import io
 
 try:
     import ujson as json
@@ -247,7 +248,10 @@ class Language(object):
         self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
         self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
         self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
-        self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
+
+        strings_loc = path.join(data_dir, 'vocab', 'strings.json')
+        with io.open(strings_loc, 'w', encoding='utf8') as file_:
+            self.vocab.strings.dump(file_)
 
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
             file_.write(
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index c1d296d7c..08e511f68 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -19,7 +19,7 @@ class Lemmatizer(object):
             index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
             exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
         if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')):
-            rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
+            rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8'))
         else:
             rules = {}
         return cls(index, exc, rules)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 2208d3bdf..cf9c224e6 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -12,8 +12,15 @@ from libc.stdint cimport int64_t
 
 from .typedefs cimport hash_t, attr_t
 
+try:
+    import codecs as io
+except ImportError:
+    import io
 
-SEPARATOR = '\n|-SEP-|\n'
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 
 cpdef hash_t hash_string(unicode string) except 0:
@@ -114,7 +121,11 @@ cdef class StringStore:
     def __iter__(self):
         cdef int i
         for i in range(self.size):
-            yield self[i]
+            if i == 0:
+                yield u''
+            else:
+                utf8str = &self.c[i]
+                yield _decode(utf8str)
 
     def __reduce__(self):
         strings = [""]
@@ -138,28 +149,22 @@ cdef class StringStore:
             self.size += 1
             return &self.c[self.size-1]
 
-    def dump(self, loc):
-        cdef Utf8Str* string
-        cdef unicode py_string
-        cdef int i
-        with codecs.open(loc, 'w', 'utf8') as file_:
-            for i in range(1, self.size):
-                string = &self.c[i]
-                py_string = _decode(string)
-                file_.write(py_string)
-                if (i+1) != self.size:
-                    file_.write(SEPARATOR)
+    def dump(self, file_):
+        string_data = json.dumps([s for s in self])
+        if not isinstance(string_data, unicode):
+            string_data = string_data.decode('utf8')
+        file_.write(string_data)
 
-    def load(self, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            strings = file_.read().split(SEPARATOR)
+    def load(self, file_):
+        strings = json.load(file_)
         if strings == ['']:
             return None
         cdef unicode string
         cdef bytes byte_string
         for string in strings:
-            byte_string = string.encode('utf8')
-            self.intern(byte_string, len(byte_string))
+            if string:
+                byte_string = string.encode('utf8')
+                self.intern(byte_string, len(byte_string))
 
     def _realloc(self):
         # We want to map straight to pointers, but they'll be invalidated if
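The StringStore changes above drop the '\n|-SEP-|\n' separator format in favour of a JSON list, and dump()/load() now take open file objects instead of paths. A minimal round-trip sketch (the /tmp path is illustrative), mirroring the test_dump_load test added at the end of this patch:

import io
from spacy.strings import StringStore

store = StringStore()
id_ = store[u'example']                  # interning returns an integer id
with io.open('/tmp/strings.json', 'w', encoding='utf8') as file_:
    store.dump(file_)                    # writes a JSON array of strings

fresh = StringStore()
with io.open('/tmp/strings.json', 'r', encoding='utf8') as file_:
    fresh.load(file_)
assert fresh[id_] == u'example'          # ids survive the round-trip

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 9d60d2a6e..c07e87bbc 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd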
@@ -19,6 +19,7 @@ cdef class Tokenizer:
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
+    cdef object _rules
 
     cpdef Doc tokens_from_list(self, list strings)
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ef9c26c01..f0d664c09 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -29,6 +29,16 @@ cdef class Tokenizer:
         self._infix_re = infix_re
         self.vocab = vocab
         self._load_special_tokenization(rules)
+        self._rules = rules
+
+    def __reduce__(self):
+        args = (self.vocab,
+                self._rules,
+                self._prefix_re,
+                self._suffix_re,
+                self._infix_re)
+
+        return (self.__class__, args, None, None)
 
     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index c0cc6803b..93be3e363 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -120,6 +120,9 @@ cdef class Doc:
     def __str__(self):
         return u''.join([t.string for t in self])
 
+    def __repr__(self):
+        return u''.join([t.string for t in self])
+
     def similarity(self, other):
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index e8d2f2e59..e1b881f79 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -46,6 +46,12 @@ cdef class Span:
             return 0
         return self.end - self.start
 
+    def __repr__(self):
+        text = self.text_with_ws
+        if self[-1].whitespace_:
+            text = text[:-1]
+        return text
+
     def __getitem__(self, object i):
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a7447fb79..cce8eeeb4 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -43,6 +43,9 @@ cdef class Token:
     def __str__(self):
         return self.string
 
+    def __repr__(self):
+        return self.string
+
     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
         return Lexeme.c_check_flag(self.c.lex, flag_id)
 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index aec01f239..f14a8e8c6 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -62,7 +62,9 @@ cdef class Vocab:
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
+        with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_:
+            self.strings.load(file_)
+        self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
         if path.exists(path.join(data_dir, 'vec.bin')):
             self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin'))
         return self
@@ -106,11 +108,12 @@ cdef class Vocab:
         # TODO: Dump vectors
         tmp_dir = tempfile.mkdtemp()
         lex_loc = path.join(tmp_dir, 'lexemes.bin')
-        str_loc = path.join(tmp_dir, 'strings.txt')
+        str_loc = path.join(tmp_dir, 'strings.json')
         vec_loc = path.join(self.data_dir, 'vec.bin') if self.data_dir is not None else None
 
         self.dump(lex_loc)
-        self.strings.dump(str_loc)
+        with io.open(str_loc, 'w', encoding='utf8') as file_:
+            self.strings.dump(file_)
 
         state = (str_loc, lex_loc, vec_loc, self.morphology, self.get_lex_attr,
                  self.serializer_freqs, self.data_dir)
@@ -250,8 +253,7 @@ cdef class Vocab:
             fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1)
         fp.close()
 
-    def load_lexemes(self, strings_loc, loc):
-        self.strings.load(strings_loc)
+    def load_lexemes(self, loc):
         if not path.exists(loc):
             raise IOError('LexemeCs file not found at %s' % loc)
         fp = CFile(loc, 'rb')
@@ -369,7 +371,9 @@ def unpickle_vocab(strings_loc, lex_loc, vec_loc, morphology, get_lex_attr,
     vocab.data_dir = data_dir
     vocab.serializer_freqs = serializer_freqs
-    vocab.load_lexemes(strings_loc, lex_loc)
+    with io.open(strings_loc, 'r', encoding='utf8') as file_:
+        vocab.strings.load(file_)
+    vocab.load_lexemes(lex_loc)
     if vec_loc is not None:
         vocab.load_vectors_from_bin_loc(vec_loc)
     return vocab
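The Tokenizer now keeps its special-case rules on self._rules and implements __reduce__, which is what lets the pipeline objects above be pickled. A rough sketch of the effect (assumes the English model data is installed), mirroring the tokenizer test added further down:

import io
import pickle

import cloudpickle
from spacy.en import English

nlp = English()                          # loads vocab, tokenizer, etc.
file_ = io.BytesIO()
cloudpickle.dump(nlp.tokenizer, file_)   # serialized via Tokenizer.__reduce__
file_.seek(0)
loaded_tokenizer = pickle.load(file_)
assert loaded_tokenizer is not None
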
diff --git a/tests/test_pickle.py b/tests/test_pickle.py
index a3d54c627..540e54486 100644
--- a/tests/test_pickle.py
+++ b/tests/test_pickle.py
@@ -1,8 +1,9 @@
-import pytest
-import io
 import cloudpickle
+import io
+import os
 import pickle
-
+import pytest
+import tempfile
 
 @pytest.mark.models
 def test_pickle_english(EN):
@@ -12,4 +13,15 @@ def test_pickle_english(EN):
     file_.seek(0)
 
     loaded = pickle.load(file_)
+    assert loaded is not None
 
+@pytest.mark.models
+def test_cloudpickle_to_file(EN):
+    f = tempfile.NamedTemporaryFile(delete=False)
+    p = cloudpickle.CloudPickler(f)
+    p.dump(EN)
+    f.close()
+    loaded_en = cloudpickle.load(open(f.name))
+    os.unlink(f.name)
+    doc = loaded_en(unicode('test parse'))
+    assert len(doc) == 2
diff --git a/tests/tokenizer/test_tokenizer.py b/tests/tokenizer/test_tokenizer.py
index abf09dd03..be93b9953 100644
--- a/tests/tokenizer/test_tokenizer.py
+++ b/tests/tokenizer/test_tokenizer.py
@@ -2,6 +2,19 @@ from __future__ import unicode_literals
 
 import pytest
 
+import io
+import pickle
+import cloudpickle
+import tempfile
+
+
+@pytest.mark.models
+def test_pickle(en_tokenizer):
+    file_ = io.BytesIO()
+    cloudpickle.dump(en_tokenizer, file_)
+    file_.seek(0)
+    loaded = pickle.load(file_)
+    assert loaded is not None
 
 
 def test_no_word(en_tokenizer):
@@ -108,7 +121,7 @@ def test_cnts5(en_tokenizer):
 #    text = """Today is Tuesday.Mr."""
 #    tokens = en_tokenizer(text)
 #    assert len(tokens) == 5
-#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.'] 
+#    assert [w.orth_ for w in tokens] == ['Today', 'is', 'Tuesday', '.', 'Mr.']
 
 
 def test_cnts6(en_tokenizer):
diff --git a/tests/vocab/test_intern.py b/tests/vocab/test_intern.py
index 15fd6870f..2c2cfd4f1 100644
--- a/tests/vocab/test_intern.py
+++ b/tests/vocab/test_intern.py
@@ -1,12 +1,13 @@
 # -*- coding: utf8 -*-
 from __future__ import unicode_literals
 
 import pickle
-import io
 
 from spacy.strings import StringStore
 
 import pytest
 
+import io
+
 
 @pytest.fixture
 def sstore():
@@ -92,4 +93,12 @@ def test_pickle_string_store(sstore):
 
     assert loaded[hello_id] == u'Hi'
 
-
+def test_dump_load(sstore):
+    id_ = sstore[u'qqqqq']
+    loc = '/tmp/sstore.json'
+    with io.open(loc, 'w', encoding='utf8') as file_:
+        sstore.dump(file_)
+    new_store = StringStore()
+    with io.open(loc, 'r', encoding='utf8') as file_:
+        new_store.load(file_)
+    assert new_store[id_] == u'qqqqq'
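Taken together, the new tests exercise the point of this patch: a fully loaded English pipeline, including its vocab and JSON-backed string store, survives pickling. A rough end-to-end usage sketch (assumes cloudpickle and the downloaded model data are available):

import io
import pickle

import cloudpickle
from spacy.en import English

nlp = English()
file_ = io.BytesIO()
cloudpickle.dump(nlp, file_)     # Vocab, StringStore and Tokenizer all pickle now
file_.seek(0)
nlp2 = pickle.load(file_)
doc = nlp2(u'test parse')
assert len(doc) == 2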