From 36517516ef093bea4b517fda5c87790ea377b4c9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 03:21:26 +1100 Subject: [PATCH 01/33] * Replace deprecated repvec reference in twitter-filter --- website/src/jade/tutorials/twitter-filter/index.jade | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/src/jade/tutorials/twitter-filter/index.jade b/website/src/jade/tutorials/twitter-filter/index.jade index c322612e4..2a6f19823 100644 --- a/website/src/jade/tutorials/twitter-filter/index.jade +++ b/website/src/jade/tutorials/twitter-filter/index.jade @@ -80,10 +80,10 @@ include ./meta.jade | | def match_tweet(spacy, text, query): | def get_vector(word): - | return spacy.vocab[word].repvec + | return spacy.vocab[word].vector | | tweet = spacy(text) - | tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] + | tweet = [w.vector for w in tweet if w.is_alpha and w.lower_ != query] | if tweet: | accept = map(get_vector, 'Jeb Cheney Republican 9/11h'.split()) | reject = map(get_vector, 'garden Reggie hairy'.split()) @@ -147,11 +147,11 @@ include ./meta.jade pre.language-python: code | def handle_tweet(spacy, resp, query): | def get_vector(word): - | return spacy.vocab[word].repvec + | return spacy.vocab[word].vector | | text = resp.get('text', '').decode('utf8') | tweet = spacy(text) - | tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query] + | tweet = [w.vector for w in tweet if w.is_alpha and w.lower_ != query] | if tweet: | accept = map(get_vector, 'Jeb Cheney Republican 9/11h'.split()) | reject = map(get_vector, 'garden Reggie hairy'.split()) From f7dd377575d4a68d7e2d83dcc10b4a9ff25b465b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 13:23:22 +1100 Subject: [PATCH 02/33] * Adjust conjuncts iterator in Token --- spacy/tokens/token.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cce8eeeb4..92a32b8a2 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -253,14 +253,11 @@ cdef class Token: def __get__(self): """Get a list of conjoined words.""" cdef Token word - conjuncts = [] if self.dep_ != 'conj': for word in self.rights: if word.dep_ == 'conj': yield word yield from word.conjuncts - conjuncts.append(word) - conjuncts.extend(word.conjuncts) property ent_type: def __get__(self): From 5887506f5deeeb187b1e92193322f3b66c5a0b46 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 13:23:39 +1100 Subject: [PATCH 03/33] * Don't expect lexemes.bin in Vocab --- spacy/vocab.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index cda1d6ddb..aa2a069dd 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -62,9 +62,11 @@ cdef class Vocab: cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_: - self.strings.load(file_) - self.load_lexemes(path.join(data_dir, 'lexemes.bin')) + if path.exists(path.join(data_dir, 'strings.json')): + with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_: + self.strings.load(file_) + self.load_lexemes(path.join(data_dir, 'lexemes.bin')) + if path.exists(path.join(data_dir, 'vec.bin')): self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin')) return self From 5ca31e05fb4d128dfdfd9a51fecf1d5f19b71ef3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 13:30:37 +1100 Subject: [PATCH 04/33] * Prune down package data, as models are distributed entirely within the data download. --- setup.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 1d719c626..4957cb41c 100644 --- a/setup.py +++ b/setup.py @@ -166,12 +166,13 @@ def run_setup(exts): version=VERSION, url="http://honnibal.github.io/spaCy/", package_data={"spacy": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/lexemes.bin", - "data/vocab/serializer.json", - "data/vocab/oov_prob", - "data/vocab/strings.txt"], + "spacy.tokens": ["*.pxd"], + "spacy.serialize": ["*.pxd"], + "spacy.en": ["*.pxd", + "data/wordnet/*.exc", + "data/wordnet/index.*", + "data/tokenizer/*", + "data/vocab/serializer.json"], "spacy.syntax": ["*.pxd"]}, ext_modules=exts, license="MIT", From 5668feb235dae456843507e7054508f55b79e8c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 04:57:02 +0100 Subject: [PATCH 05/33] * Fix pickle test for python3 --- spacy/tests/test_pickle.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_pickle.py b/spacy/tests/test_pickle.py index 540e54486..2577f58a6 100644 --- a/spacy/tests/test_pickle.py +++ b/spacy/tests/test_pickle.py @@ -5,6 +5,11 @@ import pickle import pytest import tempfile +try: + unicode +except NameError: + unicode = str + @pytest.mark.models def test_pickle_english(EN): file_ = io.BytesIO() @@ -21,7 +26,7 @@ def test_cloudpickle_to_file(EN): p = cloudpickle.CloudPickler(f) p.dump(EN) f.close() - loaded_en = cloudpickle.load(open(f.name)) + loaded_en = cloudpickle.load(open(f.name, 'rb')) os.unlink(f.name) doc = loaded_en(unicode('test parse')) assert len(doc) == 2 From bb5598b816767b1ed8d69fe420ed216d43ce5871 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 05:32:18 +0100 Subject: [PATCH 06/33] * Fix test command in fabfile --- fabfile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fabfile.py b/fabfile.py index 8def13386..8f0097615 100644 --- a/fabfile.py +++ b/fabfile.py @@ -121,9 +121,8 @@ def clean(): def test(): with virtualenv(VENV_DIR): - # Run each test file separately. pytest is performing poorly, not sure why with lcd(path.dirname(__file__)): - local('py.test -x tests/') + local('py.test -x --models spacy/tests') def train(json_dir=None, dev_loc=None, model_dir=None): From 3656f06e35dac681b736644692bc04534a5dd932 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 06:39:30 +0100 Subject: [PATCH 07/33] * Don't use models in fab test --- fabfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabfile.py b/fabfile.py index 8f0097615..f7eac3f28 100644 --- a/fabfile.py +++ b/fabfile.py @@ -122,7 +122,7 @@ def clean(): def test(): with virtualenv(VENV_DIR): with lcd(path.dirname(__file__)): - local('py.test -x --models spacy/tests') + local('py.test -x spacy/tests') def train(json_dir=None, dev_loc=None, model_dir=None): From 64531d5a3ae545c15b7bbc103318ccd0f43d82cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:07:43 +1100 Subject: [PATCH 08/33] * Define package_data in one place --- setup.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 4957cb41c..bdbc463a4 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,21 @@ from distutils.command.build_ext import build_ext import platform +PACKAGE_DATA = { + "spacy": ["*.pxd"], + "spacy.tokens": ["*.pxd"], + "spacy.serialize": ["*.pxd"], + "spacy.syntax": ["*.pxd"], + "spacy.en": [ + "*.pxd", + "data/wordnet/*.exc", + "data/wordnet/index.*", + "data/tokenizer/*", + "data/vocab/serializer.json" + ] +} + + # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used compile_options = {'msvc' : ['/Ox', '/EHsc'] , @@ -81,6 +96,8 @@ except OSError: pass + + def clean(mod_names): for name in mod_names: name = name.replace('.', '/') @@ -128,15 +145,7 @@ def cython_setup(mod_names, language, includes): author_email='honnibal@gmail.com', version=VERSION, url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"], - "spacy.tokens": ["*.pxd"], - "spacy.serialize": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/tag_map.json", - "data/vocab/lexemes.bin", - "data/vocab/strings.json"], - "spacy.syntax": ["*.pxd"]}, + package_data=PACKAGE_DATA, ext_modules=exts, cmdclass={'build_ext': build_ext_cython_subclass}, license="MIT", @@ -165,15 +174,7 @@ def run_setup(exts): author_email='honnibal@gmail.com', version=VERSION, url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd"], - "spacy.tokens": ["*.pxd"], - "spacy.serialize": ["*.pxd"], - "spacy.en": ["*.pxd", - "data/wordnet/*.exc", - "data/wordnet/index.*", - "data/tokenizer/*", - "data/vocab/serializer.json"], - "spacy.syntax": ["*.pxd"]}, + package_data=PACKAGE_DATA, ext_modules=exts, license="MIT", install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43', From 8bde2bba58b7c685d1f09faebfa3ddda5232e82c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 07:11:59 +0100 Subject: [PATCH 09/33] * Fiddle with prebuild command --- fabfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fabfile.py b/fabfile.py index f7eac3f28..0fb1e80a6 100644 --- a/fabfile.py +++ b/fabfile.py @@ -54,10 +54,10 @@ def prebuild(build_dir='/tmp/build_spacy'): local('pip install --no-cache-dir -r requirements.txt') local('fab clean make') local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) - local('cp %s/corpora/en/freqs.txt.gz corpora/en/' % spacy_dir) local('PYTHONPATH=`pwd` python bin/init_model.py en lang_data corpora spacy/en/data') local('fab test') - local('python setup.py sdist') + local('PYTHONPATH=`pwd` python spacy.en.download --force all en') + local('py.test --models spacy/tests/') def docs(): From 2714fb47331d05d99113bbbc79d5cff244d38433 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 07:21:03 +0100 Subject: [PATCH 10/33] * Fix prebuild command --- fabfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabfile.py b/fabfile.py index 0fb1e80a6..5fd934820 100644 --- a/fabfile.py +++ b/fabfile.py @@ -56,7 +56,7 @@ def prebuild(build_dir='/tmp/build_spacy'): local('cp -r %s/corpora/en/wordnet corpora/en/' % spacy_dir) local('PYTHONPATH=`pwd` python bin/init_model.py en lang_data corpora spacy/en/data') local('fab test') - local('PYTHONPATH=`pwd` python spacy.en.download --force all en') + local('PYTHONPATH=`pwd` python -m spacy.en.download --force all') local('py.test --models spacy/tests/') From 5e040855a5c5e7725fd875e4b85e38d53e113796 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:56:50 +1100 Subject: [PATCH 11/33] * Ensure morphological features and lemmas are loaded in from_array, re Issue #152 --- spacy/tests/serialize/test_io.py | 12 ++++++++++++ spacy/tokens/doc.pyx | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py index a64d0cabc..4157ee309 100644 --- a/spacy/tests/serialize/test_io.py +++ b/spacy/tests/serialize/test_io.py @@ -38,3 +38,15 @@ def test_left_right(EN): for child in word.rights: assert child.head.i == word.i + +@pytest.mark.models +def test_lemmas(EN): + orig = EN(u'The geese are flying') + result = Doc(orig.vocab).from_bytes(orig.to_bytes()) + the, geese, are, flying = result + assert the.lemma_ == 'the' + assert geese.lemma_ == 'goose' + assert are.lemma_ == 'be' + assert flying.lemma_ == 'fly' + + diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..2ad1a1d4a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -398,7 +398,7 @@ cdef class Doc: self.is_parsed = True elif attr_id == TAG: for i in range(length): - tokens[i].tag = values[i] + self.vocab.morphology.assign_tag(&tokens[i], values[i]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: @@ -413,6 +413,8 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] + else: + raise ValueError("Unknown attribute ID: %d" % attr_id) set_children_from_heads(self.data, self.length) return self From 604ceac4c651b7263d252f1bb53abea2f27a6739 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 17:57:51 +1100 Subject: [PATCH 12/33] * Fix morphological assignment in doc.merge() --- spacy/tokens/doc.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2ad1a1d4a..7a8822b5f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -471,8 +471,7 @@ cdef class Doc: # Update fields token.lex = lex token.spacy = self.data[end-1].spacy - # What to do about morphology?? - # TODO: token.morph = ??? + self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) token.tag = self.vocab.strings[tag] token.lemma = self.vocab.strings[lemma] if ent_type == 'O': From 7adef3f8316d4acfe5862cbcc91cb1d9f2700287 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 07:58:59 +0100 Subject: [PATCH 13/33] * Increment version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bdbc463a4..e65ab9fbe 100644 --- a/setup.py +++ b/setup.py @@ -191,7 +191,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.97' +VERSION = '0.98' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 3f44b3e43f62630493975c022934ea8df0fd7383 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 18:07:08 +1100 Subject: [PATCH 14/33] * Mark serializer test as requiring models --- spacy/tests/serialize/test_packer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 6ec583d08..1e79c8247 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -64,6 +64,7 @@ def test_packer_unannotated(tokenizer): assert result.string == 'the dog jumped' +@pytest.mark.models def test_packer_annotated(tokenizer): vocab = tokenizer.vocab nn = vocab.strings['NN'] From 09664177d79364a04c0ea588704884533ba1ce9c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 18:14:53 +1100 Subject: [PATCH 15/33] * Fix tag handling in doc.merge, and assign sent_start when setting heads. --- spacy/tokens/doc.pyx | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7a8822b5f..9c3387059 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -471,7 +471,10 @@ cdef class Doc: # Update fields token.lex = lex token.spacy = self.data[end-1].spacy - self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) + if tag in self.vocab.morphology.tag_map: + self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) + else: + token.tag = self.vocab.strings[tag] token.tag = self.vocab.strings[tag] token.lemma = self.vocab.strings[lemma] if ent_type == 'O': @@ -545,3 +548,9 @@ cdef int set_children_from_heads(TokenC* tokens, int length) except -1: if child.r_edge > head.r_edge: head.r_edge = child.r_edge head.r_kids += 1 + + # Set sentence starts + for i in range(length): + if tokens[i].head == 0 and tokens[i].dep != 0: + tokens[tokens[i].l_edge].sent_start = True + From 833eb35c57703b3a265febd57ebc2dfe1f1736b2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 18:45:54 +1100 Subject: [PATCH 16/33] * Fix tag assignment in doc.from_array --- spacy/morphology.pyx | 2 ++ spacy/tokens/doc.pyx | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e8b1f3520..442aebd68 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -38,6 +38,8 @@ cdef class Morphology: tag_id = self.reverse_index[self.strings[tag]] else: tag_id = tag + if tag_id >= self.n_tags: + raise ValueError("Unknown tag: %s" % tag) analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 9c3387059..6b14d761c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -398,7 +398,8 @@ cdef class Doc: self.is_parsed = True elif attr_id == TAG: for i in range(length): - self.vocab.morphology.assign_tag(&tokens[i], values[i]) + self.vocab.morphology.assign_tag(&tokens[i], + self.vocab.strings[values[i]]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: From 85372468e361f2e6bdeb10cd6c3c080775d9f0ab Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 08:51:33 +0100 Subject: [PATCH 17/33] * Fix serialize test --- spacy/tests/serialize/test_io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py index 4157ee309..f90bb20c2 100644 --- a/spacy/tests/serialize/test_io.py +++ b/spacy/tests/serialize/test_io.py @@ -44,7 +44,6 @@ def test_lemmas(EN): orig = EN(u'The geese are flying') result = Doc(orig.vocab).from_bytes(orig.to_bytes()) the, geese, are, flying = result - assert the.lemma_ == 'the' assert geese.lemma_ == 'goose' assert are.lemma_ == 'be' assert flying.lemma_ == 'fly' From ffedff9e6c544b4becb455e3b483bed0fea32ff6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 18:54:05 +1100 Subject: [PATCH 18/33] * Remove the archive after download, to save disk space --- spacy/en/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/en/download.py b/spacy/en/download.py index 748e0542d..1cc029c09 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -38,6 +38,7 @@ def install_data(url, extract_path, download_path): assert tmp == download_path t = tarfile.open(download_path) t.extractall(extract_path) + os.unlink(download_path) @plac.annotations( From dde9e1357cf9599a63d3dec806f96faddbae9df1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 18:54:35 +1100 Subject: [PATCH 19/33] * Add todo to morphology.lemmatize --- spacy/morphology.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 442aebd68..dff9d39e2 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -88,6 +88,8 @@ cdef class Morphology: return orth cdef unicode py_string = self.strings[orth] if pos != NOUN and pos != VERB and pos != ADJ and pos != PUNCT: + # TODO: This should lower-case + # return self.strings[py_string.lower()] return orth cdef set lemma_strings cdef unicode lemma_string From 9e37437ba82995c91d0037ca01e70cd50a571d5f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 19:07:02 +1100 Subject: [PATCH 20/33] * Fix assign_tag in doc.merge --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 6b14d761c..927c01147 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -473,7 +473,7 @@ cdef class Doc: token.lex = lex token.spacy = self.data[end-1].spacy if tag in self.vocab.morphology.tag_map: - self.vocab.morphology.assign_tag(token, self.vocab.strings[tag]) + self.vocab.morphology.assign_tag(token, tag) else: token.tag = self.vocab.strings[tag] token.tag = self.vocab.strings[tag] From d06ba2637132eea686aa58afcdebde8e84abe78f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 19:43:27 +1100 Subject: [PATCH 21/33] * Fix test of serializer --- spacy/tests/serialize/test_packer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 1e79c8247..b1f290589 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -20,7 +20,7 @@ from spacy.serialize.bits import BitArray @pytest.fixture def vocab(): - vocab = Vocab(Language.default_lex_attrs()) + vocab = Language.default_vocab() lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] From ee3f9ba58141ef5e2e1c7d7cd2f13282751221bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 19:45:16 +1100 Subject: [PATCH 22/33] * Fix test of serializer --- spacy/tests/serialize/test_packer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index b1f290589..e0d24208a 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -6,6 +6,7 @@ import pytest import numpy from spacy.language import Language +from spacy.en import English from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer @@ -20,7 +21,7 @@ from spacy.serialize.bits import BitArray @pytest.fixture def vocab(): - vocab = Language.default_vocab() + vocab = English.default_vocab() lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] From f81389abe06df0e1b8796c20bc8705d82fcb49c2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 23:12:13 +1100 Subject: [PATCH 23/33] * Pin to specific cymem, preshed and thinc versions. --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e65ab9fbe..715998956 100644 --- a/setup.py +++ b/setup.py @@ -177,8 +177,8 @@ def run_setup(exts): package_data=PACKAGE_DATA, ext_modules=exts, license="MIT", - install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43', - 'thinc >= 3.4.1', "text_unidecode", 'plac', 'six', + install_requires=['numpy', 'murmurhash', 'cymem == 1.30', 'preshed == 0.43', + 'thinc == 3.4.1', "text_unidecode", 'plac', 'six', 'ujson', 'cloudpickle'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, @@ -191,7 +191,7 @@ def run_setup(exts): headers_workaround.install_headers('numpy') -VERSION = '0.98' +VERSION = '0.99' def main(modules, is_pypy): language = "cpp" includes = ['.', path.join(sys.prefix, 'include')] From 1e99fcd413083c3639b8bf50f0d2ac3692ed5e6d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 23:47:59 +1100 Subject: [PATCH 24/33] * Rename .repvec to .vector in C API --- spacy/lexeme.pyx | 8 ++++---- spacy/structs.pxd | 2 +- spacy/tokens/doc.pyx | 6 +----- spacy/tokens/token.pyx | 6 +++--- spacy/vocab.pyx | 18 +++++++++--------- 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 26acff407..048fae1df 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -51,7 +51,7 @@ cdef class Lexeme: def __get__(self): cdef int i for i in range(self.vocab.vectors_length): - if self.c.repvec[i] != 0: + if self.c.vector[i] != 0: return True else: return False @@ -74,14 +74,14 @@ cdef class Lexeme: "to install the data." ) - repvec_view = self.c.repvec - return numpy.asarray(repvec_view) + vector_view = self.c.vector + return numpy.asarray(vector_view) def __set__(self, vector): assert len(vector) == self.vocab.vectors_length cdef float value for i, value in enumerate(vector): - self.c.repvec[i] = value + self.c.vector[i] = value property repvec: def __get__(self): diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 04078c0f5..de0fe2167 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -5,7 +5,7 @@ from .parts_of_speech cimport univ_pos_t cdef struct LexemeC: - float* repvec + float* vector flags_t flags diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 927c01147..a606b92f4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -134,10 +134,6 @@ cdef class Doc: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) - property repvec: - def __get__(self): - return self.vector - property vector: def __get__(self): if self._vector is None: @@ -399,7 +395,7 @@ cdef class Doc: elif attr_id == TAG: for i in range(length): self.vocab.morphology.assign_tag(&tokens[i], - self.vocab.strings[values[i]]) + self.vocab.morphology.reverse_index[values[i]]) if not self.is_tagged and tokens[i].tag != 0: self.is_tagged = True elif attr_id == POS: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index a4eaf93fc..5174cb480 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -143,7 +143,7 @@ cdef class Token: def __get__(self): cdef int i for i in range(self.vocab.vectors_length): - if self.c.lex.repvec[i] != 0: + if self.c.lex.vector[i] != 0: return True else: return False @@ -158,8 +158,8 @@ cdef class Token: "\npython -m spacy.en.download all\n" "to install the data." ) - repvec_view = self.c.lex.repvec - return numpy.asarray(repvec_view) + vector_view = self.c.lex.vector + return numpy.asarray(vector_view) property repvec: def __get__(self): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index aa2a069dd..b78249973 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -40,7 +40,7 @@ DEF MAX_VEC_SIZE = 100000 cdef float[MAX_VEC_SIZE] EMPTY_VEC memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC)) memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -EMPTY_LEXEME.repvec = EMPTY_VEC +EMPTY_LEXEME.vector = EMPTY_VEC cdef class Vocab: @@ -162,7 +162,7 @@ cdef class Vocab: lex.orth = self.strings[string] lex.length = len(string) lex.id = self.length - lex.repvec = mem.alloc(self.vectors_length, sizeof(float)) + lex.vector = mem.alloc(self.vectors_length, sizeof(float)) if self.get_lex_attr is not None: for attr, func in self.get_lex_attr.items(): value = func(string) @@ -287,7 +287,7 @@ cdef class Vocab: fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) - lexeme.repvec = EMPTY_VEC + lexeme.vector = EMPTY_VEC py_str = self.strings[lexeme.orth] key = hash_string(py_str) self._by_hash.set(key, lexeme) @@ -306,7 +306,7 @@ cdef class Vocab: cdef CFile out_file = CFile(out_loc, 'wb') for lexeme in self: word_str = lexeme.orth_.encode('utf8') - vec = lexeme.c.repvec + vec = lexeme.c.vector word_len = len(word_str) out_file.write_from(&word_len, 1, sizeof(word_len)) @@ -331,10 +331,10 @@ cdef class Vocab: vec_len, len(pieces)) orth = self.strings[word_str] lexeme = self.get_by_orth(self.mem, orth) - lexeme.repvec = self.mem.alloc(self.vectors_length, sizeof(float)) + lexeme.vector = self.mem.alloc(self.vectors_length, sizeof(float)) for i, val_str in enumerate(pieces): - lexeme.repvec[i] = float(val_str) + lexeme.vector[i] = float(val_str) return vec_len def load_vectors_from_bin_loc(self, loc): @@ -376,12 +376,12 @@ cdef class Vocab: for orth, lex_addr in self._by_orth.items(): lex = lex_addr if lex.lower < vectors.size(): - lex.repvec = vectors[lex.lower] + lex.vector = vectors[lex.lower] for i in range(vec_len): - lex.l2_norm += (lex.repvec[i] * lex.repvec[i]) + lex.l2_norm += (lex.vector[i] * lex.vector[i]) lex.l2_norm = math.sqrt(lex.l2_norm) else: - lex.repvec = EMPTY_VEC + lex.vector = EMPTY_VEC return vec_len From 9ec7b9c454e92558824f2d6af9f4dedbfcb61dc5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 23:48:21 +1100 Subject: [PATCH 25/33] * Clean up unused Constituent struct. --- spacy/structs.pxd | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index de0fe2167..733ce3022 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -32,18 +32,8 @@ cdef struct Entity: int label -cdef struct Constituent: - const TokenC* head - const Constituent* parent - const Constituent* first - const Constituent* last - int label - int length - - cdef struct TokenC: const LexemeC* lex - const Constituent* ctnt uint64_t morph univ_pos_t pos bint spacy From 116da5990a9f97baa9f399a50ff0707046e60024 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 23:48:57 +1100 Subject: [PATCH 26/33] * Clean up setting of tag in doc.from_bytes --- spacy/tokens/doc.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a606b92f4..c1cef5789 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -472,7 +472,6 @@ cdef class Doc: self.vocab.morphology.assign_tag(token, tag) else: token.tag = self.vocab.strings[tag] - token.tag = self.vocab.strings[tag] token.lemma = self.vocab.strings[lemma] if ent_type == 'O': token.ent_iob = 2 From 9482d616bc505e002242e58396f340058860d480 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Nov 2015 23:51:05 +1100 Subject: [PATCH 27/33] * Rename spans.pyx to span.pyx --- setup.py | 2 +- spacy/tokens/doc.pyx | 2 +- spacy/tokens/{spans.pxd => span.pxd} | 0 spacy/tokens/{spans.pyx => span.pyx} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename spacy/tokens/{spans.pxd => span.pxd} (100%) rename spacy/tokens/{spans.pyx => span.pyx} (100%) diff --git a/setup.py b/setup.py index 715998956..c3b91cfb9 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ def run_setup(exts): 'spacy.tests.munge', 'spacy.tests.parser', 'spacy.tests.serialize', - 'spacy.tests.spans', + 'spacy.tests.span', 'spacy.tests.tagger', 'spacy.tests.tokenizer', 'spacy.tests.tokens', diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c1cef5789..b677b3385 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -18,7 +18,7 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme -from .spans cimport Span +from .span cimport Span from .token cimport Token from ..serialize.bits cimport BitArray from ..util import normalize_slice diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/span.pxd similarity index 100% rename from spacy/tokens/spans.pxd rename to spacy/tokens/span.pxd diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/span.pyx similarity index 100% rename from spacy/tokens/spans.pyx rename to spacy/tokens/span.pyx From 3ddea19b2b8c3bd1112c5ac39bb551c3e23af20a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 00:14:40 +1100 Subject: [PATCH 28/33] * Rename spans.pyx to span.pyx --- setup.py | 2 +- spacy/tokens/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c3b91cfb9..d153ae3da 100644 --- a/setup.py +++ b/setup.py @@ -217,7 +217,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.syntax.arc_eager', 'spacy.syntax._parse_features', 'spacy.gold', 'spacy.orth', - 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', + 'spacy.tokens.doc', 'spacy.tokens.span', 'spacy.tokens.token', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner', diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 9950ee703..bc3794126 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,5 +1,5 @@ from .doc import Doc from .token import Token -from .spans import Span +from .span import Span __all__ = [Doc, Token, Span] From 68f479e8216511db0d3847610fb902df9369dac8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 00:15:14 +1100 Subject: [PATCH 29/33] * Rename Doc.data to Doc.c --- spacy/matcher.pyx | 6 ++-- spacy/syntax/parser.pyx | 6 ++-- spacy/tagger.pyx | 14 ++++---- spacy/tokenizer.pyx | 6 ++-- spacy/tokens/doc.pxd | 2 +- spacy/tokens/doc.pyx | 78 ++++++++++++++++++++--------------------- spacy/tokens/span.pyx | 6 ++-- spacy/tokens/token.pyx | 2 +- 8 files changed, 60 insertions(+), 60 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 7c21ee086..1fa91fab1 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -215,7 +215,7 @@ cdef class Matcher: cdef Pattern* state matches = [] for token_i in range(doc.length): - token = &doc.data[token_i] + token = &doc.c[token_i] q = 0 # Go over the open matches, extending or finalizing if able. Otherwise, # we over-write them (q doesn't advance) @@ -286,7 +286,7 @@ cdef class PhraseMatcher: for i in range(self.max_length): self._phrase_key[i] = 0 for i, tag in enumerate(tags): - lexeme = self.vocab[tokens.data[i].lex.orth] + lexeme = self.vocab[tokens.c[i].lex.orth] lexeme.set_flag(tag, True) self._phrase_key[i] = lexeme.orth cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) @@ -309,7 +309,7 @@ cdef class PhraseMatcher: for i in range(self.max_length): self._phrase_key[i] = 0 for i, j in enumerate(range(start, end)): - self._phrase_key[i] = doc.data[j].lex.orth + self._phrase_key[i] = doc.c[j].lex.orth cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0) if self.phrase_ids.get(key): return True diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 25932a0a4..40569b1aa 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -84,7 +84,7 @@ cdef class Parser: return cls(strings, moves, model) def __call__(self, Doc tokens): - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) self.moves.initialize_state(stcls) cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, @@ -112,7 +112,7 @@ cdef class Parser: def train(self, Doc tokens, GoldParse gold): self.moves.preprocess_gold(gold) - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + cdef StateClass stcls = StateClass.init(tokens.c, tokens.length) self.moves.initialize_state(stcls) cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) @@ -143,7 +143,7 @@ cdef class StepwiseState: def __init__(self, Parser parser, Doc doc): self.parser = parser self.doc = doc - self.stcls = StateClass.init(doc.data, doc.length) + self.stcls = StateClass.init(doc.c, doc.length) self.parser.moves.initialize_state(self.stcls) self.eg = Example(self.parser.model.n_classes, CONTEXT_SIZE, self.parser.model.n_feats, self.parser.model.n_feats) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 69925ff89..e98b28067 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -141,9 +141,9 @@ cdef class Tagger: cdef int i cdef const weight_t* scores for i in range(tokens.length): - if tokens.data[i].pos == 0: - guess = self.predict(i, tokens.data) - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + if tokens.c[i].pos == 0: + guess = self.predict(i, tokens.c) + self.vocab.morphology.assign_tag(&tokens.c[i], guess) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -154,7 +154,7 @@ cdef class Tagger: def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) + self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -170,13 +170,13 @@ cdef class Tagger: [g for g in gold_tag_strs if g is not None and g not in self.tag_names]) correct = 0 for i in range(tokens.length): - guess = self.update(i, tokens.data, golds[i]) + guess = self.update(i, tokens.c, golds[i]) loss = golds[i] != -1 and guess != golds[i] - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + self.vocab.morphology.assign_tag(&tokens.c[i], guess) correct += loss == 0 - self.freqs[TAG][tokens.data[i].tag] += 1 + self.freqs[TAG][tokens.c[i].tag] += 1 return correct cdef int predict(self, int i, const TokenC* tokens) except -1: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f0d664c09..1cde1e76e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -113,7 +113,7 @@ cdef class Tokenizer: self._tokenize(tokens, span, key) in_ws = not in_ws if uc == ' ': - tokens.data[tokens.length - 1].spacy = True + tokens.c[tokens.length - 1].spacy = True start = i + 1 else: start = i @@ -125,7 +125,7 @@ cdef class Tokenizer: cache_hit = self._try_cache(key, tokens) if not cache_hit: self._tokenize(tokens, span, key) - tokens.data[tokens.length - 1].spacy = string[-1] == ' ' + tokens.c[tokens.length - 1].spacy = string[-1] == ' ' return tokens cdef int _try_cache(self, hash_t key, Doc tokens) except -1: @@ -148,7 +148,7 @@ cdef class Tokenizer: orig_size = tokens.length span = self._split_affixes(span, &prefixes, &suffixes) self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.data[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes, vector[const LexemeC*] *suffixes): diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index ce1cfecc0..6c11cd25c 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -26,7 +26,7 @@ cdef class Doc: cdef public object _vector cdef public object _vector_norm - cdef TokenC* data + cdef TokenC* c cdef public bint is_tagged cdef public bint is_parsed diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b677b3385..b0a193173 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -73,7 +73,7 @@ cdef class Doc: data_start[i].lex = &EMPTY_LEXEME data_start[i].l_edge = i data_start[i].r_edge = i - self.data = data_start + PADDING + self.c = data_start + PADDING self.max_length = size self.length = 0 self.is_tagged = False @@ -97,7 +97,7 @@ cdef class Doc: if self._py_tokens[i] is not None: return self._py_tokens[i] else: - return Token.cinit(self.vocab, &self.data[i], i, self) + return Token.cinit(self.vocab, &self.c[i], i, self) def __iter__(self): """Iterate over the tokens. @@ -110,7 +110,7 @@ cdef class Doc: if self._py_tokens[i] is not None: yield self._py_tokens[i] else: - yield Token.cinit(self.vocab, &self.data[i], i, self) + yield Token.cinit(self.vocab, &self.c[i], i, self) def __len__(self): return self.length @@ -187,7 +187,7 @@ cdef class Doc: cdef int label = 0 output = [] for i in range(self.length): - token = &self.data[i] + token = &self.c[i] if token.ent_iob == 1: assert start != -1 elif token.ent_iob == 2 or token.ent_iob == 0: @@ -212,23 +212,23 @@ cdef class Doc: # 4. Test more nuanced date and currency regex cdef int i for i in range(self.length): - self.data[i].ent_type = 0 - self.data[i].ent_iob = 0 + self.c[i].ent_type = 0 + self.c[i].ent_iob = 0 cdef attr_t ent_type cdef int start, end for ent_type, start, end in ents: if ent_type is None or ent_type < 0: # Mark as O for i in range(start, end): - self.data[i].ent_type = 0 - self.data[i].ent_iob = 2 + self.c[i].ent_type = 0 + self.c[i].ent_iob = 2 else: # Mark (inside) as I for i in range(start, end): - self.data[i].ent_type = ent_type - self.data[i].ent_iob = 1 + self.c[i].ent_type = ent_type + self.c[i].ent_iob = 1 # Set start as B - self.data[start].ent_iob = 3 + self.c[start].ent_iob = 3 @property def noun_chunks(self): @@ -245,7 +245,7 @@ cdef class Doc: np_deps = [self.vocab.strings[label] for label in labels] np_label = self.vocab.strings['NP'] for i in range(self.length): - word = &self.data[i] + word = &self.c[i] if word.pos == NOUN and word.dep in np_deps: yield Span(self, word.l_edge, i+1, label=np_label) @@ -263,7 +263,7 @@ cdef class Doc: cdef int i start = 0 for i in range(1, self.length): - if self.data[i].sent_start: + if self.c[i].sent_start: yield Span(self, start, i) start = i yield Span(self, start, self.length) @@ -271,7 +271,7 @@ cdef class Doc: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: if self.length == self.max_length: self._realloc(self.length * 2) - cdef TokenC* t = &self.data[self.length] + cdef TokenC* t = &self.c[self.length] if LexemeOrToken is const_TokenC_ptr: t[0] = lex_or_tok[0] else: @@ -310,7 +310,7 @@ cdef class Doc: output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) for i in range(self.length): for j, feature in enumerate(attr_ids): - output[i, j] = get_token_attr(&self.data[i], feature) + output[i, j] = get_token_attr(&self.c[i], feature) return output def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): @@ -340,11 +340,11 @@ cdef class Doc: # Take this check out of the loop, for a bit of extra speed if exclude is None: for i in range(self.length): - counts.inc(get_token_attr(&self.data[i], attr_id), 1) + counts.inc(get_token_attr(&self.c[i], attr_id), 1) else: for i in range(self.length): if not exclude(self[i]): - attr = get_token_attr(&self.data[i], attr_id) + attr = get_token_attr(&self.c[i], attr_id) counts.inc(attr, 1) if output_dict: return dict(counts) @@ -357,12 +357,12 @@ cdef class Doc: # words out-of-bounds, and get out-of-bounds markers. # Now that we want to realloc, we need the address of the true start, # so we jump the pointer back PADDING places. - cdef TokenC* data_start = self.data - PADDING + cdef TokenC* data_start = self.c - PADDING data_start = self.mem.realloc(data_start, n * sizeof(TokenC)) - self.data = data_start + PADDING + self.c = data_start + PADDING cdef int i for i in range(self.length, self.max_length + PADDING): - self.data[i].lex = &EMPTY_LEXEME + self.c[i].lex = &EMPTY_LEXEME cdef int set_parse(self, const TokenC* parsed) except -1: # TODO: This method is fairly misleading atm. It's used by Parser @@ -371,14 +371,14 @@ cdef class Doc: # Probably we should use from_array? self.is_parsed = True for i in range(self.length): - self.data[i] = parsed[i] - assert self.data[i].l_edge <= i - assert self.data[i].r_edge >= i + self.c[i] = parsed[i] + assert self.c[i].l_edge <= i + assert self.c[i].r_edge >= i def from_array(self, attrs, array): cdef int i, col cdef attr_id_t attr_id - cdef TokenC* tokens = self.data + cdef TokenC* tokens = self.c cdef int length = len(array) cdef attr_t[:] values for col, attr_id in enumerate(attrs): @@ -412,7 +412,7 @@ cdef class Doc: tokens[i].ent_type = values[i] else: raise ValueError("Unknown attribute ID: %d" % attr_id) - set_children_from_heads(self.data, self.length) + set_children_from_heads(self.c, self.length) return self def to_bytes(self): @@ -447,9 +447,9 @@ cdef class Doc: cdef int start = -1 cdef int end = -1 for i in range(self.length): - if self.data[i].idx == start_idx: + if self.c[i].idx == start_idx: start = i - if (self.data[i].idx + self.data[i].lex.length) == end_idx: + if (self.c[i].idx + self.c[i].lex.length) == end_idx: if start == -1: return None end = i + 1 @@ -464,10 +464,10 @@ cdef class Doc: new_orth = new_orth[:-len(span[-1].whitespace_)] cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts - cdef TokenC* token = &self.data[start] + cdef TokenC* token = &self.c[start] # Update fields token.lex = lex - token.spacy = self.data[end-1].spacy + token.spacy = self.c[end-1].spacy if tag in self.vocab.morphology.tag_map: self.vocab.morphology.assign_tag(token, tag) else: @@ -486,31 +486,31 @@ cdef class Doc: span_root = span.root.i token.dep = span.root.dep for i in range(self.length): - self.data[i].head += i + self.c[i].head += i # Set the head of the merged token, and its dep relation, from the Span - token.head = self.data[span_root].head + token.head = self.c[span_root].head # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end offset = (end - start) - 1 for i in range(self.length): - head_idx = self.data[i].head + head_idx = self.c[i].head if start <= head_idx < end: - self.data[i].head = start + self.c[i].head = start elif head_idx >= end: - self.data[i].head -= offset + self.c[i].head -= offset # Now compress the token array for i in range(end, self.length): - self.data[i - offset] = self.data[i] + self.c[i - offset] = self.c[i] for i in range(self.length - offset, self.length): - memset(&self.data[i], 0, sizeof(TokenC)) - self.data[i].lex = &EMPTY_LEXEME + memset(&self.c[i], 0, sizeof(TokenC)) + self.c[i].lex = &EMPTY_LEXEME self.length -= offset for i in range(self.length): # ...And, set heads back to a relative position - self.data[i].head -= i + self.c[i].head -= i # Set the left/right children, left/right edges - set_children_from_heads(self.data, self.length) + set_children_from_heads(self.c, self.length) # Clear the cached Python objects self._py_tokens = [None] * self.length # Return the merged Python object diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 95b8e0de1..7b74be492 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -139,12 +139,12 @@ cdef class Span: def __get__(self): # This should probably be called 'head', and the other one called # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ - cdef const TokenC* start = &self.doc.data[self.start] - cdef const TokenC* end = &self.doc.data[self.end] + cdef const TokenC* start = &self.doc.c[self.start] + cdef const TokenC* end = &self.doc.c[self.end] head = start while start <= (head + head.head) < end and head.head != 0: head += head.head - return self.doc[head - self.doc.data] + return self.doc[head - self.doc.c] property lefts: """Tokens that are to the left of the Span, whose head is within the Span.""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 5174cb480..3a76bf220 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -31,7 +31,7 @@ cdef class Token: def __cinit__(self, Vocab vocab, Doc doc, int offset): self.vocab = vocab self.doc = doc - self.c = &self.doc.data[offset] + self.c = &self.doc.c[offset] self.i = offset self.array_len = doc.length From 1ce5d5602d68e828b8e2be3778e15155be1a6ee1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 00:17:13 +1100 Subject: [PATCH 30/33] * Rename Doc.data to Doc.c --- spacy/serialize/packer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index ae1c4ec99..62451073c 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -155,10 +155,10 @@ cdef class Packer: self.char_codec.encode(bytearray(utf8_str), bits) cdef int i, j for i in range(doc.length): - for j in range(doc.data[i].lex.length-1): + for j in range(doc.c[i].lex.length-1): bits.append(False) bits.append(True) - if doc.data[i].spacy: + if doc.c[i].spacy: bits.append(False) return bits From 65934b7cd46587814c51bab6cb5be51e5551a084 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 00:32:02 +1100 Subject: [PATCH 31/33] * Enforce import of ujson in strings.pyx, because otherwise it's too slow --- spacy/strings.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index cf9c224e6..760d9bea1 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -17,10 +17,7 @@ try: except ImportError: import io -try: - import ujson as json -except ImportError: - import json +import ujson as json cpdef hash_t hash_string(unicode string) except 0: From e96faf29e7d3983075f44cae870344d6b71e3fba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 22:01:44 +1100 Subject: [PATCH 32/33] * Rename like_number to like_num, to fix inconsistency re Issue #166 --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 691b3e97e..373b744c1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -88,7 +88,7 @@ class Language(object): return orth.like_url(string) @staticmethod - def like_number(string): + def like_num(string): return orth.like_number(string) @staticmethod From adc7bbd6cf4e0343f19c1aafd27c25dc0680d334 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 4 Nov 2015 22:02:47 +1100 Subject: [PATCH 33/33] * Fix name of like_num in default_lex_attrs --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 373b744c1..3087a2373 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -119,7 +119,7 @@ class Language(object): attrs.IS_TITLE: cls.is_title, attrs.IS_UPPER: cls.is_upper, attrs.LIKE_URL: cls.like_url, - attrs.LIKE_NUM: cls.like_number, + attrs.LIKE_NUM: cls.like_num, attrs.LIKE_EMAIL: cls.like_email, attrs.IS_STOP: cls.is_stop, attrs.IS_OOV: lambda string: True