From 92ac9316b5f3ff79db1c3ec44be54f8c4dfe95dc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 13:59:24 +0200 Subject: [PATCH 1/8] Fix initialization of vectors, to address serialization problem --- spacy/vectors.pyx | 12 +++++------- spacy/vocab.pyx | 10 ++++------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 5512279ae..cea583110 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -32,22 +32,20 @@ cdef class Vectors: cdef public object keys cdef public int i - def __init__(self, strings, data_or_width=0): + def __init__(self, strings, data=None, width=0): if isinstance(strings, StringStore): self.strings = strings else: self.strings = StringStore() for string in strings: self.strings.add(string) - if isinstance(data_or_width, int): - self.data = data = numpy.zeros((len(strings), data_or_width), - dtype='f') + if data is not None: + self.data = numpy.asarray(data, dtype='f') else: - data = data_or_width + self.data = numpy.zeros((len(self.strings), width), dtype='f') self.i = 0 - self.data = data self.key2row = {} - self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') + self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64') def __reduce__(self): return (Vectors, (self.strings, self.data)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 205e5a2af..e6ba9944b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -62,12 +62,10 @@ cdef class Vocab: if strings: for string in strings: _ = self[string] - for name in tag_map.keys(): - if name: - self.strings.add(name) self.lex_attr_getters = lex_attr_getters + print("Create morphology", list(self.strings), tag_map) self.morphology = Morphology(self.strings, tag_map, lemmatizer) - self.vectors = Vectors(self.strings) + self.vectors = Vectors(self.strings, width=0) property lang: def __get__(self): @@ -338,7 +336,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes(exclude='strings.json') + return self.vectors.to_bytes() getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), @@ -358,7 +356,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b, exclude='strings') + return self.vectors.from_bytes(b) setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), From 6218af0105d1514089ecd76c4cbf6fec31d50423 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 13:59:57 +0200 Subject: [PATCH 2/8] Remove cpdef enum, to avoid too much code generation --- spacy/morphology.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index be6711bfd..9192f351f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -44,7 +44,7 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 -cpdef enum univ_morph_t: +cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim Animacy_inam From 506cf2eb1389da6149f97de7db80df52ed0d2d1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:00:23 +0200 Subject: [PATCH 3/8] Remove cpdef enum, to avoid too much code generation --- spacy/morphology.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 4a1a0aa54..65b46fe08 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -426,3 +426,7 @@ IDS = { NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) From 49895fbef69598d18fd00197661ec3ad939de849 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:01:12 +0200 Subject: [PATCH 4/8] Rename 'SP' special tag to '_SP' Renaming the tag with an underscore lets us add it to the tag map without worrying that we'll change the sequence of tags, which throws off the tag-to-ID mapping. For instance, if we inserted a 'SP' tag, the "VERB" tag is pushed to a different class ID, and the model is all messed up. --- spacy/lang/de/tag_map.py | 2 +- spacy/lang/en/tag_map.py | 4 +-- spacy/lang/es/tag_map.py | 2 +- spacy/lang/th/tag_map.py | 77 ++++++++++++++++++++-------------------- spacy/morphology.pyx | 14 ++++++-- 5 files changed, 54 insertions(+), 45 deletions(-) diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index d16bd17e0..730c15cfc 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -62,5 +62,5 @@ TAG_MAP = { "VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, "XY": {POS: X}, - "SP": {POS: SPACE} + "_SP": {POS: SPACE} } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index a674c17e3..76eabf307 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -55,11 +55,11 @@ TAG_MAP = { "WP": {POS: NOUN, "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, "XX": {POS: X}, "BES": {POS: VERB}, - "HVS": {POS: VERB} + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, } diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 86dd48620..2095d23b1 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -303,5 +303,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "_SP": {"morph": "_", "pos": "SPACE"}, } diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 40e5ac44c..570871820 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -19,63 +19,64 @@ TAG_MAP = { "NPRP": {POS: PRON}, # ADJ "ADJ": {POS: ADJ}, - "NONM": {POS: ADJ}, - "VATT": {POS: ADJ}, - "DONM": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, # ADV "ADV": {POS: ADV}, - "ADVN": {POS: ADV}, - "ADVI": {POS: ADV}, - "ADVP": {POS: ADV}, - "ADVS": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, # INT "INT": {POS: INTJ}, # PRON "PROPN": {POS: PROPN}, - "PPRS": {POS: PROPN}, - "PDMN": {POS: PROPN}, - "PNTR": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, # DET "DET": {POS: DET}, - "DDAN": {POS: DET}, - "DDAC": {POS: DET}, - "DDBQ": {POS: DET}, - "DDAQ": {POS: DET}, - "DIAC": {POS: DET}, - "DIBQ": {POS: DET}, - "DIAQ": {POS: DET}, - "DCNM": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, # NUM "NUM": {POS: NUM}, - "NCNM": {POS: NUM}, - "NLBL": {POS: NUM}, - "DCNM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, # AUX "AUX": {POS: AUX}, - "XVBM": {POS: AUX}, - "XVAM": {POS: AUX}, - "XVMM": {POS: AUX}, - "XVBB": {POS: AUX}, - "XVAE": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, # ADP "ADP": {POS: ADP}, - "RPRE": {POS: ADP}, + "RPRE": {POS: ADP}, # CCONJ "CCONJ": {POS: CCONJ}, - "JCRG": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, # SCONJ "SCONJ": {POS: SCONJ}, - "PREL": {POS: SCONJ}, - "JSBR": {POS: SCONJ}, - "JCMP": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, # PART - "PART": {POS: PART}, - "FIXN": {POS: PART}, - "FIXV": {POS: PART}, - "EAFF": {POS: PART}, - "AITT": {POS: PART}, - "NEG": {POS: PART}, + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, # PUNCT "PUNCT": {POS: PUNCT}, - "PUNC": {POS: PUNCT} + "PUNC": {POS: PUNCT}, + "_SP": {POS: SPACE} } diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 65b46fe08..7845ab4e7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,7 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE from .attrs cimport POS, IS_SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme @@ -36,14 +36,22 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + # Add special space symbol. We prefix with underscore, to make sure it + # always sorts to the end. + space_attrs = tag_map.pop('SP', {POS: SPACE}) + if '_SP' not in tag_map: + self.strings.add('_SP') + tag_map = dict(tag_map) + tag_map['_SP'] = space_attrs + self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) - self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) @@ -93,7 +101,7 @@ cdef class Morphology: # the statistical model fails. # Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('SP')] + tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: From ebecaddb765713aaaf7f5b2f51488f39f66655d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:17:15 +0200 Subject: [PATCH 5/8] Make 'data_or_width' two keyword args in Vectors.__init__ Previously the data and width options were one argument in Vectors, which meant you couldn't say vectors = Vectors(strings, width=300). It's better to have two keywords. --- spacy/tests/vectors/test_vectors.py | 8 ++++---- website/api/vectors.jade | 15 +++++++++------ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index 798871edd..74ac26a10 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -35,18 +35,18 @@ def vocab(en_vocab, vectors): def test_init_vectors_with_data(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) assert v.shape == data.shape def test_init_vectors_with_width(strings): - v = Vectors(strings, 3) + v = Vectors(strings, width=3) for string in strings: v.add(string) assert v.shape == (len(strings), 3) def test_get_vector(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(data[0]) @@ -56,7 +56,7 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(orig[0]) diff --git a/website/api/vectors.jade b/website/api/vectors.jade index a58736506..e08f34643 100644 --- a/website/api/vectors.jade +++ b/website/api/vectors.jade @@ -12,7 +12,7 @@ p p | Create a new vector store. To keep the vector table empty, pass - | #[code data_or_width=0]. You can also create the vector table and add + | #[code width=0]. You can also create the vector table and add | vectors one by one, or set the vector values directly on initialisation. +aside-code("Example"). @@ -21,11 +21,11 @@ p empty_vectors = Vectors(StringStore()) - vectors = Vectors([u'cat'], 300) + vectors = Vectors([u'cat'], width=300) vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors(StringStore(), vector_table) + vectors = Vectors(StringStore(), data=vector_table) +table(["Name", "Type", "Description"]) +row @@ -36,9 +36,12 @@ p | that maps strings to hash values, and vice versa. +row - +cell #[code data_or_width] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int - +cell Vector data or number of dimensions. + +cell #[code data] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + + +row + +cell #[code width] + +cell Number of dimensions. +row("foot") +cell returns From cfae54c507ab24a1da36d3008484d2ac8edb3071 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:19:04 +0200 Subject: [PATCH 6/8] Make change to Vectors.__init__ --- spacy/vectors.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index cea583110..fa5fcf624 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -32,7 +32,7 @@ cdef class Vectors: cdef public object keys cdef public int i - def __init__(self, strings, data=None, width=0): + def __init__(self, strings, width=0, data=None): if isinstance(strings, StringStore): self.strings = strings else: @@ -46,6 +46,10 @@ cdef class Vectors: self.i = 0 self.key2row = {} self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64') + for i, string in enumerate(self.strings): + if i >= self.data.shape[0]: + break + self.add(self.strings[string], self.data[i]) def __reduce__(self): return (Vectors, (self.strings, self.data)) From 33229b1c9ef53a49a3bbd00d61ca02c28c5481c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:19:29 +0200 Subject: [PATCH 7/8] Remove print statement --- spacy/vocab.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index e6ba9944b..2e189a02b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -63,7 +63,6 @@ cdef class Vocab: for string in strings: _ = self[string] self.lex_attr_getters = lex_attr_getters - print("Create morphology", list(self.strings), tag_map) self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.vectors = Vectors(self.strings, width=0) From 9010a1a0603fba85143bcd859b88aaed59937a9a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 20 Oct 2017 14:19:46 +0200 Subject: [PATCH 8/8] Create vectors correctly --- spacy/vocab.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2e189a02b..3f96b5144 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -252,7 +252,7 @@ cdef class Vocab: """ if new_dim is None: new_dim = self.vectors.data.shape[1] - self.vectors = Vectors(self.strings, new_dim) + self.vectors = Vectors(self.strings, width=new_dim) def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary.