diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index d16bd17e0..730c15cfc 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -62,5 +62,5 @@ TAG_MAP = { "VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, "XY": {POS: X}, - "SP": {POS: SPACE} + "_SP": {POS: SPACE} } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index a674c17e3..76eabf307 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -55,11 +55,11 @@ TAG_MAP = { "WP": {POS: NOUN, "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, "XX": {POS: X}, "BES": {POS: VERB}, - "HVS": {POS: VERB} + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, } diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 86dd48620..2095d23b1 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -303,5 +303,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "_SP": {"morph": "_", "pos": "SPACE"}, } diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 40e5ac44c..570871820 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -19,63 +19,64 @@ TAG_MAP = { "NPRP": {POS: PRON}, # ADJ "ADJ": {POS: ADJ}, - "NONM": {POS: ADJ}, - "VATT": {POS: ADJ}, - "DONM": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, # ADV "ADV": {POS: ADV}, - "ADVN": {POS: ADV}, - "ADVI": {POS: ADV}, - "ADVP": {POS: ADV}, - "ADVS": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, # INT "INT": {POS: INTJ}, # PRON "PROPN": {POS: PROPN}, - "PPRS": {POS: PROPN}, - "PDMN": {POS: PROPN}, - "PNTR": {POS: 
PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, # DET "DET": {POS: DET}, - "DDAN": {POS: DET}, - "DDAC": {POS: DET}, - "DDBQ": {POS: DET}, - "DDAQ": {POS: DET}, - "DIAC": {POS: DET}, - "DIBQ": {POS: DET}, - "DIAQ": {POS: DET}, - "DCNM": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, # NUM "NUM": {POS: NUM}, - "NCNM": {POS: NUM}, - "NLBL": {POS: NUM}, - "DCNM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, # AUX "AUX": {POS: AUX}, - "XVBM": {POS: AUX}, - "XVAM": {POS: AUX}, - "XVMM": {POS: AUX}, - "XVBB": {POS: AUX}, - "XVAE": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, # ADP "ADP": {POS: ADP}, - "RPRE": {POS: ADP}, + "RPRE": {POS: ADP}, # CCONJ "CCONJ": {POS: CCONJ}, - "JCRG": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, # SCONJ "SCONJ": {POS: SCONJ}, - "PREL": {POS: SCONJ}, - "JSBR": {POS: SCONJ}, - "JCMP": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, # PART - "PART": {POS: PART}, - "FIXN": {POS: PART}, - "FIXV": {POS: PART}, - "EAFF": {POS: PART}, - "AITT": {POS: PART}, - "NEG": {POS: PART}, + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, # PUNCT "PUNCT": {POS: PUNCT}, - "PUNC": {POS: PUNCT} + "PUNC": {POS: PUNCT}, + "_SP": {POS: SPACE} } diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index be6711bfd..9192f351f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -44,7 +44,7 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 -cpdef enum univ_morph_t: +cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim Animacy_inam diff --git a/spacy/morphology.pyx 
b/spacy/morphology.pyx index 4a1a0aa54..7845ab4e7 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,7 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE from .attrs cimport POS, IS_SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme @@ -36,14 +36,22 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + # Add special space symbol. We prefix with underscore, to make sure it + # always sorts to the end. Work on a copy so the caller's tag_map is not mutated. + tag_map = dict(tag_map) + space_attrs = tag_map.pop('SP', {POS: SPACE}) + if '_SP' not in tag_map: + self.strings.add('_SP') + tag_map['_SP'] = space_attrs + self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) - self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) @@ -93,7 +101,7 @@ cdef class Morphology: # the statistical model fails. 
# Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('SP')] + tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: @@ -426,3 +434,7 @@ IDS = { NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index 798871edd..74ac26a10 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -35,18 +35,18 @@ def vocab(en_vocab, vectors): def test_init_vectors_with_data(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) assert v.shape == data.shape def test_init_vectors_with_width(strings): - v = Vectors(strings, 3) + v = Vectors(strings, width=3) for string in strings: v.add(string) assert v.shape == (len(strings), 3) def test_get_vector(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(data[0]) @@ -56,7 +56,7 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(orig[0]) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 5512279ae..fa5fcf624 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -32,22 +32,24 @@ cdef class Vectors: cdef public object keys cdef public int i - def __init__(self, strings, data_or_width=0): + def __init__(self, strings, width=0, data=None): if isinstance(strings, StringStore): self.strings = 
strings else: self.strings = StringStore() for string in strings: self.strings.add(string) - if isinstance(data_or_width, int): - self.data = data = numpy.zeros((len(strings), data_or_width), - dtype='f') + if data is not None: + self.data = numpy.asarray(data, dtype='f') else: - data = data_or_width + self.data = numpy.zeros((len(self.strings), width), dtype='f') self.i = 0 - self.data = data self.key2row = {} - self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') + self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64') + for i, string in enumerate(self.strings): + if i >= self.data.shape[0]: + break + self.add(self.strings[string], self.data[i]) def __reduce__(self): - return (Vectors, (self.strings, self.data)) + return (Vectors, (self.strings, 0, self.data)) # width is unused when data is passed diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index da4d21026..bcd1f3c10 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -62,12 +62,9 @@ cdef class Vocab: if strings: for string in strings: _ = self[string] - for name in tag_map.keys(): - if name: - self.strings.add(name) self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) - self.vectors = Vectors(self.strings) + self.vectors = Vectors(self.strings, width=0) property lang: def __get__(self): @@ -255,7 +252,7 @@ cdef class Vocab: """ if new_dim is None: new_dim = self.vectors.data.shape[1] - self.vectors = Vectors(self.strings, new_dim) + self.vectors = Vectors(self.strings, width=new_dim) def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. 
@@ -338,7 +335,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes(exclude='strings.json') + return self.vectors.to_bytes() getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), @@ -358,7 +355,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b, exclude='strings') + return self.vectors.from_bytes(b) setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), diff --git a/website/api/vectors.jade b/website/api/vectors.jade index a58736506..e08f34643 100644 --- a/website/api/vectors.jade +++ b/website/api/vectors.jade @@ -12,7 +12,7 @@ p p | Create a new vector store. To keep the vector table empty, pass - | #[code data_or_width=0]. You can also create the vector table and add + | #[code width=0]. You can also create the vector table and add | vectors one by one, or set the vector values directly on initialisation. +aside-code("Example"). @@ -21,11 +21,11 @@ p empty_vectors = Vectors(StringStore()) - vectors = Vectors([u'cat'], 300) + vectors = Vectors([u'cat'], width=300) vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,)) vector_table = numpy.zeros((3, 300), dtype='f') - vectors = Vectors(StringStore(), vector_table) + vectors = Vectors(StringStore(), data=vector_table) +table(["Name", "Type", "Description"]) +row @@ -36,9 +36,14 @@ p | that maps strings to hash values, and vice versa. +row - +cell #[code data_or_width] - +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int - +cell Vector data or number of dimensions. + +cell #[code data] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell The vector data. + + +row + +cell #[code width] + +cell int + +cell Number of dimensions. +row("foot") +cell returns