From 7996d2171761f7ba9d96e9b71fca93622b7545ca Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 May 2017 11:09:27 -0500 Subject: [PATCH] Fixes for new StringStore --- spacy/strings.pyx | 24 +++++++++++++++++------- spacy/syntax/_state.pxd | 7 ++++--- spacy/syntax/ner.pyx | 3 ++- spacy/syntax/stateclass.pxd | 7 ++++--- spacy/syntax/transition_system.pyx | 25 ++++++++++++++++--------- spacy/tokens/doc.pyx | 2 +- spacy/tokens/span.pxd | 3 ++- spacy/tokens/span.pyx | 2 ++ 8 files changed, 48 insertions(+), 25 deletions(-) diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 8095e01a9..b1b707c6a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -112,9 +112,9 @@ cdef class StringStore: elif isinstance(string_or_id, bytes): key = hash_utf8(string_or_id, len(string_or_id)) return key + elif string_or_id < len(SYMBOLS_BY_INT): + return SYMBOLS_BY_INT[string_or_id] else: - if string_or_id < len(SYMBOLS_BY_INT): - return SYMBOLS_BY_INT[string_or_id] key = string_or_id utf8str = self._map.get(key) if utf8str is NULL: @@ -151,14 +151,24 @@ cdef class StringStore: string (unicode): The string to check. RETURNS (bool): Whether the store contains the string. """ - if len(string) == 0: + cdef hash_t key + if isinstance(string, int) or isinstance(string, long): + if string == 0: + return True + key = string + elif len(string) == 0: return True - if string in SYMBOLS_BY_STR: + elif string in SYMBOLS_BY_STR: return True - if isinstance(string, unicode): + elif isinstance(string, unicode): + key = hash_string(string) + else: string = string.encode('utf8') - cdef hash_t key = hash_utf8(string, len(string)) - return self._map.get(key) is not NULL + key = hash_utf8(string, len(string)) + if key < len(SYMBOLS_BY_INT): + return True + else: + return self._map.get(key) is not NULL def __iter__(self): """Iterate over the strings in the store, in order. diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 0b29412bf..9e7ebcec0 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity from ..lexeme cimport Lexeme from ..symbols cimport punct from ..attrs cimport IS_SPACE +from ..typedefs cimport attr_t cdef inline bint is_space_token(const TokenC* token) nogil: @@ -268,7 +269,7 @@ cdef cppclass StateC: this._s_i -= 1 this.shifted[this.B(0)] = True - void add_arc(int head, int child, int label) nogil: + void add_arc(int head, int child, attr_t label) nogil: if this.has_head(child): this.del_arc(this.H(child), child) @@ -312,7 +313,7 @@ cdef cppclass StateC: h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i h.l_kids -= 1 - void open_ent(int label) nogil: + void open_ent(attr_t label) nogil: this._ents[this._e_i].start = this.B(0) this._ents[this._e_i].label = label this._ents[this._e_i].end = -1 @@ -324,7 +325,7 @@ cdef cppclass StateC: this._ents[this._e_i-1].end = this.B(0)+1 this._sent[this.B(0)].ent_iob = 1 - void set_ent_tag(int i, int ent_iob, int ent_type) nogil: + void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil: if 0 <= i < this.length: this._sent[i].ent_iob = ent_iob this._sent[i].ent_type = ent_type diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 4537c4523..93d98a8cd 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -123,6 +123,7 @@ cdef class BiluoPushDown(TransitionSystem): return gold cdef Transition lookup_transition(self, object name) except *: + cdef attr_t label if name == '-' or name == None: move_str = 'M' label = 0 @@ -241,7 +242,7 @@ cdef class Begin: @staticmethod cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: cdef int g_act = gold.ner[s.B(0)].move - cdef int g_tag = gold.ner[s.B(0)].label + cdef attr_t g_tag = gold.ner[s.B(0)].label if g_act == MISSING: return 0 diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 62fda5ade..0ae83ee27 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -4,6 +4,7 @@ from cymem.cymem cimport Pool cimport cython from ..structs cimport TokenC, Entity +from ..typedefs cimport attr_t from ..vocab cimport EMPTY_LEXEME from ._state cimport StateC @@ -105,19 +106,19 @@ cdef class StateClass: cdef inline void unshift(self) nogil: self.c.unshift() - cdef inline void add_arc(self, int head, int child, int label) nogil: + cdef inline void add_arc(self, int head, int child, attr_t label) nogil: self.c.add_arc(head, child, label) cdef inline void del_arc(self, int head, int child) nogil: self.c.del_arc(head, child) - cdef inline void open_ent(self, int label) nogil: + cdef inline void open_ent(self, attr_t label) nogil: self.c.open_ent(label) cdef inline void close_ent(self) nogil: self.c.close_ent() - cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: + cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil: self.c.set_ent_tag(i, ent_iob, ent_type) cdef inline void set_break(self, int i) nogil: diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 885319717..a5506e537 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -10,6 +10,7 @@ from collections import defaultdict, OrderedDict from ..structs cimport TokenC from .stateclass cimport StateClass from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB +from ..typedefs cimport attr_t cdef weight_t MIN_SCORE = -90000 @@ -37,7 +38,7 @@ cdef class TransitionSystem: for action, label_strs in labels_by_action.items(): for label_str in label_strs: self.add_action(int(action), label_str) - self.root_label = self.strings['ROOT'] + self.root_label = self.strings.add('ROOT') self.init_beam_state = _init_state def __reduce__(self): @@ -125,24 +126,30 @@ cdef class TransitionSystem: if n_gold <= 0: print(gold.words) print(gold.ner) + print([gold.c.ner[i].clas for i in range(gold.length)]) + print([gold.c.ner[i].move for i in range(gold.length)]) + print([gold.c.ner[i].label for i in range(gold.length)]) + print("Self labels", [self.c[i].label for i in range(self.n_moves)]) raise ValueError( "Could not find a gold-standard action to supervise " "the entity recognizer\n" - "The transition system has %d actions.\n" - "%s" % (self.n_moves)) + "The transition system has %d actions." % (self.n_moves)) - def add_action(self, int action, label): - if not isinstance(label, int): - label = self.strings[label] + def add_action(self, int action, label_name): + cdef attr_t label_id + if not isinstance(label_name, int): + label_id = self.strings.add(label_name) + else: + label_id = label_name # Check we're not creating a move we already have, so that this is # idempotent for trans in self.c[:self.n_moves]: - if trans.move == action and trans.label == label: + if trans.move == action and trans.label == label_id: return 0 if self.n_moves >= self._size: self._size *= 2 self.c = self.mem.realloc(self.c, self._size * sizeof(self.c[0])) - - self.c[self.n_moves] = self.init_transition(self.n_moves, action, label) + self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) + assert self.c[self.n_moves].label == label_id self.n_moves += 1 return 1 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a55d3fb3a..51e61507e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -336,7 +336,7 @@ cdef class Doc: cdef int i cdef const TokenC* token cdef int start = -1 - cdef int label = 0 + cdef attr_t label = 0 output = [] for i in range(self.length): token = &self.c[i] diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 303933d42..8d675c04f 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -1,6 +1,7 @@ cimport numpy as np from .doc cimport Doc +from ..typedefs cimport attr_t cdef class Span: @@ -9,7 +10,7 @@ cdef class Span: cdef readonly int end cdef readonly int start_char cdef readonly int end_char - cdef readonly int label + cdef readonly attr_t label cdef public _vector cdef public _vector_norm diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index ed5e44ea8..9f2115fe1 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -43,6 +43,7 @@ cdef class Span: self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) else: self.end_char = 0 + assert label in doc.vocab.strings, label self.label = label self._vector = vector self._vector_norm = vector_norm @@ -256,6 +257,7 @@ cdef class Span: # The tricky thing here is that Span accepts its tokenisation changing, # so it's okay once we have the Span objects. See Issue #375 spans = [] + cdef attr_t label for start, end, label in self.doc.noun_chunks_iterator(self): spans.append(Span(self, start, end, label=label)) for span in spans: