Fixes for new StringStore

This commit is contained in:
Matthew Honnibal 2017-05-28 11:09:27 -05:00
parent 8a24c60c1e
commit 7996d21717
8 changed files with 48 additions and 25 deletions

View File

@@ -112,9 +112,9 @@ cdef class StringStore:
elif isinstance(string_or_id, bytes): elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id)) key = hash_utf8(string_or_id, len(string_or_id))
return key return key
else: elif string_or_id < len(SYMBOLS_BY_INT):
if string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id] return SYMBOLS_BY_INT[string_or_id]
else:
key = string_or_id key = string_or_id
utf8str = <Utf8Str*>self._map.get(key) utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL: if utf8str is NULL:
@@ -151,13 +151,23 @@ cdef class StringStore:
string (unicode): The string to check. string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string. RETURNS (bool): Whether the store contains the string.
""" """
if len(string) == 0: cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True return True
if string in SYMBOLS_BY_STR: key = string
elif len(string) == 0:
return True return True
if isinstance(string, unicode): elif string in SYMBOLS_BY_STR:
return True
elif isinstance(string, unicode):
key = hash_string(string)
else:
string = string.encode('utf8') string = string.encode('utf8')
cdef hash_t key = hash_utf8(string, len(string)) key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL return self._map.get(key) is not NULL
def __iter__(self): def __iter__(self):

View File

@@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme from ..lexeme cimport Lexeme
from ..symbols cimport punct from ..symbols cimport punct
from ..attrs cimport IS_SPACE from ..attrs cimport IS_SPACE
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil: cdef inline bint is_space_token(const TokenC* token) nogil:
@@ -268,7 +269,7 @@ cdef cppclass StateC:
this._s_i -= 1 this._s_i -= 1
this.shifted[this.B(0)] = True this.shifted[this.B(0)] = True
void add_arc(int head, int child, int label) nogil: void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child): if this.has_head(child):
this.del_arc(this.H(child), child) this.del_arc(this.H(child), child)
@@ -312,7 +313,7 @@ cdef cppclass StateC:
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1 h.l_kids -= 1
void open_ent(int label) nogil: void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0) this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1 this._ents[this._e_i].end = -1
@@ -324,7 +325,7 @@ cdef cppclass StateC:
this._ents[this._e_i-1].end = this.B(0)+1 this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1 this._sent[this.B(0)].ent_iob = 1
void set_ent_tag(int i, int ent_iob, int ent_type) nogil: void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length: if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type this._sent[i].ent_type = ent_type

View File

@@ -123,6 +123,7 @@ cdef class BiluoPushDown(TransitionSystem):
return gold return gold
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None: if name == '-' or name == None:
move_str = 'M' move_str = 'M'
label = 0 label = 0
@@ -241,7 +242,7 @@ cdef class Begin:
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0

View File

@@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
cimport cython cimport cython
from ..structs cimport TokenC, Entity from ..structs cimport TokenC, Entity
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME from ..vocab cimport EMPTY_LEXEME
from ._state cimport StateC from ._state cimport StateC
@@ -105,19 +106,19 @@ cdef class StateClass:
cdef inline void unshift(self) nogil: cdef inline void unshift(self) nogil:
self.c.unshift() self.c.unshift()
cdef inline void add_arc(self, int head, int child, int label) nogil: cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
self.c.add_arc(head, child, label) self.c.add_arc(head, child, label)
cdef inline void del_arc(self, int head, int child) nogil: cdef inline void del_arc(self, int head, int child) nogil:
self.c.del_arc(head, child) self.c.del_arc(head, child)
cdef inline void open_ent(self, int label) nogil: cdef inline void open_ent(self, attr_t label) nogil:
self.c.open_ent(label) self.c.open_ent(label)
cdef inline void close_ent(self) nogil: cdef inline void close_ent(self) nogil:
self.c.close_ent() self.c.close_ent()
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil: cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
self.c.set_ent_tag(i, ent_iob, ent_type) self.c.set_ent_tag(i, ent_iob, ent_type)
cdef inline void set_break(self, int i) nogil: cdef inline void set_break(self, int i) nogil:

View File

@@ -10,6 +10,7 @@ from collections import defaultdict, OrderedDict
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
cdef weight_t MIN_SCORE = -90000 cdef weight_t MIN_SCORE = -90000
@@ -37,7 +38,7 @@ cdef class TransitionSystem:
for action, label_strs in labels_by_action.items(): for action, label_strs in labels_by_action.items():
for label_str in label_strs: for label_str in label_strs:
self.add_action(int(action), label_str) self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT'] self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state self.init_beam_state = _init_state
def __reduce__(self): def __reduce__(self):
@@ -125,24 +126,30 @@ cdef class TransitionSystem:
if n_gold <= 0: if n_gold <= 0:
print(gold.words) print(gold.words)
print(gold.ner) print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
raise ValueError( raise ValueError(
"Could not find a gold-standard action to supervise " "Could not find a gold-standard action to supervise "
"the entity recognizer\n" "the entity recognizer\n"
"The transition system has %d actions.\n" "The transition system has %d actions." % (self.n_moves))
"%s" % (self.n_moves))
def add_action(self, int action, label): def add_action(self, int action, label_name):
if not isinstance(label, int): cdef attr_t label_id
label = self.strings[label] if not isinstance(label_name, int):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is # Check we're not creating a move we already have, so that this is
# idempotent # idempotent
for trans in self.c[:self.n_moves]: for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label: if trans.move == action and trans.label == label_id:
return 0 return 0
if self.n_moves >= self._size: if self.n_moves >= self._size:
self._size *= 2 self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label) assert self.c[self.n_moves].label == label_id
self.n_moves += 1 self.n_moves += 1
return 1 return 1

View File

@@ -336,7 +336,7 @@ cdef class Doc:
cdef int i cdef int i
cdef const TokenC* token cdef const TokenC* token
cdef int start = -1 cdef int start = -1
cdef int label = 0 cdef attr_t label = 0
output = [] output = []
for i in range(self.length): for i in range(self.length):
token = &self.c[i] token = &self.c[i]

View File

@@ -1,6 +1,7 @@
cimport numpy as np cimport numpy as np
from .doc cimport Doc from .doc cimport Doc
from ..typedefs cimport attr_t
cdef class Span: cdef class Span:
@@ -9,7 +10,7 @@ cdef class Span:
cdef readonly int end cdef readonly int end
cdef readonly int start_char cdef readonly int start_char
cdef readonly int end_char cdef readonly int end_char
cdef readonly int label cdef readonly attr_t label
cdef public _vector cdef public _vector
cdef public _vector_norm cdef public _vector_norm

View File

@@ -43,6 +43,7 @@ cdef class Span:
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
else: else:
self.end_char = 0 self.end_char = 0
assert label in doc.vocab.strings, label
self.label = label self.label = label
self._vector = vector self._vector = vector
self._vector_norm = vector_norm self._vector_norm = vector_norm
@@ -256,6 +257,7 @@ cdef class Span:
# The tricky thing here is that Span accepts its tokenisation changing, # The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375 # so it's okay once we have the Span objects. See Issue #375
spans = [] spans = []
cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self): for start, end, label in self.doc.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label)) spans.append(Span(self, start, end, label=label))
for span in spans: for span in spans: