Fixes for new StringStore

This commit is contained in:
Matthew Honnibal 2017-05-28 11:09:27 -05:00
parent 8a24c60c1e
commit 7996d21717
8 changed files with 48 additions and 25 deletions

View File

@ -112,9 +112,9 @@ cdef class StringStore:
elif isinstance(string_or_id, bytes):
key = hash_utf8(string_or_id, len(string_or_id))
return key
elif string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
else:
if string_or_id < len(SYMBOLS_BY_INT):
return SYMBOLS_BY_INT[string_or_id]
key = string_or_id
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
@ -151,14 +151,24 @@ cdef class StringStore:
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
if len(string) == 0:
cdef hash_t key
if isinstance(string, int) or isinstance(string, long):
if string == 0:
return True
key = string
elif len(string) == 0:
return True
if string in SYMBOLS_BY_STR:
elif string in SYMBOLS_BY_STR:
return True
if isinstance(string, unicode):
elif isinstance(string, unicode):
key = hash_string(string)
else:
string = string.encode('utf8')
cdef hash_t key = hash_utf8(string, len(string))
return self._map.get(key) is not NULL
key = hash_utf8(string, len(string))
if key < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.

View File

@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..typedefs cimport attr_t
cdef inline bint is_space_token(const TokenC* token) nogil:
@ -268,7 +269,7 @@ cdef cppclass StateC:
this._s_i -= 1
this.shifted[this.B(0)] = True
void add_arc(int head, int child, int label) nogil:
void add_arc(int head, int child, attr_t label) nogil:
if this.has_head(child):
this.del_arc(this.H(child), child)
@ -312,7 +313,7 @@ cdef cppclass StateC:
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
h.l_kids -= 1
void open_ent(int label) nogil:
void open_ent(attr_t label) nogil:
this._ents[this._e_i].start = this.B(0)
this._ents[this._e_i].label = label
this._ents[this._e_i].end = -1
@ -324,7 +325,7 @@ cdef cppclass StateC:
this._ents[this._e_i-1].end = this.B(0)+1
this._sent[this.B(0)].ent_iob = 1
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
if 0 <= i < this.length:
this._sent[i].ent_iob = ent_iob
this._sent[i].ent_type = ent_type

View File

@ -123,6 +123,7 @@ cdef class BiluoPushDown(TransitionSystem):
return gold
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
move_str = 'M'
label = 0
@ -241,7 +242,7 @@ cdef class Begin:
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0

View File

@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
cimport cython
from ..structs cimport TokenC, Entity
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME
from ._state cimport StateC
@ -105,19 +106,19 @@ cdef class StateClass:
cdef inline void unshift(self) nogil:
self.c.unshift()
cdef inline void add_arc(self, int head, int child, int label) nogil:
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
self.c.add_arc(head, child, label)
cdef inline void del_arc(self, int head, int child) nogil:
self.c.del_arc(head, child)
cdef inline void open_ent(self, int label) nogil:
cdef inline void open_ent(self, attr_t label) nogil:
self.c.open_ent(label)
cdef inline void close_ent(self) nogil:
self.c.close_ent()
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
self.c.set_ent_tag(i, ent_iob, ent_type)
cdef inline void set_break(self, int i) nogil:

View File

@ -10,6 +10,7 @@ from collections import defaultdict, OrderedDict
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
cdef weight_t MIN_SCORE = -90000
@ -37,7 +38,7 @@ cdef class TransitionSystem:
for action, label_strs in labels_by_action.items():
for label_str in label_strs:
self.add_action(int(action), label_str)
self.root_label = self.strings['ROOT']
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
def __reduce__(self):
@ -125,24 +126,30 @@ cdef class TransitionSystem:
if n_gold <= 0:
print(gold.words)
print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions.\n"
"%s" % (self.n_moves))
"The transition system has %d actions." % (self.n_moves))
def add_action(self, int action, label):
if not isinstance(label, int):
label = self.strings[label]
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int):
label_id = self.strings.add(label_name)
else:
label_id = label_name
# Check we're not creating a move we already have, so that this is
# idempotent
for trans in self.c[:self.n_moves]:
if trans.move == action and trans.label == label:
if trans.move == action and trans.label == label_id:
return 0
if self.n_moves >= self._size:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1

View File

@ -336,7 +336,7 @@ cdef class Doc:
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef int label = 0
cdef attr_t label = 0
output = []
for i in range(self.length):
token = &self.c[i]

View File

@ -1,6 +1,7 @@
cimport numpy as np
from .doc cimport Doc
from ..typedefs cimport attr_t
cdef class Span:
@ -9,7 +10,7 @@ cdef class Span:
cdef readonly int end
cdef readonly int start_char
cdef readonly int end_char
cdef readonly int label
cdef readonly attr_t label
cdef public _vector
cdef public _vector_norm

View File

@ -43,6 +43,7 @@ cdef class Span:
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
else:
self.end_char = 0
assert label in doc.vocab.strings, label
self.label = label
self._vector = vector
self._vector_norm = vector_norm
@ -256,6 +257,7 @@ cdef class Span:
# The tricky thing here is that Span accepts its tokenisation changing,
# so it's okay once we have the Span objects. See Issue #375
spans = []
cdef attr_t label
for start, end, label in self.doc.noun_chunks_iterator(self):
spans.append(Span(self, start, end, label=label))
for span in spans: