mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Fixes for new StringStore
This commit is contained in:
parent
8a24c60c1e
commit
7996d21717
|
@ -112,9 +112,9 @@ cdef class StringStore:
|
||||||
elif isinstance(string_or_id, bytes):
|
elif isinstance(string_or_id, bytes):
|
||||||
key = hash_utf8(string_or_id, len(string_or_id))
|
key = hash_utf8(string_or_id, len(string_or_id))
|
||||||
return key
|
return key
|
||||||
else:
|
elif string_or_id < len(SYMBOLS_BY_INT):
|
||||||
if string_or_id < len(SYMBOLS_BY_INT):
|
|
||||||
return SYMBOLS_BY_INT[string_or_id]
|
return SYMBOLS_BY_INT[string_or_id]
|
||||||
|
else:
|
||||||
key = string_or_id
|
key = string_or_id
|
||||||
utf8str = <Utf8Str*>self._map.get(key)
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
if utf8str is NULL:
|
if utf8str is NULL:
|
||||||
|
@ -151,13 +151,23 @@ cdef class StringStore:
|
||||||
string (unicode): The string to check.
|
string (unicode): The string to check.
|
||||||
RETURNS (bool): Whether the store contains the string.
|
RETURNS (bool): Whether the store contains the string.
|
||||||
"""
|
"""
|
||||||
if len(string) == 0:
|
cdef hash_t key
|
||||||
|
if isinstance(string, int) or isinstance(string, long):
|
||||||
|
if string == 0:
|
||||||
return True
|
return True
|
||||||
if string in SYMBOLS_BY_STR:
|
key = string
|
||||||
|
elif len(string) == 0:
|
||||||
return True
|
return True
|
||||||
if isinstance(string, unicode):
|
elif string in SYMBOLS_BY_STR:
|
||||||
|
return True
|
||||||
|
elif isinstance(string, unicode):
|
||||||
|
key = hash_string(string)
|
||||||
|
else:
|
||||||
string = string.encode('utf8')
|
string = string.encode('utf8')
|
||||||
cdef hash_t key = hash_utf8(string, len(string))
|
key = hash_utf8(string, len(string))
|
||||||
|
if key < len(SYMBOLS_BY_INT):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..structs cimport TokenC, Entity
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
from ..symbols cimport punct
|
from ..symbols cimport punct
|
||||||
from ..attrs cimport IS_SPACE
|
from ..attrs cimport IS_SPACE
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
|
|
||||||
cdef inline bint is_space_token(const TokenC* token) nogil:
|
cdef inline bint is_space_token(const TokenC* token) nogil:
|
||||||
|
@ -268,7 +269,7 @@ cdef cppclass StateC:
|
||||||
this._s_i -= 1
|
this._s_i -= 1
|
||||||
this.shifted[this.B(0)] = True
|
this.shifted[this.B(0)] = True
|
||||||
|
|
||||||
void add_arc(int head, int child, int label) nogil:
|
void add_arc(int head, int child, attr_t label) nogil:
|
||||||
if this.has_head(child):
|
if this.has_head(child):
|
||||||
this.del_arc(this.H(child), child)
|
this.del_arc(this.H(child), child)
|
||||||
|
|
||||||
|
@ -312,7 +313,7 @@ cdef cppclass StateC:
|
||||||
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
h.l_edge = this.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
||||||
h.l_kids -= 1
|
h.l_kids -= 1
|
||||||
|
|
||||||
void open_ent(int label) nogil:
|
void open_ent(attr_t label) nogil:
|
||||||
this._ents[this._e_i].start = this.B(0)
|
this._ents[this._e_i].start = this.B(0)
|
||||||
this._ents[this._e_i].label = label
|
this._ents[this._e_i].label = label
|
||||||
this._ents[this._e_i].end = -1
|
this._ents[this._e_i].end = -1
|
||||||
|
@ -324,7 +325,7 @@ cdef cppclass StateC:
|
||||||
this._ents[this._e_i-1].end = this.B(0)+1
|
this._ents[this._e_i-1].end = this.B(0)+1
|
||||||
this._sent[this.B(0)].ent_iob = 1
|
this._sent[this.B(0)].ent_iob = 1
|
||||||
|
|
||||||
void set_ent_tag(int i, int ent_iob, int ent_type) nogil:
|
void set_ent_tag(int i, int ent_iob, attr_t ent_type) nogil:
|
||||||
if 0 <= i < this.length:
|
if 0 <= i < this.length:
|
||||||
this._sent[i].ent_iob = ent_iob
|
this._sent[i].ent_iob = ent_iob
|
||||||
this._sent[i].ent_type = ent_type
|
this._sent[i].ent_type = ent_type
|
||||||
|
|
|
@ -123,6 +123,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name == None:
|
||||||
move_str = 'M'
|
move_str = 'M'
|
||||||
label = 0
|
label = 0
|
||||||
|
@ -241,7 +242,7 @@ cdef class Begin:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from ..structs cimport TokenC, Entity
|
from ..structs cimport TokenC, Entity
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
from ..vocab cimport EMPTY_LEXEME
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
|
@ -105,19 +106,19 @@ cdef class StateClass:
|
||||||
cdef inline void unshift(self) nogil:
|
cdef inline void unshift(self) nogil:
|
||||||
self.c.unshift()
|
self.c.unshift()
|
||||||
|
|
||||||
cdef inline void add_arc(self, int head, int child, int label) nogil:
|
cdef inline void add_arc(self, int head, int child, attr_t label) nogil:
|
||||||
self.c.add_arc(head, child, label)
|
self.c.add_arc(head, child, label)
|
||||||
|
|
||||||
cdef inline void del_arc(self, int head, int child) nogil:
|
cdef inline void del_arc(self, int head, int child) nogil:
|
||||||
self.c.del_arc(head, child)
|
self.c.del_arc(head, child)
|
||||||
|
|
||||||
cdef inline void open_ent(self, int label) nogil:
|
cdef inline void open_ent(self, attr_t label) nogil:
|
||||||
self.c.open_ent(label)
|
self.c.open_ent(label)
|
||||||
|
|
||||||
cdef inline void close_ent(self) nogil:
|
cdef inline void close_ent(self) nogil:
|
||||||
self.c.close_ent()
|
self.c.close_ent()
|
||||||
|
|
||||||
cdef inline void set_ent_tag(self, int i, int ent_iob, int ent_type) nogil:
|
cdef inline void set_ent_tag(self, int i, int ent_iob, attr_t ent_type) nogil:
|
||||||
self.c.set_ent_tag(i, ent_iob, ent_type)
|
self.c.set_ent_tag(i, ent_iob, ent_type)
|
||||||
|
|
||||||
cdef inline void set_break(self, int i) nogil:
|
cdef inline void set_break(self, int i) nogil:
|
||||||
|
|
|
@ -10,6 +10,7 @@ from collections import defaultdict, OrderedDict
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
@ -37,7 +38,7 @@ cdef class TransitionSystem:
|
||||||
for action, label_strs in labels_by_action.items():
|
for action, label_strs in labels_by_action.items():
|
||||||
for label_str in label_strs:
|
for label_str in label_strs:
|
||||||
self.add_action(int(action), label_str)
|
self.add_action(int(action), label_str)
|
||||||
self.root_label = self.strings['ROOT']
|
self.root_label = self.strings.add('ROOT')
|
||||||
self.init_beam_state = _init_state
|
self.init_beam_state = _init_state
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -125,24 +126,30 @@ cdef class TransitionSystem:
|
||||||
if n_gold <= 0:
|
if n_gold <= 0:
|
||||||
print(gold.words)
|
print(gold.words)
|
||||||
print(gold.ner)
|
print(gold.ner)
|
||||||
|
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||||
|
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||||
|
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||||
|
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise "
|
"Could not find a gold-standard action to supervise "
|
||||||
"the entity recognizer\n"
|
"the entity recognizer\n"
|
||||||
"The transition system has %d actions.\n"
|
"The transition system has %d actions." % (self.n_moves))
|
||||||
"%s" % (self.n_moves))
|
|
||||||
|
|
||||||
def add_action(self, int action, label):
|
def add_action(self, int action, label_name):
|
||||||
if not isinstance(label, int):
|
cdef attr_t label_id
|
||||||
label = self.strings[label]
|
if not isinstance(label_name, int):
|
||||||
|
label_id = self.strings.add(label_name)
|
||||||
|
else:
|
||||||
|
label_id = label_name
|
||||||
# Check we're not creating a move we already have, so that this is
|
# Check we're not creating a move we already have, so that this is
|
||||||
# idempotent
|
# idempotent
|
||||||
for trans in self.c[:self.n_moves]:
|
for trans in self.c[:self.n_moves]:
|
||||||
if trans.move == action and trans.label == label:
|
if trans.move == action and trans.label == label_id:
|
||||||
return 0
|
return 0
|
||||||
if self.n_moves >= self._size:
|
if self.n_moves >= self._size:
|
||||||
self._size *= 2
|
self._size *= 2
|
||||||
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||||
|
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label)
|
assert self.c[self.n_moves].label == label_id
|
||||||
self.n_moves += 1
|
self.n_moves += 1
|
||||||
return 1
|
return 1
|
||||||
|
|
|
@ -336,7 +336,7 @@ cdef class Doc:
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef int start = -1
|
cdef int start = -1
|
||||||
cdef int label = 0
|
cdef attr_t label = 0
|
||||||
output = []
|
output = []
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.c[i]
|
token = &self.c[i]
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from .doc cimport Doc
|
from .doc cimport Doc
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
|
@ -9,7 +10,7 @@ cdef class Span:
|
||||||
cdef readonly int end
|
cdef readonly int end
|
||||||
cdef readonly int start_char
|
cdef readonly int start_char
|
||||||
cdef readonly int end_char
|
cdef readonly int end_char
|
||||||
cdef readonly int label
|
cdef readonly attr_t label
|
||||||
|
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
cdef public _vector_norm
|
cdef public _vector_norm
|
||||||
|
|
|
@ -43,6 +43,7 @@ cdef class Span:
|
||||||
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
|
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
|
||||||
else:
|
else:
|
||||||
self.end_char = 0
|
self.end_char = 0
|
||||||
|
assert label in doc.vocab.strings, label
|
||||||
self.label = label
|
self.label = label
|
||||||
self._vector = vector
|
self._vector = vector
|
||||||
self._vector_norm = vector_norm
|
self._vector_norm = vector_norm
|
||||||
|
@ -256,6 +257,7 @@ cdef class Span:
|
||||||
# The tricky thing here is that Span accepts its tokenisation changing,
|
# The tricky thing here is that Span accepts its tokenisation changing,
|
||||||
# so it's okay once we have the Span objects. See Issue #375
|
# so it's okay once we have the Span objects. See Issue #375
|
||||||
spans = []
|
spans = []
|
||||||
|
cdef attr_t label
|
||||||
for start, end, label in self.doc.noun_chunks_iterator(self):
|
for start, end, label in self.doc.noun_chunks_iterator(self):
|
||||||
spans.append(Span(self, start, end, label=label))
|
spans.append(Span(self, start, end, label=label))
|
||||||
for span in spans:
|
for span in spans:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user