mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
WIP on stringstore change. 27 failures
This commit is contained in:
parent
fe4a746300
commit
84e66ca6d4
|
@ -150,6 +150,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
else:
|
||||
int_key = IDS[name.upper()]
|
||||
if strings_map is not None and isinstance(value, basestring):
|
||||
value = strings_map[value]
|
||||
value = strings_map.add(value)
|
||||
inty_attrs[int_key] = value
|
||||
return inty_attrs
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .typedefs cimport attr_t
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
|
||||
cdef struct GoldParseC:
|
||||
int* tags
|
||||
int* heads
|
||||
int* labels
|
||||
attr_t* labels
|
||||
int** brackets
|
||||
Transition* ner
|
||||
|
||||
|
|
|
@ -384,7 +384,7 @@ cdef class GoldParse:
|
|||
# These are filled by the tagger/parser/entity recogniser
|
||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
|
||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||
|
||||
self.words = [None] * len(doc)
|
||||
|
|
|
@ -35,7 +35,7 @@ cdef class Lexeme:
|
|||
@staticmethod
|
||||
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
||||
buff = <unsigned char*>&lex.flags
|
||||
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
||||
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||
for i in range(sizeof(lex_data.data)):
|
||||
buff[i] = lex_data.data[i]
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ cdef class Morphology:
|
|||
self.tag_map[tag_str] = dict(attrs)
|
||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||
self.rich_tags[i].id = i
|
||||
self.rich_tags[i].name = self.strings[tag_str]
|
||||
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||
self.rich_tags[i].morph = 0
|
||||
self.rich_tags[i].pos = attrs[POS]
|
||||
self.reverse_index[self.rich_tags[i].name] = i
|
||||
|
@ -59,10 +59,12 @@ cdef class Morphology:
|
|||
|
||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||
if isinstance(tag, basestring):
|
||||
tag_id = self.reverse_index[self.strings[tag]]
|
||||
else:
|
||||
tag = self.strings.add(tag)
|
||||
if tag in self.reverse_index:
|
||||
tag_id = self.reverse_index[tag]
|
||||
self.assign_tag_id(token, tag_id)
|
||||
else:
|
||||
token.tag = tag
|
||||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id >= self.n_tags:
|
||||
|
@ -73,7 +75,7 @@ cdef class Morphology:
|
|||
# the statistical model fails.
|
||||
# Related to Issue #220
|
||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||
tag_id = self.reverse_index[self.strings['SP']]
|
||||
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||
rich_tag = self.rich_tags[tag_id]
|
||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||
if analysis is NULL:
|
||||
|
@ -104,7 +106,7 @@ cdef class Morphology:
|
|||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
tag = self.strings[tag_str]
|
||||
tag = self.strings.add(tag_str)
|
||||
tag_id = self.reverse_index[tag]
|
||||
orth = self.strings[orth_str]
|
||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||
|
@ -140,9 +142,9 @@ cdef class Morphology:
|
|||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||
cdef unicode py_string = self.strings[orth]
|
||||
if self.lemmatizer is None:
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||
return self.strings[py_string.lower()]
|
||||
return self.strings.add(py_string.lower())
|
||||
cdef set lemma_strings
|
||||
cdef unicode lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||
|
|
|
@ -23,7 +23,6 @@ cdef struct LexemeC:
|
|||
|
||||
float prob
|
||||
float sentiment
|
||||
float l2_norm
|
||||
|
||||
|
||||
cdef struct SerializedLexemeC:
|
||||
|
@ -48,7 +47,7 @@ cdef struct Entity:
|
|||
hash_t id
|
||||
int start
|
||||
int end
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
|
||||
cdef struct TokenC:
|
||||
|
@ -56,10 +55,10 @@ cdef struct TokenC:
|
|||
uint64_t morph
|
||||
univ_pos_t pos
|
||||
bint spacy
|
||||
int tag
|
||||
attr_t tag
|
||||
int idx
|
||||
attr_t lemma
|
||||
int sense
|
||||
attr_t sense
|
||||
int head
|
||||
attr_t dep
|
||||
bint sent_start
|
||||
|
@ -70,5 +69,5 @@ cdef struct TokenC:
|
|||
uint32_t r_edge
|
||||
|
||||
int ent_iob
|
||||
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||
hash_t ent_id
|
||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParseC
|
||||
|
|
|
@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
|||
return False
|
||||
|
||||
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
|
||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
|
||||
if gold.labels[child] == -1:
|
||||
return True
|
||||
elif label == -1:
|
||||
|
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
|||
|
||||
cdef class Shift:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -133,17 +133,17 @@ cdef class Shift:
|
|||
return push_cost(s, gold, s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class Reduce:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return st.stack_depth() >= 2
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
if st.has_head(st.S(0)):
|
||||
st.pop()
|
||||
else:
|
||||
|
@ -151,7 +151,7 @@ cdef class Reduce:
|
|||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -170,23 +170,23 @@ cdef class Reduce:
|
|||
return cost
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
|
||||
cdef class LeftArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.B(0), st.S(0), label)
|
||||
st.pop()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -204,23 +204,23 @@ cdef class LeftArc:
|
|||
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||
|
||||
|
||||
cdef class RightArc:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return not st.B_(0).sent_start
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.add_arc(st.S(0), st.B(0), label)
|
||||
st.push()
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -233,13 +233,13 @@ cdef class RightArc:
|
|||
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||
|
||||
|
||||
cdef class Break:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int i
|
||||
if not USE_BREAK:
|
||||
return False
|
||||
|
@ -251,12 +251,12 @@ cdef class Break:
|
|||
return True
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_break(st.B_(0).l_edge)
|
||||
st.fast_forward()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
||||
|
||||
@staticmethod
|
||||
|
@ -281,7 +281,7 @@ cdef class Break:
|
|||
return cost + 1
|
||||
|
||||
@staticmethod
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 0
|
||||
|
||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||
|
@ -369,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
if label.upper() == 'ROOT':
|
||||
label = 'ROOT'
|
||||
gold.c.heads[i] = gold.heads[i]
|
||||
gold.c.labels[i] = self.strings[label]
|
||||
gold.c.labels[i] = self.strings.add(label)
|
||||
return gold
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
|
@ -384,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
|
|||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
label_str = self.strings[label]
|
||||
if label_str:
|
||||
return MOVE_NAMES[move] + '-' + label_str
|
||||
else:
|
||||
return MOVE_NAMES[move]
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -469,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
label_cost_funcs[RIGHT] = RightArc.label_cost
|
||||
label_cost_funcs[BREAK] = Break.label_cost
|
||||
|
||||
cdef int* labels = gold.c.labels
|
||||
cdef attr_t* labels = gold.c.labels
|
||||
cdef int* heads = gold.c.heads
|
||||
|
||||
n_gold = 0
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from .transition_system cimport TransitionSystem
|
||||
from .transition_system cimport Transition
|
||||
from ..gold cimport GoldParseC
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
cdef class BiluoPushDown(TransitionSystem):
|
||||
|
|
|
@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
def __get__(self):
|
||||
return (BEGIN, IN, LAST, UNIT, OUT)
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
def move_name(self, int move, attr_t label):
|
||||
if move == OUT:
|
||||
return 'O'
|
||||
elif move == MISSING:
|
||||
|
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if label_str.startswith('!'):
|
||||
label_str = label_str[1:]
|
||||
move_str = 'x'
|
||||
label = self.strings[label_str]
|
||||
label = self.strings.add(label_str)
|
||||
else:
|
||||
move_str = name
|
||||
label = 0
|
||||
|
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
else:
|
||||
raise KeyError(name)
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
cdef Transition t
|
||||
|
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
cdef class Missing:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* s, int label) nogil:
|
||||
cdef int transition(StateC* s, attr_t label) nogil:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
return 9000
|
||||
|
||||
|
||||
cdef class Begin:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
# Ensure we don't clobber preset entities. If no entity preset,
|
||||
# ent_iob is 0
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
|
@ -232,14 +232,14 @@ cdef class Begin:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
|
@ -261,7 +261,7 @@ cdef class Begin:
|
|||
|
||||
cdef class In:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -277,17 +277,17 @@ cdef class In:
|
|||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = IN
|
||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||
|
||||
if g_act == MISSING:
|
||||
|
@ -313,24 +313,24 @@ cdef class In:
|
|||
|
||||
cdef class Last:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
if st.B_(1).ent_iob == 1:
|
||||
return False
|
||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 1, label)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
move = LAST
|
||||
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -355,7 +355,7 @@ cdef class Last:
|
|||
|
||||
cdef class Unit:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 2:
|
||||
return False
|
||||
|
@ -368,7 +368,7 @@ cdef class Unit:
|
|||
return label != 0 and not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.open_ent(label)
|
||||
st.close_ent()
|
||||
st.set_ent_tag(st.B(0), 3, label)
|
||||
|
@ -376,9 +376,9 @@ cdef class Unit:
|
|||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING:
|
||||
return 0
|
||||
|
@ -398,7 +398,7 @@ cdef class Unit:
|
|||
|
||||
cdef class Out:
|
||||
@staticmethod
|
||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||
if preset_ent_iob == 3:
|
||||
return False
|
||||
|
@ -407,15 +407,15 @@ cdef class Out:
|
|||
return not st.entity_is_open()
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, int label) nogil:
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
st.set_ent_tag(st.B(0), 2, 0)
|
||||
st.push()
|
||||
st.pop()
|
||||
|
||||
@staticmethod
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||
cdef int g_act = gold.ner[s.B(0)].move
|
||||
cdef int g_tag = gold.ner[s.B(0)].label
|
||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||
|
||||
if g_act == MISSING or g_act == ISNT:
|
||||
return 0
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
|
||||
from ..typedefs cimport attr_t
|
||||
from ..structs cimport TokenC
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
|
@ -13,20 +14,22 @@ from ._state cimport StateC
|
|||
cdef struct Transition:
|
||||
int clas
|
||||
int move
|
||||
int label
|
||||
attr_t label
|
||||
|
||||
weight_t score
|
||||
|
||||
bint (*is_valid)(const StateC* state, int label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
int (*do)(StateC* state, int label) nogil
|
||||
bint (*is_valid)(const StateC* state, attr_t label) nogil
|
||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
|
||||
int (*do)(StateC* state, attr_t label) nogil
|
||||
|
||||
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
|
||||
attr_tlabel) nogil
|
||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
|
||||
gold, attr_t label) nogil
|
||||
|
||||
ctypedef int (*do_func_t)(StateC* state, int label) nogil
|
||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||
|
||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||
|
||||
|
@ -36,7 +39,7 @@ cdef class TransitionSystem:
|
|||
cdef Transition* c
|
||||
cdef readonly int n_moves
|
||||
cdef int _size
|
||||
cdef public int root_label
|
||||
cdef public attr_t root_label
|
||||
cdef public freqs
|
||||
cdef init_state_t init_beam_state
|
||||
|
||||
|
@ -45,7 +48,7 @@ cdef class TransitionSystem:
|
|||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
|
||||
|
||||
cdef int set_valid(self, int* output, const StateC* st) nogil
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ cdef class TransitionSystem:
|
|||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
raise NotImplementedError
|
||||
|
||||
def is_valid(self, StateClass stcls, move_name):
|
||||
|
|
|
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
|||
assert doc[6].right_edge.text == ','
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text,vectors', [
|
||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
||||
])
|
||||
|
|
|
@ -11,7 +11,6 @@ import struct
|
|||
import dill
|
||||
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.math cimport sqrt
|
||||
|
||||
from .span cimport Span
|
||||
|
@ -21,6 +20,7 @@ from .token cimport Token
|
|||
from .printers import parse_tree
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs import intify_attrs
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
|
@ -494,8 +494,8 @@ cdef class Doc:
|
|||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||
|
@ -640,7 +640,7 @@ cdef class Doc:
|
|||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError("Cannot load into non-empty Doc")
|
||||
cdef int[:, :] attrs
|
||||
cdef attr_t[:, :] attrs
|
||||
cdef int i, start, end, has_space
|
||||
fields = dill.loads(data)
|
||||
text, attrs = fields[:2]
|
||||
|
@ -679,17 +679,15 @@ cdef class Doc:
|
|||
if len(args) == 3:
|
||||
# TODO: Warn deprecation
|
||||
tag, lemma, ent_type = args
|
||||
attributes[TAG] = self.vocab.strings[tag]
|
||||
attributes[LEMMA] = self.vocab.strings[lemma]
|
||||
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
|
||||
attributes[TAG] = tag
|
||||
attributes[LEMMA] = lemma
|
||||
attributes[ENT_TYPE] = ent_type
|
||||
elif not args:
|
||||
# TODO: This code makes little sense overall. We're still
|
||||
# ignoring most of the attributes?
|
||||
if "label" in attributes and 'ent_type' not in attributes:
|
||||
if type(attributes["label"]) == int:
|
||||
attributes[ENT_TYPE] = attributes["label"]
|
||||
else:
|
||||
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
|
||||
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
|
||||
if 'ent_type' in attributes:
|
||||
attributes[ENT_TYPE] = attributes['ent_type']
|
||||
elif args:
|
||||
|
@ -699,6 +697,8 @@ cdef class Doc:
|
|||
"Arguments supplied:\n%s\n"
|
||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
||||
|
||||
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
|
||||
|
||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||
if start == -1:
|
||||
return None
|
||||
|
@ -708,13 +708,6 @@ cdef class Doc:
|
|||
# Currently we have the token index, we want the range-end index
|
||||
end += 1
|
||||
cdef Span span = self[start:end]
|
||||
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
|
||||
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
|
||||
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
|
||||
ent_id = attributes.get('ent_id', span.root.ent_id)
|
||||
if isinstance(ent_id, basestring):
|
||||
ent_id = self.vocab.strings[ent_id]
|
||||
|
||||
# Get LexemeC for newly merged token
|
||||
new_orth = ''.join([t.text_with_ws for t in span])
|
||||
if span[-1].whitespace_:
|
||||
|
@ -723,18 +716,11 @@ cdef class Doc:
|
|||
# House the new merged token where it starts
|
||||
cdef TokenC* token = &self.c[start]
|
||||
token.spacy = self.c[end-1].spacy
|
||||
if tag in self.vocab.morphology.tag_map:
|
||||
self.vocab.morphology.assign_tag(token, tag)
|
||||
for attr_name, attr_value in attributes.items():
|
||||
if attr_name == TAG:
|
||||
self.vocab.morphology.assign_tag(token, attr_value)
|
||||
else:
|
||||
token.tag = self.vocab.strings[tag]
|
||||
token.lemma = self.vocab.strings[lemma]
|
||||
if ent_type == 'O':
|
||||
token.ent_iob = 2
|
||||
token.ent_type = 0
|
||||
else:
|
||||
token.ent_iob = 3
|
||||
token.ent_type = self.vocab.strings[ent_type]
|
||||
token.ent_id = ent_id
|
||||
Token.set_struct_attr(token, attr_name, attr_value)
|
||||
# Begin by setting all the head indices to absolute token positions
|
||||
# This is easier to work with for now than the offsets
|
||||
# Before thinking of something simpler, beware the case where a dependency
|
||||
|
|
|
@ -21,14 +21,14 @@ from .. import about
|
|||
|
||||
cdef class Span:
|
||||
"""A slice from a Doc object."""
|
||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
|
@ -377,7 +377,7 @@ cdef class Span:
|
|||
property ent_id:
|
||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (int): The entity ID.
|
||||
RETURNS (uint64): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
|
Loading…
Reference in New Issue
Block a user