mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
WIP on stringstore change. 27 failures
This commit is contained in:
parent
fe4a746300
commit
84e66ca6d4
|
@ -150,6 +150,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
else:
|
else:
|
||||||
int_key = IDS[name.upper()]
|
int_key = IDS[name.upper()]
|
||||||
if strings_map is not None and isinstance(value, basestring):
|
if strings_map is not None and isinstance(value, basestring):
|
||||||
value = strings_map[value]
|
value = strings_map.add(value)
|
||||||
inty_attrs[int_key] = value
|
inty_attrs[int_key] = value
|
||||||
return inty_attrs
|
return inty_attrs
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
|
from .typedefs cimport attr_t
|
||||||
from .syntax.transition_system cimport Transition
|
from .syntax.transition_system cimport Transition
|
||||||
|
|
||||||
|
|
||||||
cdef struct GoldParseC:
|
cdef struct GoldParseC:
|
||||||
int* tags
|
int* tags
|
||||||
int* heads
|
int* heads
|
||||||
int* labels
|
attr_t* labels
|
||||||
int** brackets
|
int** brackets
|
||||||
Transition* ner
|
Transition* ner
|
||||||
|
|
||||||
|
|
|
@ -384,7 +384,7 @@ cdef class GoldParse:
|
||||||
# These are filled by the tagger/parser/entity recogniser
|
# These are filled by the tagger/parser/entity recogniser
|
||||||
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
|
||||||
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
|
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
|
||||||
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
|
||||||
|
|
||||||
self.words = [None] * len(doc)
|
self.words = [None] * len(doc)
|
||||||
|
|
|
@ -35,7 +35,7 @@ cdef class Lexeme:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
|
||||||
buff = <unsigned char*>&lex.flags
|
buff = <unsigned char*>&lex.flags
|
||||||
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
|
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
|
||||||
for i in range(sizeof(lex_data.data)):
|
for i in range(sizeof(lex_data.data)):
|
||||||
buff[i] = lex_data.data[i]
|
buff[i] = lex_data.data[i]
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ cdef class Morphology:
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self.rich_tags[i].id = i
|
self.rich_tags[i].id = i
|
||||||
self.rich_tags[i].name = self.strings[tag_str]
|
self.rich_tags[i].name = self.strings.add(tag_str)
|
||||||
self.rich_tags[i].morph = 0
|
self.rich_tags[i].morph = 0
|
||||||
self.rich_tags[i].pos = attrs[POS]
|
self.rich_tags[i].pos = attrs[POS]
|
||||||
self.reverse_index[self.rich_tags[i].name] = i
|
self.reverse_index[self.rich_tags[i].name] = i
|
||||||
|
@ -59,10 +59,12 @@ cdef class Morphology:
|
||||||
|
|
||||||
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
cdef int assign_tag(self, TokenC* token, tag) except -1:
|
||||||
if isinstance(tag, basestring):
|
if isinstance(tag, basestring):
|
||||||
tag_id = self.reverse_index[self.strings[tag]]
|
tag = self.strings.add(tag)
|
||||||
else:
|
if tag in self.reverse_index:
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
self.assign_tag_id(token, tag_id)
|
self.assign_tag_id(token, tag_id)
|
||||||
|
else:
|
||||||
|
token.tag = tag
|
||||||
|
|
||||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||||
if tag_id >= self.n_tags:
|
if tag_id >= self.n_tags:
|
||||||
|
@ -73,7 +75,7 @@ cdef class Morphology:
|
||||||
# the statistical model fails.
|
# the statistical model fails.
|
||||||
# Related to Issue #220
|
# Related to Issue #220
|
||||||
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
if Lexeme.c_check_flag(token.lex, IS_SPACE):
|
||||||
tag_id = self.reverse_index[self.strings['SP']]
|
tag_id = self.reverse_index[self.strings.add('SP')]
|
||||||
rich_tag = self.rich_tags[tag_id]
|
rich_tag = self.rich_tags[tag_id]
|
||||||
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
|
||||||
if analysis is NULL:
|
if analysis is NULL:
|
||||||
|
@ -104,7 +106,7 @@ cdef class Morphology:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
tag = self.strings[tag_str]
|
tag = self.strings.add(tag_str)
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
cdef RichTagC rich_tag = self.rich_tags[tag_id]
|
||||||
|
@ -140,9 +142,9 @@ cdef class Morphology:
|
||||||
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
|
||||||
cdef unicode py_string = self.strings[orth]
|
cdef unicode py_string = self.strings[orth]
|
||||||
if self.lemmatizer is None:
|
if self.lemmatizer is None:
|
||||||
return self.strings[py_string.lower()]
|
return self.strings.add(py_string.lower())
|
||||||
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
|
||||||
return self.strings[py_string.lower()]
|
return self.strings.add(py_string.lower())
|
||||||
cdef set lemma_strings
|
cdef set lemma_strings
|
||||||
cdef unicode lemma_string
|
cdef unicode lemma_string
|
||||||
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
|
||||||
|
|
|
@ -23,7 +23,6 @@ cdef struct LexemeC:
|
||||||
|
|
||||||
float prob
|
float prob
|
||||||
float sentiment
|
float sentiment
|
||||||
float l2_norm
|
|
||||||
|
|
||||||
|
|
||||||
cdef struct SerializedLexemeC:
|
cdef struct SerializedLexemeC:
|
||||||
|
@ -48,7 +47,7 @@ cdef struct Entity:
|
||||||
hash_t id
|
hash_t id
|
||||||
int start
|
int start
|
||||||
int end
|
int end
|
||||||
int label
|
attr_t label
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
|
@ -56,10 +55,10 @@ cdef struct TokenC:
|
||||||
uint64_t morph
|
uint64_t morph
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
bint spacy
|
bint spacy
|
||||||
int tag
|
attr_t tag
|
||||||
int idx
|
int idx
|
||||||
attr_t lemma
|
attr_t lemma
|
||||||
int sense
|
attr_t sense
|
||||||
int head
|
int head
|
||||||
attr_t dep
|
attr_t dep
|
||||||
bint sent_start
|
bint sent_start
|
||||||
|
@ -70,5 +69,5 @@ cdef struct TokenC:
|
||||||
uint32_t r_edge
|
uint32_t r_edge
|
||||||
|
|
||||||
int ent_iob
|
int ent_iob
|
||||||
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||||
hash_t ent_id
|
hash_t ent_id
|
||||||
|
|
|
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
|
|
@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
|
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
|
||||||
if gold.labels[child] == -1:
|
if gold.labels[child] == -1:
|
||||||
return True
|
return True
|
||||||
elif label == -1:
|
elif label == -1:
|
||||||
|
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
|
||||||
|
|
||||||
cdef class Shift:
|
cdef class Shift:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.push()
|
st.push()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -133,17 +133,17 @@ cdef class Shift:
|
||||||
return push_cost(s, gold, s.B(0))
|
return push_cost(s, gold, s.B(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef class Reduce:
|
cdef class Reduce:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return st.stack_depth() >= 2
|
return st.stack_depth() >= 2
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
if st.has_head(st.S(0)):
|
if st.has_head(st.S(0)):
|
||||||
st.pop()
|
st.pop()
|
||||||
else:
|
else:
|
||||||
|
@ -151,7 +151,7 @@ cdef class Reduce:
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -170,23 +170,23 @@ cdef class Reduce:
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
cdef class LeftArc:
|
cdef class LeftArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return not st.B_(0).sent_start
|
return not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.add_arc(st.B(0), st.S(0), label)
|
st.add_arc(st.B(0), st.S(0), label)
|
||||||
st.pop()
|
st.pop()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -204,23 +204,23 @@ cdef class LeftArc:
|
||||||
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
|
||||||
|
|
||||||
|
|
||||||
cdef class RightArc:
|
cdef class RightArc:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return not st.B_(0).sent_start
|
return not st.B_(0).sent_start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.add_arc(st.S(0), st.B(0), label)
|
st.add_arc(st.S(0), st.B(0), label)
|
||||||
st.push()
|
st.push()
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -233,13 +233,13 @@ cdef class RightArc:
|
||||||
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
|
||||||
|
|
||||||
|
|
||||||
cdef class Break:
|
cdef class Break:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int i
|
cdef int i
|
||||||
if not USE_BREAK:
|
if not USE_BREAK:
|
||||||
return False
|
return False
|
||||||
|
@ -251,12 +251,12 @@ cdef class Break:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_break(st.B_(0).l_edge)
|
st.set_break(st.B_(0).l_edge)
|
||||||
st.fast_forward()
|
st.fast_forward()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -281,7 +281,7 @@ cdef class Break:
|
||||||
return cost + 1
|
return cost + 1
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
cdef int _get_root(int word, const GoldParseC* gold) nogil:
|
||||||
|
@ -369,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if label.upper() == 'ROOT':
|
if label.upper() == 'ROOT':
|
||||||
label = 'ROOT'
|
label = 'ROOT'
|
||||||
gold.c.heads[i] = gold.heads[i]
|
gold.c.heads[i] = gold.heads[i]
|
||||||
gold.c.labels[i] = self.strings[label]
|
gold.c.labels[i] = self.strings.add(label)
|
||||||
return gold
|
return gold
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
|
@ -384,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
|
||||||
def move_name(self, int move, int label):
|
def move_name(self, int move, attr_t label):
|
||||||
label_str = self.strings[label]
|
label_str = self.strings[label]
|
||||||
if label_str:
|
if label_str:
|
||||||
return MOVE_NAMES[move] + '-' + label_str
|
return MOVE_NAMES[move] + '-' + label_str
|
||||||
else:
|
else:
|
||||||
return MOVE_NAMES[move]
|
return MOVE_NAMES[move]
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
cdef Transition t
|
cdef Transition t
|
||||||
|
@ -469,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
label_cost_funcs[RIGHT] = RightArc.label_cost
|
label_cost_funcs[RIGHT] = RightArc.label_cost
|
||||||
label_cost_funcs[BREAK] = Break.label_cost
|
label_cost_funcs[BREAK] = Break.label_cost
|
||||||
|
|
||||||
cdef int* labels = gold.c.labels
|
cdef attr_t* labels = gold.c.labels
|
||||||
cdef int* heads = gold.c.heads
|
cdef int* heads = gold.c.heads
|
||||||
|
|
||||||
n_gold = 0
|
n_gold = 0
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from .transition_system cimport TransitionSystem
|
from .transition_system cimport TransitionSystem
|
||||||
from .transition_system cimport Transition
|
from .transition_system cimport Transition
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
|
|
||||||
cdef class BiluoPushDown(TransitionSystem):
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
|
@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return (BEGIN, IN, LAST, UNIT, OUT)
|
return (BEGIN, IN, LAST, UNIT, OUT)
|
||||||
|
|
||||||
def move_name(self, int move, int label):
|
def move_name(self, int move, attr_t label):
|
||||||
if move == OUT:
|
if move == OUT:
|
||||||
return 'O'
|
return 'O'
|
||||||
elif move == MISSING:
|
elif move == MISSING:
|
||||||
|
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
if label_str.startswith('!'):
|
if label_str.startswith('!'):
|
||||||
label_str = label_str[1:]
|
label_str = label_str[1:]
|
||||||
move_str = 'x'
|
move_str = 'x'
|
||||||
label = self.strings[label_str]
|
label = self.strings.add(label_str)
|
||||||
else:
|
else:
|
||||||
move_str = name
|
move_str = name
|
||||||
label = 0
|
label = 0
|
||||||
|
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
else:
|
else:
|
||||||
raise KeyError(name)
|
raise KeyError(name)
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
cdef Transition t
|
cdef Transition t
|
||||||
|
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
cdef class Missing:
|
cdef class Missing:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* s, int label) nogil:
|
cdef int transition(StateC* s, attr_t label) nogil:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
return 9000
|
return 9000
|
||||||
|
|
||||||
|
|
||||||
cdef class Begin:
|
cdef class Begin:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
# Ensure we don't clobber preset entities. If no entity preset,
|
# Ensure we don't clobber preset entities. If no entity preset,
|
||||||
# ent_iob is 0
|
# ent_iob is 0
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
|
@ -232,14 +232,14 @@ cdef class Begin:
|
||||||
return label != 0 and not st.entity_is_open()
|
return label != 0 and not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.open_ent(label)
|
st.open_ent(label)
|
||||||
st.set_ent_tag(st.B(0), 3, label)
|
st.set_ent_tag(st.B(0), 3, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef int g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
|
@ -261,7 +261,7 @@ cdef class Begin:
|
||||||
|
|
||||||
cdef class In:
|
cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
if preset_ent_iob == 2:
|
||||||
return False
|
return False
|
||||||
|
@ -277,17 +277,17 @@ cdef class In:
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_ent_tag(st.B(0), 1, label)
|
st.set_ent_tag(st.B(0), 1, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
move = IN
|
move = IN
|
||||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
|
@ -313,24 +313,24 @@ cdef class In:
|
||||||
|
|
||||||
cdef class Last:
|
cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
if st.B_(1).ent_iob == 1:
|
if st.B_(1).ent_iob == 1:
|
||||||
return False
|
return False
|
||||||
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.close_ent()
|
st.close_ent()
|
||||||
st.set_ent_tag(st.B(0), 1, label)
|
st.set_ent_tag(st.B(0), 1, label)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
move = LAST
|
move = LAST
|
||||||
|
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
|
@ -355,7 +355,7 @@ cdef class Last:
|
||||||
|
|
||||||
cdef class Unit:
|
cdef class Unit:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 2:
|
if preset_ent_iob == 2:
|
||||||
return False
|
return False
|
||||||
|
@ -368,7 +368,7 @@ cdef class Unit:
|
||||||
return label != 0 and not st.entity_is_open()
|
return label != 0 and not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.open_ent(label)
|
st.open_ent(label)
|
||||||
st.close_ent()
|
st.close_ent()
|
||||||
st.set_ent_tag(st.B(0), 3, label)
|
st.set_ent_tag(st.B(0), 3, label)
|
||||||
|
@ -376,9 +376,9 @@ cdef class Unit:
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
|
@ -398,7 +398,7 @@ cdef class Unit:
|
||||||
|
|
||||||
cdef class Out:
|
cdef class Out:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, int label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int preset_ent_iob = st.B_(0).ent_iob
|
cdef int preset_ent_iob = st.B_(0).ent_iob
|
||||||
if preset_ent_iob == 3:
|
if preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
|
@ -407,15 +407,15 @@ cdef class Out:
|
||||||
return not st.entity_is_open()
|
return not st.entity_is_open()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef int transition(StateC* st, int label) nogil:
|
cdef int transition(StateC* st, attr_t label) nogil:
|
||||||
st.set_ent_tag(st.B(0), 2, 0)
|
st.set_ent_tag(st.B(0), 2, 0)
|
||||||
st.push()
|
st.push()
|
||||||
st.pop()
|
st.pop()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
|
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef int g_tag = gold.ner[s.B(0)].label
|
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
||||||
|
|
||||||
if g_act == MISSING or g_act == ISNT:
|
if g_act == MISSING or g_act == ISNT:
|
||||||
return 0
|
return 0
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
|
|
||||||
|
from ..typedefs cimport attr_t
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
|
@ -13,20 +14,22 @@ from ._state cimport StateC
|
||||||
cdef struct Transition:
|
cdef struct Transition:
|
||||||
int clas
|
int clas
|
||||||
int move
|
int move
|
||||||
int label
|
attr_t label
|
||||||
|
|
||||||
weight_t score
|
weight_t score
|
||||||
|
|
||||||
bint (*is_valid)(const StateC* state, int label) nogil
|
bint (*is_valid)(const StateC* state, attr_t label) nogil
|
||||||
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
|
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
|
||||||
int (*do)(StateC* state, int label) nogil
|
int (*do)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
||||||
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
|
||||||
|
attr_t label) nogil
|
||||||
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
|
||||||
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
|
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
|
||||||
|
gold, attr_t label) nogil
|
||||||
|
|
||||||
ctypedef int (*do_func_t)(StateC* state, int label) nogil
|
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
|
||||||
|
|
||||||
|
@ -36,7 +39,7 @@ cdef class TransitionSystem:
|
||||||
cdef Transition* c
|
cdef Transition* c
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
cdef int _size
|
cdef int _size
|
||||||
cdef public int root_label
|
cdef public attr_t root_label
|
||||||
cdef public freqs
|
cdef public freqs
|
||||||
cdef init_state_t init_beam_state
|
cdef init_state_t init_beam_state
|
||||||
|
|
||||||
|
@ -45,7 +48,7 @@ cdef class TransitionSystem:
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *
|
cdef Transition lookup_transition(self, object name) except *
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
|
||||||
|
|
||||||
cdef int set_valid(self, int* output, const StateC* st) nogil
|
cdef int set_valid(self, int* output, const StateC* st) nogil
|
||||||
|
|
||||||
|
|
|
@ -99,7 +99,7 @@ cdef class TransitionSystem:
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def is_valid(self, StateClass stcls, move_name):
|
def is_valid(self, StateClass stcls, move_name):
|
||||||
|
|
|
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
|
||||||
assert doc[6].right_edge.text == ','
|
assert doc[6].right_edge.text == ','
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text,vectors', [
|
@pytest.mark.parametrize('text,vectors', [
|
||||||
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
|
||||||
])
|
])
|
||||||
|
|
|
@ -11,7 +11,6 @@ import struct
|
||||||
import dill
|
import dill
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
|
@ -21,6 +20,7 @@ from .token cimport Token
|
||||||
from .printers import parse_tree
|
from .printers import parse_tree
|
||||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t, flags_t
|
||||||
|
from ..attrs import intify_attrs
|
||||||
from ..attrs cimport attr_id_t
|
from ..attrs cimport attr_id_t
|
||||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||||
|
@ -494,8 +494,8 @@ cdef class Doc:
|
||||||
cdef np.ndarray[attr_t, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||||
# dict iteration.
|
# dict iteration.
|
||||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
|
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
|
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(attr_ids):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_token_attr(&self.c[i], feature)
|
output[i, j] = get_token_attr(&self.c[i], feature)
|
||||||
|
@ -640,7 +640,7 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
if self.length != 0:
|
if self.length != 0:
|
||||||
raise ValueError("Cannot load into non-empty Doc")
|
raise ValueError("Cannot load into non-empty Doc")
|
||||||
cdef int[:, :] attrs
|
cdef attr_t[:, :] attrs
|
||||||
cdef int i, start, end, has_space
|
cdef int i, start, end, has_space
|
||||||
fields = dill.loads(data)
|
fields = dill.loads(data)
|
||||||
text, attrs = fields[:2]
|
text, attrs = fields[:2]
|
||||||
|
@ -679,17 +679,15 @@ cdef class Doc:
|
||||||
if len(args) == 3:
|
if len(args) == 3:
|
||||||
# TODO: Warn deprecation
|
# TODO: Warn deprecation
|
||||||
tag, lemma, ent_type = args
|
tag, lemma, ent_type = args
|
||||||
attributes[TAG] = self.vocab.strings[tag]
|
attributes[TAG] = tag
|
||||||
attributes[LEMMA] = self.vocab.strings[lemma]
|
attributes[LEMMA] = lemma
|
||||||
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
|
attributes[ENT_TYPE] = ent_type
|
||||||
elif not args:
|
elif not args:
|
||||||
# TODO: This code makes little sense overall. We're still
|
|
||||||
# ignoring most of the attributes?
|
|
||||||
if "label" in attributes and 'ent_type' not in attributes:
|
if "label" in attributes and 'ent_type' not in attributes:
|
||||||
if type(attributes["label"]) == int:
|
if type(attributes["label"]) == int:
|
||||||
attributes[ENT_TYPE] = attributes["label"]
|
attributes[ENT_TYPE] = attributes["label"]
|
||||||
else:
|
else:
|
||||||
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
|
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
|
||||||
if 'ent_type' in attributes:
|
if 'ent_type' in attributes:
|
||||||
attributes[ENT_TYPE] = attributes['ent_type']
|
attributes[ENT_TYPE] = attributes['ent_type']
|
||||||
elif args:
|
elif args:
|
||||||
|
@ -699,6 +697,8 @@ cdef class Doc:
|
||||||
"Arguments supplied:\n%s\n"
|
"Arguments supplied:\n%s\n"
|
||||||
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
|
||||||
|
|
||||||
|
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
|
||||||
|
|
||||||
cdef int start = token_by_start(self.c, self.length, start_idx)
|
cdef int start = token_by_start(self.c, self.length, start_idx)
|
||||||
if start == -1:
|
if start == -1:
|
||||||
return None
|
return None
|
||||||
|
@ -708,13 +708,6 @@ cdef class Doc:
|
||||||
# Currently we have the token index, we want the range-end index
|
# Currently we have the token index, we want the range-end index
|
||||||
end += 1
|
end += 1
|
||||||
cdef Span span = self[start:end]
|
cdef Span span = self[start:end]
|
||||||
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
|
|
||||||
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
|
|
||||||
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
|
|
||||||
ent_id = attributes.get('ent_id', span.root.ent_id)
|
|
||||||
if isinstance(ent_id, basestring):
|
|
||||||
ent_id = self.vocab.strings[ent_id]
|
|
||||||
|
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.text_with_ws for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
if span[-1].whitespace_:
|
if span[-1].whitespace_:
|
||||||
|
@ -723,18 +716,11 @@ cdef class Doc:
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
cdef TokenC* token = &self.c[start]
|
cdef TokenC* token = &self.c[start]
|
||||||
token.spacy = self.c[end-1].spacy
|
token.spacy = self.c[end-1].spacy
|
||||||
if tag in self.vocab.morphology.tag_map:
|
for attr_name, attr_value in attributes.items():
|
||||||
self.vocab.morphology.assign_tag(token, tag)
|
if attr_name == TAG:
|
||||||
|
self.vocab.morphology.assign_tag(token, attr_value)
|
||||||
else:
|
else:
|
||||||
token.tag = self.vocab.strings[tag]
|
Token.set_struct_attr(token, attr_name, attr_value)
|
||||||
token.lemma = self.vocab.strings[lemma]
|
|
||||||
if ent_type == 'O':
|
|
||||||
token.ent_iob = 2
|
|
||||||
token.ent_type = 0
|
|
||||||
else:
|
|
||||||
token.ent_iob = 3
|
|
||||||
token.ent_type = self.vocab.strings[ent_type]
|
|
||||||
token.ent_id = ent_id
|
|
||||||
# Begin by setting all the head indices to absolute token positions
|
# Begin by setting all the head indices to absolute token positions
|
||||||
# This is easier to work with for now than the offsets
|
# This is easier to work with for now than the offsets
|
||||||
# Before thinking of something simpler, beware the case where a dependency
|
# Before thinking of something simpler, beware the case where a dependency
|
||||||
|
|
|
@ -21,14 +21,14 @@ from .. import about
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
"""A slice from a Doc object."""
|
"""A slice from a Doc object."""
|
||||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
|
||||||
vector_norm=None):
|
vector_norm=None):
|
||||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||||
|
|
||||||
doc (Doc): The parent document.
|
doc (Doc): The parent document.
|
||||||
start (int): The index of the first token of the span.
|
start (int): The index of the first token of the span.
|
||||||
end (int): The index of the first token after the span.
|
end (int): The index of the first token after the span.
|
||||||
label (int): A label to attach to the Span, e.g. for named entities.
|
label (uint64): A label to attach to the Span, e.g. for named entities.
|
||||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||||
RETURNS (Span): The newly constructed object.
|
RETURNS (Span): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
|
@ -377,7 +377,7 @@ cdef class Span:
|
||||||
property ent_id:
|
property ent_id:
|
||||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||||
|
|
||||||
RETURNS (int): The entity ID.
|
RETURNS (uint64): The entity ID.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
Loading…
Reference in New Issue
Block a user