WIP on stringstore change. 27 failures

This commit is contained in:
Matthew Honnibal 2017-05-28 14:06:40 +02:00
parent fe4a746300
commit 84e66ca6d4
15 changed files with 103 additions and 109 deletions

View File

@ -150,6 +150,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
else: else:
int_key = IDS[name.upper()] int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring): if strings_map is not None and isinstance(value, basestring):
value = strings_map[value] value = strings_map.add(value)
inty_attrs[int_key] = value inty_attrs[int_key] = value
return inty_attrs return inty_attrs

View File

@ -1,13 +1,14 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .structs cimport TokenC from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition from .syntax.transition_system cimport Transition
cdef struct GoldParseC: cdef struct GoldParseC:
int* tags int* tags
int* heads int* heads
int* labels attr_t* labels
int** brackets int** brackets
Transition* ner Transition* ner

View File

@ -384,7 +384,7 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser # These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int)) self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc) self.words = [None] * len(doc)

View File

@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod @staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm) end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)): for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i] buff[i] = lex_data.data[i]

View File

@ -48,7 +48,7 @@ cdef class Morphology:
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str] self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0 self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS] self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i self.reverse_index[self.rich_tags[i].name] = i
@ -59,10 +59,12 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring): if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]] tag = self.strings.add(tag)
else: if tag in self.reverse_index:
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id) self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags: if tag_id >= self.n_tags:
@ -73,7 +75,7 @@ cdef class Morphology:
# the statistical model fails. # the statistical model fails.
# Related to Issue #220 # Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE): if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']] tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id] rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL: if analysis is NULL:
@ -104,7 +106,7 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception. tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception. orth (unicode): The word-form to key the exception.
""" """
tag = self.strings[tag_str] tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag] tag_id = self.reverse_index[tag]
orth = self.strings[orth_str] orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id] cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,9 +142,9 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth] cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None: if self.lemmatizer is None:
return self.strings[py_string.lower()] return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT): if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()] return self.strings.add(py_string.lower())
cdef set lemma_strings cdef set lemma_strings
cdef unicode lemma_string cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)

View File

@ -23,7 +23,6 @@ cdef struct LexemeC:
float prob float prob
float sentiment float sentiment
float l2_norm
cdef struct SerializedLexemeC: cdef struct SerializedLexemeC:
@ -48,7 +47,7 @@ cdef struct Entity:
hash_t id hash_t id
int start int start
int end int end
int label attr_t label
cdef struct TokenC: cdef struct TokenC:
@ -56,10 +55,10 @@ cdef struct TokenC:
uint64_t morph uint64_t morph
univ_pos_t pos univ_pos_t pos
bint spacy bint spacy
int tag attr_t tag
int idx int idx
attr_t lemma attr_t lemma
int sense attr_t sense
int head int head
attr_t dep attr_t dep
bint sent_start bint sent_start
@ -70,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge uint32_t r_edge
int ent_iob int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id hash_t ent_id

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC from ..gold cimport GoldParseC

View File

@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if gold.labels[child] == -1: if gold.labels[child] == -1:
return True return True
elif label == -1: elif label == -1:
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift: cdef class Shift:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.push() st.push()
st.fast_forward() st.fast_forward()
@staticmethod @staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod @staticmethod
@ -133,17 +133,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0)) return push_cost(s, gold, s.B(0))
@staticmethod @staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0 return 0
cdef class Reduce: cdef class Reduce:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2 return st.stack_depth() >= 2
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)): if st.has_head(st.S(0)):
st.pop() st.pop()
else: else:
@ -151,7 +151,7 @@ cdef class Reduce:
st.fast_forward() st.fast_forward()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod @staticmethod
@ -170,23 +170,23 @@ cdef class Reduce:
return cost return cost
@staticmethod @staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0 return 0
cdef class LeftArc: cdef class LeftArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start return not st.B_(0).sent_start
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label) st.add_arc(st.B(0), st.S(0), label)
st.pop() st.pop()
st.fast_forward() st.fast_forward()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod @staticmethod
@ -204,23 +204,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod @staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc: cdef class RightArc:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start return not st.B_(0).sent_start
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label) st.add_arc(st.S(0), st.B(0), label)
st.push() st.push()
st.fast_forward() st.fast_forward()
@staticmethod @staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod @staticmethod
@ -233,13 +233,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod @staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break: cdef class Break:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i cdef int i
if not USE_BREAK: if not USE_BREAK:
return False return False
@ -251,12 +251,12 @@ cdef class Break:
return True return True
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge) st.set_break(st.B_(0).l_edge)
st.fast_forward() st.fast_forward()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod @staticmethod
@ -281,7 +281,7 @@ cdef class Break:
return cost + 1 return cost + 1
@staticmethod @staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0 return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil: cdef int _get_root(int word, const GoldParseC* gold) nogil:
@ -369,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
if label.upper() == 'ROOT': if label.upper() == 'ROOT':
label = 'ROOT' label = 'ROOT'
gold.c.heads[i] = gold.heads[i] gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label] gold.c.labels[i] = self.strings.add(label)
return gold return gold
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
@ -384,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label: if self.c[i].move == move and self.c[i].label == label:
return self.c[i] return self.c[i]
def move_name(self, int move, int label): def move_name(self, int move, attr_t label):
label_str = self.strings[label] label_str = self.strings[label]
if label_str: if label_str:
return MOVE_NAMES[move] + '-' + label_str return MOVE_NAMES[move] + '-' + label_str
else: else:
return MOVE_NAMES[move] return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers
cdef Transition t cdef Transition t
@ -469,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost label_cost_funcs[BREAK] = Break.label_cost
cdef int* labels = gold.c.labels cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads cdef int* heads = gold.c.heads
n_gold = 0 n_gold = 0

View File

@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem from .transition_system cimport TransitionSystem
from .transition_system cimport Transition from .transition_system cimport Transition
from ..gold cimport GoldParseC from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem): cdef class BiluoPushDown(TransitionSystem):

View File

@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self): def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT) return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label): def move_name(self, int move, attr_t label):
if move == OUT: if move == OUT:
return 'O' return 'O'
elif move == MISSING: elif move == MISSING:
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
if label_str.startswith('!'): if label_str.startswith('!'):
label_str = label_str[1:] label_str = label_str[1:]
move_str = 'x' move_str = 'x'
label = self.strings[label_str] label = self.strings.add(label_str)
else: else:
move_str = name move_str = name
label = 0 label = 0
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
else: else:
raise KeyError(name) raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers
cdef Transition t cdef Transition t
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing: cdef class Missing:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False return False
@staticmethod @staticmethod
cdef int transition(StateC* s, int label) nogil: cdef int transition(StateC* s, attr_t label) nogil:
pass pass
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000 return 9000
cdef class Begin: cdef class Begin:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset, # Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0 # ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
@ -232,14 +232,14 @@ cdef class Begin:
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label) st.set_ent_tag(st.B(0), 3, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef int g_tag = gold.ner[s.B(0)].label
@ -261,7 +261,7 @@ cdef class Begin:
cdef class In: cdef class In:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2: if preset_ent_iob == 2:
return False return False
@ -277,17 +277,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label) st.set_ent_tag(st.B(0), 1, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner) cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING: if g_act == MISSING:
@ -313,24 +313,24 @@ cdef class In:
cdef class Last: cdef class Last:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1: if st.B_(1).ent_iob == 1:
return False return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent() st.close_ent()
st.set_ent_tag(st.B(0), 1, label) st.set_ent_tag(st.B(0), 1, label)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST move = LAST
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
@ -355,7 +355,7 @@ cdef class Last:
cdef class Unit: cdef class Unit:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2: if preset_ent_iob == 2:
return False return False
@ -368,7 +368,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label) st.open_ent(label)
st.close_ent() st.close_ent()
st.set_ent_tag(st.B(0), 3, label) st.set_ent_tag(st.B(0), 3, label)
@ -376,9 +376,9 @@ cdef class Unit:
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING: if g_act == MISSING:
return 0 return 0
@ -398,7 +398,7 @@ cdef class Unit:
cdef class Out: cdef class Out:
@staticmethod @staticmethod
cdef bint is_valid(const StateC* st, int label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3: if preset_ent_iob == 3:
return False return False
@ -407,15 +407,15 @@ cdef class Out:
return not st.entity_is_open() return not st.entity_is_open()
@staticmethod @staticmethod
cdef int transition(StateC* st, int label) nogil: cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0) st.set_ent_tag(st.B(0), 2, 0)
st.push() st.push()
st.pop() st.pop()
@staticmethod @staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT: if g_act == MISSING or g_act == ISNT:
return 0 return 0

View File

@ -1,6 +1,7 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC from ..structs cimport TokenC
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..gold cimport GoldParseC from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition: cdef struct Transition:
int clas int clas
int move int move
int label attr_t label
weight_t score weight_t score
bint (*is_valid)(const StateC* state, int label) nogil bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, int label) nogil int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_t label) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,7 +39,7 @@ cdef class TransitionSystem:
cdef Transition* c cdef Transition* c
cdef readonly int n_moves cdef readonly int n_moves
cdef int _size cdef int _size
cdef public int root_label cdef public attr_t root_label
cdef public freqs cdef public freqs
cdef init_state_t init_beam_state cdef init_state_t init_beam_state
@ -45,7 +48,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except * cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except * cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil cdef int set_valid(self, int* output, const StateC* st) nogil

View File

@ -99,7 +99,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except *: cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError raise NotImplementedError
def is_valid(self, StateClass stcls, move_name): def is_valid(self, StateClass stcls, move_name):

View File

@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ',' assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [ @pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
]) ])

View File

@ -11,7 +11,6 @@ import struct
import dill import dill
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from libc.math cimport sqrt from libc.math cimport sqrt
from .span cimport Span from .span cimport Span
@ -21,6 +20,7 @@ from .token cimport Token
from .printers import parse_tree from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs
from ..attrs cimport attr_id_t from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -494,8 +494,8 @@ cdef class Doc:
cdef np.ndarray[attr_t, ndim=2] output cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python # Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration. # dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(attr_ids): for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature) output[i, j] = get_token_attr(&self.c[i], feature)
@ -640,7 +640,7 @@ cdef class Doc:
""" """
if self.length != 0: if self.length != 0:
raise ValueError("Cannot load into non-empty Doc") raise ValueError("Cannot load into non-empty Doc")
cdef int[:, :] attrs cdef attr_t[:, :] attrs
cdef int i, start, end, has_space cdef int i, start, end, has_space
fields = dill.loads(data) fields = dill.loads(data)
text, attrs = fields[:2] text, attrs = fields[:2]
@ -679,17 +679,15 @@ cdef class Doc:
if len(args) == 3: if len(args) == 3:
# TODO: Warn deprecation # TODO: Warn deprecation
tag, lemma, ent_type = args tag, lemma, ent_type = args
attributes[TAG] = self.vocab.strings[tag] attributes[TAG] = tag
attributes[LEMMA] = self.vocab.strings[lemma] attributes[LEMMA] = lemma
attributes[ENT_TYPE] = self.vocab.strings[ent_type] attributes[ENT_TYPE] = ent_type
elif not args: elif not args:
# TODO: This code makes little sense overall. We're still
# ignoring most of the attributes?
if "label" in attributes and 'ent_type' not in attributes: if "label" in attributes and 'ent_type' not in attributes:
if type(attributes["label"]) == int: if type(attributes["label"]) == int:
attributes[ENT_TYPE] = attributes["label"] attributes[ENT_TYPE] = attributes["label"]
else: else:
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]] attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
if 'ent_type' in attributes: if 'ent_type' in attributes:
attributes[ENT_TYPE] = attributes['ent_type'] attributes[ENT_TYPE] = attributes['ent_type']
elif args: elif args:
@ -699,6 +697,8 @@ cdef class Doc:
"Arguments supplied:\n%s\n" "Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx) cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1: if start == -1:
return None return None
@ -708,13 +708,6 @@ cdef class Doc:
# Currently we have the token index, we want the range-end index # Currently we have the token index, we want the range-end index
end += 1 end += 1
cdef Span span = self[start:end] cdef Span span = self[start:end]
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
ent_id = attributes.get('ent_id', span.root.ent_id)
if isinstance(ent_id, basestring):
ent_id = self.vocab.strings[ent_id]
# Get LexemeC for newly merged token # Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span]) new_orth = ''.join([t.text_with_ws for t in span])
if span[-1].whitespace_: if span[-1].whitespace_:
@ -723,18 +716,11 @@ cdef class Doc:
# House the new merged token where it starts # House the new merged token where it starts
cdef TokenC* token = &self.c[start] cdef TokenC* token = &self.c[start]
token.spacy = self.c[end-1].spacy token.spacy = self.c[end-1].spacy
if tag in self.vocab.morphology.tag_map: for attr_name, attr_value in attributes.items():
self.vocab.morphology.assign_tag(token, tag) if attr_name == TAG:
else: self.vocab.morphology.assign_tag(token, attr_value)
token.tag = self.vocab.strings[tag] else:
token.lemma = self.vocab.strings[lemma] Token.set_struct_attr(token, attr_name, attr_value)
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
token.ent_id = ent_id
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency # Before thinking of something simpler, beware the case where a dependency

View File

@ -21,14 +21,14 @@ from .. import about
cdef class Span: cdef class Span:
"""A slice from a Doc object.""" """A slice from a Doc object."""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None): vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`. """Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document. doc (Doc): The parent document.
start (int): The index of the first token of the span. start (int): The index of the first token of the span.
end (int): The index of the first token after the span. end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities. label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
@ -377,7 +377,7 @@ cdef class Span:
property ent_id: property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`. """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (int): The entity ID. RETURNS (uint64): The entity ID.
""" """
def __get__(self): def __get__(self):
return self.root.ent_id return self.root.ent_id