WIP on stringstore change. 27 failures

This commit is contained in:
Matthew Honnibal 2017-05-28 14:06:40 +02:00
parent fe4a746300
commit 84e66ca6d4
15 changed files with 103 additions and 109 deletions

View File

@ -150,6 +150,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
else:
int_key = IDS[name.upper()]
if strings_map is not None and isinstance(value, basestring):
value = strings_map[value]
value = strings_map.add(value)
inty_attrs[int_key] = value
return inty_attrs

View File

@ -1,13 +1,14 @@
from cymem.cymem cimport Pool
from .structs cimport TokenC
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
cdef struct GoldParseC:
int* tags
int* heads
int* labels
attr_t* labels
int** brackets
Transition* ner

View File

@ -384,7 +384,7 @@ cdef class GoldParse:
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)

View File

@ -35,7 +35,7 @@ cdef class Lexeme:
@staticmethod
cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
buff = <unsigned char*>&lex.flags
end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
for i in range(sizeof(lex_data.data)):
buff[i] = lex_data.data[i]

View File

@ -48,7 +48,7 @@ cdef class Morphology:
self.tag_map[tag_str] = dict(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.rich_tags[i].id = i
self.rich_tags[i].name = self.strings[tag_str]
self.rich_tags[i].name = self.strings.add(tag_str)
self.rich_tags[i].morph = 0
self.rich_tags[i].pos = attrs[POS]
self.reverse_index[self.rich_tags[i].name] = i
@ -59,10 +59,12 @@ cdef class Morphology:
cdef int assign_tag(self, TokenC* token, tag) except -1:
if isinstance(tag, basestring):
tag_id = self.reverse_index[self.strings[tag]]
else:
tag = self.strings.add(tag)
if tag in self.reverse_index:
tag_id = self.reverse_index[tag]
self.assign_tag_id(token, tag_id)
else:
token.tag = tag
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id >= self.n_tags:
@ -73,7 +75,7 @@ cdef class Morphology:
# the statistical model fails.
# Related to Issue #220
if Lexeme.c_check_flag(token.lex, IS_SPACE):
tag_id = self.reverse_index[self.strings['SP']]
tag_id = self.reverse_index[self.strings.add('SP')]
rich_tag = self.rich_tags[tag_id]
analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
if analysis is NULL:
@ -104,7 +106,7 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
tag = self.strings[tag_str]
tag = self.strings.add(tag_str)
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,9 +142,9 @@ cdef class Morphology:
def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
cdef unicode py_string = self.strings[orth]
if self.lemmatizer is None:
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
return self.strings[py_string.lower()]
return self.strings.add(py_string.lower())
cdef set lemma_strings
cdef unicode lemma_string
lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)

View File

@ -23,7 +23,6 @@ cdef struct LexemeC:
float prob
float sentiment
float l2_norm
cdef struct SerializedLexemeC:
@ -48,7 +47,7 @@ cdef struct Entity:
hash_t id
int start
int end
int label
attr_t label
cdef struct TokenC:
@ -56,10 +55,10 @@ cdef struct TokenC:
uint64_t morph
univ_pos_t pos
bint spacy
int tag
attr_t tag
int idx
attr_t lemma
int sense
attr_t sense
int head
attr_t dep
bint sent_start
@ -70,5 +69,5 @@ cdef struct TokenC:
uint32_t r_edge
int ent_iob
int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
hash_t ent_id

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC

View File

@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
return False
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
if gold.labels[child] == -1:
return True
elif label == -1:
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
cdef class Shift:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.push()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
@staticmethod
@ -133,17 +133,17 @@ cdef class Shift:
return push_cost(s, gold, s.B(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class Reduce:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.stack_depth() >= 2
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
if st.has_head(st.S(0)):
st.pop()
else:
@ -151,7 +151,7 @@ cdef class Reduce:
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
@staticmethod
@ -170,23 +170,23 @@ cdef class Reduce:
return cost
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef class LeftArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.B(0), st.S(0), label)
st.pop()
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
@staticmethod
@ -204,23 +204,23 @@ cdef class LeftArc:
return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return not st.B_(0).sent_start
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.add_arc(st.S(0), st.B(0), label)
st.push()
st.fast_forward()
@staticmethod
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
@staticmethod
@ -233,13 +233,13 @@ cdef class RightArc:
return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
@staticmethod
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
cdef class Break:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int i
if not USE_BREAK:
return False
@ -251,12 +251,12 @@ cdef class Break:
return True
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_break(st.B_(0).l_edge)
st.fast_forward()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
@staticmethod
@ -281,7 +281,7 @@ cdef class Break:
return cost + 1
@staticmethod
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 0
cdef int _get_root(int word, const GoldParseC* gold) nogil:
@ -369,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
if label.upper() == 'ROOT':
label = 'ROOT'
gold.c.heads[i] = gold.heads[i]
gold.c.labels[i] = self.strings[label]
gold.c.labels[i] = self.strings.add(label)
return gold
cdef Transition lookup_transition(self, object name) except *:
@ -384,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
label_str = self.strings[label]
if label_str:
return MOVE_NAMES[move] + '-' + label_str
else:
return MOVE_NAMES[move]
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -469,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
label_cost_funcs[RIGHT] = RightArc.label_cost
label_cost_funcs[BREAK] = Break.label_cost
cdef int* labels = gold.c.labels
cdef attr_t* labels = gold.c.labels
cdef int* heads = gold.c.heads
n_gold = 0

View File

@ -1,6 +1,7 @@
from .transition_system cimport TransitionSystem
from .transition_system cimport Transition
from ..gold cimport GoldParseC
from ..typedefs cimport attr_t
cdef class BiluoPushDown(TransitionSystem):

View File

@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
def __get__(self):
return (BEGIN, IN, LAST, UNIT, OUT)
def move_name(self, int move, int label):
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
elif move == MISSING:
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
if label_str.startswith('!'):
label_str = label_str[1:]
move_str = 'x'
label = self.strings[label_str]
label = self.strings.add(label_str)
else:
move_str = name
label = 0
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
else:
raise KeyError(name)
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers
cdef Transition t
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
cdef class Missing:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return False
@staticmethod
cdef int transition(StateC* s, int label) nogil:
cdef int transition(StateC* s, attr_t label) nogil:
pass
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
return 9000
cdef class Begin:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
# Ensure we don't clobber preset entities. If no entity preset,
# ent_iob is 0
cdef int preset_ent_iob = st.B_(0).ent_iob
@ -232,14 +232,14 @@ cdef class Begin:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.set_ent_tag(st.B(0), 3, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
@ -261,7 +261,7 @@ cdef class Begin:
cdef class In:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -277,17 +277,17 @@ cdef class In:
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = IN
cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
if g_act == MISSING:
@ -313,24 +313,24 @@ cdef class In:
cdef class Last:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1:
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.close_ent()
st.set_ent_tag(st.B(0), 1, label)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
move = LAST
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -355,7 +355,7 @@ cdef class Last:
cdef class Unit:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 2:
return False
@ -368,7 +368,7 @@ cdef class Unit:
return label != 0 and not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.open_ent(label)
st.close_ent()
st.set_ent_tag(st.B(0), 3, label)
@ -376,9 +376,9 @@ cdef class Unit:
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING:
return 0
@ -398,7 +398,7 @@ cdef class Unit:
cdef class Out:
@staticmethod
cdef bint is_valid(const StateC* st, int label) nogil:
cdef bint is_valid(const StateC* st, attr_t label) nogil:
cdef int preset_ent_iob = st.B_(0).ent_iob
if preset_ent_iob == 3:
return False
@ -407,15 +407,15 @@ cdef class Out:
return not st.entity_is_open()
@staticmethod
cdef int transition(StateC* st, int label) nogil:
cdef int transition(StateC* st, attr_t label) nogil:
st.set_ent_tag(st.B(0), 2, 0)
st.push()
st.pop()
@staticmethod
cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
cdef int g_act = gold.ner[s.B(0)].move
cdef int g_tag = gold.ner[s.B(0)].label
cdef attr_t g_tag = gold.ner[s.B(0)].label
if g_act == MISSING or g_act == ISNT:
return 0

View File

@ -1,6 +1,7 @@
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from ..typedefs cimport attr_t
from ..structs cimport TokenC
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
cdef struct Transition:
int clas
int move
int label
attr_t label
weight_t score
bint (*is_valid)(const StateC* state, int label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
int (*do)(StateC* state, int label) nogil
bint (*is_valid)(const StateC* state, attr_t label) nogil
weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
int (*do)(StateC* state, attr_t label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
attr_tlabel) nogil
ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
gold, attr_t label) nogil
ctypedef int (*do_func_t)(StateC* state, int label) nogil
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,7 +39,7 @@ cdef class TransitionSystem:
cdef Transition* c
cdef readonly int n_moves
cdef int _size
cdef public int root_label
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
@ -45,7 +48,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except *
cdef Transition init_transition(self, int clas, int move, int label) except *
cdef Transition init_transition(self, int clas, int move, attr_t label) except *
cdef int set_valid(self, int* output, const StateC* st) nogil

View File

@ -99,7 +99,7 @@ cdef class TransitionSystem:
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
cdef Transition init_transition(self, int clas, int move, int label) except *:
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
raise NotImplementedError
def is_valid(self, StateClass stcls, move_name):

View File

@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
])

View File

@ -11,7 +11,6 @@ import struct
import dill
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from libc.math cimport sqrt
from .span cimport Span
@ -21,6 +20,7 @@ from .token cimport Token
from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -494,8 +494,8 @@ cdef class Doc:
cdef np.ndarray[attr_t, ndim=2] output
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.length):
for j, feature in enumerate(attr_ids):
output[i, j] = get_token_attr(&self.c[i], feature)
@ -640,7 +640,7 @@ cdef class Doc:
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
cdef int[:, :] attrs
cdef attr_t[:, :] attrs
cdef int i, start, end, has_space
fields = dill.loads(data)
text, attrs = fields[:2]
@ -679,17 +679,15 @@ cdef class Doc:
if len(args) == 3:
# TODO: Warn deprecation
tag, lemma, ent_type = args
attributes[TAG] = self.vocab.strings[tag]
attributes[LEMMA] = self.vocab.strings[lemma]
attributes[ENT_TYPE] = self.vocab.strings[ent_type]
attributes[TAG] = tag
attributes[LEMMA] = lemma
attributes[ENT_TYPE] = ent_type
elif not args:
# TODO: This code makes little sense overall. We're still
# ignoring most of the attributes?
if "label" in attributes and 'ent_type' not in attributes:
if type(attributes["label"]) == int:
attributes[ENT_TYPE] = attributes["label"]
else:
attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
if 'ent_type' in attributes:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
@ -699,6 +697,8 @@ cdef class Doc:
"Arguments supplied:\n%s\n"
"Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
@ -708,13 +708,6 @@ cdef class Doc:
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = self[start:end]
tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
ent_id = attributes.get('ent_id', span.root.ent_id)
if isinstance(ent_id, basestring):
ent_id = self.vocab.strings[ent_id]
# Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span])
if span[-1].whitespace_:
@ -723,18 +716,11 @@ cdef class Doc:
# House the new merged token where it starts
cdef TokenC* token = &self.c[start]
token.spacy = self.c[end-1].spacy
if tag in self.vocab.morphology.tag_map:
self.vocab.morphology.assign_tag(token, tag)
for attr_name, attr_value in attributes.items():
if attr_name == TAG:
self.vocab.morphology.assign_tag(token, attr_value)
else:
token.tag = self.vocab.strings[tag]
token.lemma = self.vocab.strings[lemma]
if ent_type == 'O':
token.ent_iob = 2
token.ent_type = 0
else:
token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type]
token.ent_id = ent_id
Token.set_struct_attr(token, attr_name, attr_value)
# Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency

View File

@ -21,14 +21,14 @@ from .. import about
cdef class Span:
"""A slice from a Doc object."""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
vector_norm=None):
"""Create a `Span` object from the slice `doc[start : end]`.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
label (uint64): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
@ -377,7 +377,7 @@ cdef class Span:
property ent_id:
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (int): The entity ID.
RETURNS (uint64): The entity ID.
"""
def __get__(self):
return self.root.ent_id