mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	WIP on stringstore change. 27 failures
This commit is contained in:
		
							parent
							
								
									fe4a746300
								
							
						
					
					
						commit
						84e66ca6d4
					
				|  | @ -150,6 +150,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): | ||||||
|         else: |         else: | ||||||
|             int_key = IDS[name.upper()] |             int_key = IDS[name.upper()] | ||||||
|         if strings_map is not None and isinstance(value, basestring): |         if strings_map is not None and isinstance(value, basestring): | ||||||
|             value = strings_map[value] |             value = strings_map.add(value) | ||||||
|         inty_attrs[int_key] = value |         inty_attrs[int_key] = value | ||||||
|     return inty_attrs |     return inty_attrs | ||||||
|  |  | ||||||
|  | @ -1,13 +1,14 @@ | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| 
 | 
 | ||||||
| from .structs cimport TokenC | from .structs cimport TokenC | ||||||
|  | from .typedefs cimport attr_t | ||||||
| from .syntax.transition_system cimport Transition | from .syntax.transition_system cimport Transition | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef struct GoldParseC: | cdef struct GoldParseC: | ||||||
|     int* tags |     int* tags | ||||||
|     int* heads |     int* heads | ||||||
|     int* labels |     attr_t* labels | ||||||
|     int** brackets |     int** brackets | ||||||
|     Transition* ner |     Transition* ner | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -384,7 +384,7 @@ cdef class GoldParse: | ||||||
|         # These are filled by the tagger/parser/entity recogniser |         # These are filled by the tagger/parser/entity recogniser | ||||||
|         self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) |         self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int)) | ||||||
|         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) |         self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int)) | ||||||
|         self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int)) |         self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t)) | ||||||
|         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) |         self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition)) | ||||||
| 
 | 
 | ||||||
|         self.words = [None] * len(doc) |         self.words = [None] * len(doc) | ||||||
|  |  | ||||||
|  | @ -35,7 +35,7 @@ cdef class Lexeme: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: |     cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: | ||||||
|         buff = <unsigned char*>&lex.flags |         buff = <unsigned char*>&lex.flags | ||||||
|         end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm) |         end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment) | ||||||
|         for i in range(sizeof(lex_data.data)): |         for i in range(sizeof(lex_data.data)): | ||||||
|             buff[i] = lex_data.data[i] |             buff[i] = lex_data.data[i] | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -48,7 +48,7 @@ cdef class Morphology: | ||||||
|             self.tag_map[tag_str] = dict(attrs) |             self.tag_map[tag_str] = dict(attrs) | ||||||
|             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) |             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) | ||||||
|             self.rich_tags[i].id = i |             self.rich_tags[i].id = i | ||||||
|             self.rich_tags[i].name = self.strings[tag_str] |             self.rich_tags[i].name = self.strings.add(tag_str) | ||||||
|             self.rich_tags[i].morph = 0 |             self.rich_tags[i].morph = 0 | ||||||
|             self.rich_tags[i].pos = attrs[POS] |             self.rich_tags[i].pos = attrs[POS] | ||||||
|             self.reverse_index[self.rich_tags[i].name] = i |             self.reverse_index[self.rich_tags[i].name] = i | ||||||
|  | @ -59,10 +59,12 @@ cdef class Morphology: | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag(self, TokenC* token, tag) except -1: |     cdef int assign_tag(self, TokenC* token, tag) except -1: | ||||||
|         if isinstance(tag, basestring): |         if isinstance(tag, basestring): | ||||||
|             tag_id = self.reverse_index[self.strings[tag]] |             tag = self.strings.add(tag) | ||||||
|         else: |         if tag in self.reverse_index: | ||||||
|             tag_id = self.reverse_index[tag] |             tag_id = self.reverse_index[tag] | ||||||
|         self.assign_tag_id(token, tag_id) |             self.assign_tag_id(token, tag_id) | ||||||
|  |         else: | ||||||
|  |             token.tag = tag | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: |     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||||
|         if tag_id >= self.n_tags: |         if tag_id >= self.n_tags: | ||||||
|  | @ -73,7 +75,7 @@ cdef class Morphology: | ||||||
|         # the statistical model fails. |         # the statistical model fails. | ||||||
|         # Related to Issue #220 |         # Related to Issue #220 | ||||||
|         if Lexeme.c_check_flag(token.lex, IS_SPACE): |         if Lexeme.c_check_flag(token.lex, IS_SPACE): | ||||||
|             tag_id = self.reverse_index[self.strings['SP']] |             tag_id = self.reverse_index[self.strings.add('SP')] | ||||||
|         rich_tag = self.rich_tags[tag_id] |         rich_tag = self.rich_tags[tag_id] | ||||||
|         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) |         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth) | ||||||
|         if analysis is NULL: |         if analysis is NULL: | ||||||
|  | @ -104,7 +106,7 @@ cdef class Morphology: | ||||||
|             tag (unicode): The part-of-speech tag to key the exception. |             tag (unicode): The part-of-speech tag to key the exception. | ||||||
|             orth (unicode): The word-form to key the exception. |             orth (unicode): The word-form to key the exception. | ||||||
|         """ |         """ | ||||||
|         tag = self.strings[tag_str] |         tag = self.strings.add(tag_str) | ||||||
|         tag_id = self.reverse_index[tag] |         tag_id = self.reverse_index[tag] | ||||||
|         orth = self.strings[orth_str] |         orth = self.strings[orth_str] | ||||||
|         cdef RichTagC rich_tag = self.rich_tags[tag_id] |         cdef RichTagC rich_tag = self.rich_tags[tag_id] | ||||||
|  | @ -140,9 +142,9 @@ cdef class Morphology: | ||||||
|     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): |     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): | ||||||
|         cdef unicode py_string = self.strings[orth] |         cdef unicode py_string = self.strings[orth] | ||||||
|         if self.lemmatizer is None: |         if self.lemmatizer is None: | ||||||
|             return self.strings[py_string.lower()] |             return self.strings.add(py_string.lower()) | ||||||
|         if univ_pos not in (NOUN, VERB, ADJ, PUNCT): |         if univ_pos not in (NOUN, VERB, ADJ, PUNCT): | ||||||
|             return self.strings[py_string.lower()] |             return self.strings.add(py_string.lower()) | ||||||
|         cdef set lemma_strings |         cdef set lemma_strings | ||||||
|         cdef unicode lemma_string |         cdef unicode lemma_string | ||||||
|         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) |         lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) | ||||||
|  |  | ||||||
|  | @ -23,7 +23,6 @@ cdef struct LexemeC: | ||||||
| 
 | 
 | ||||||
|     float prob |     float prob | ||||||
|     float sentiment |     float sentiment | ||||||
|     float l2_norm |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef struct SerializedLexemeC: | cdef struct SerializedLexemeC: | ||||||
|  | @ -48,7 +47,7 @@ cdef struct Entity: | ||||||
|     hash_t id |     hash_t id | ||||||
|     int start |     int start | ||||||
|     int end |     int end | ||||||
|     int label |     attr_t label | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef struct TokenC: | cdef struct TokenC: | ||||||
|  | @ -56,10 +55,10 @@ cdef struct TokenC: | ||||||
|     uint64_t morph |     uint64_t morph | ||||||
|     univ_pos_t pos |     univ_pos_t pos | ||||||
|     bint spacy |     bint spacy | ||||||
|     int tag |     attr_t tag | ||||||
|     int idx |     int idx | ||||||
|     attr_t lemma |     attr_t lemma | ||||||
|     int sense |     attr_t sense | ||||||
|     int head |     int head | ||||||
|     attr_t dep |     attr_t dep | ||||||
|     bint sent_start |     bint sent_start | ||||||
|  | @ -70,5 +69,5 @@ cdef struct TokenC: | ||||||
|     uint32_t r_edge |     uint32_t r_edge | ||||||
| 
 | 
 | ||||||
|     int ent_iob |     int ent_iob | ||||||
|     int ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. |     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. | ||||||
|     hash_t ent_id |     hash_t ent_id | ||||||
|  |  | ||||||
|  | @ -3,6 +3,7 @@ from cymem.cymem cimport Pool | ||||||
| from thinc.typedefs cimport weight_t | from thinc.typedefs cimport weight_t | ||||||
| 
 | 
 | ||||||
| from .stateclass cimport StateClass | from .stateclass cimport StateClass | ||||||
|  | from ..typedefs cimport attr_t | ||||||
| 
 | 
 | ||||||
| from .transition_system cimport TransitionSystem, Transition | from .transition_system cimport TransitionSystem, Transition | ||||||
| from ..gold cimport GoldParseC | from ..gold cimport GoldParseC | ||||||
|  |  | ||||||
|  | @ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil: | ||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil: | cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil: | ||||||
|     if gold.labels[child] == -1: |     if gold.labels[child] == -1: | ||||||
|         return True |         return True | ||||||
|     elif label == -1: |     elif label == -1: | ||||||
|  | @ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: | ||||||
| 
 | 
 | ||||||
| cdef class Shift: | cdef class Shift: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start |         return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.push() |         st.push() | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) |         return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | @ -133,17 +133,17 @@ cdef class Shift: | ||||||
|         return push_cost(s, gold, s.B(0)) |         return push_cost(s, gold, s.B(0)) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return 0 |         return 0 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Reduce: | cdef class Reduce: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         return st.stack_depth() >= 2 |         return st.stack_depth() >= 2 | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         if st.has_head(st.S(0)): |         if st.has_head(st.S(0)): | ||||||
|             st.pop() |             st.pop() | ||||||
|         else: |         else: | ||||||
|  | @ -151,7 +151,7 @@ cdef class Reduce: | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) |         return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | @ -170,23 +170,23 @@ cdef class Reduce: | ||||||
|         return cost |         return cost | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return 0 |         return 0 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class LeftArc: | cdef class LeftArc: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         return not st.B_(0).sent_start |         return not st.B_(0).sent_start | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.add_arc(st.B(0), st.S(0), label) |         st.add_arc(st.B(0), st.S(0), label) | ||||||
|         st.pop() |         st.pop() | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) |         return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | @ -204,23 +204,23 @@ cdef class LeftArc: | ||||||
|             return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) |             return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0)) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) |         return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class RightArc: | cdef class RightArc: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         return not st.B_(0).sent_start |         return not st.B_(0).sent_start | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.add_arc(st.S(0), st.B(0), label) |         st.add_arc(st.S(0), st.B(0), label) | ||||||
|         st.push() |         st.push() | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) |         return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | @ -233,13 +233,13 @@ cdef class RightArc: | ||||||
|             return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) |             return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0)) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) |         return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Break: | cdef class Break: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         cdef int i |         cdef int i | ||||||
|         if not USE_BREAK: |         if not USE_BREAK: | ||||||
|             return False |             return False | ||||||
|  | @ -251,12 +251,12 @@ cdef class Break: | ||||||
|             return True |             return True | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.set_break(st.B_(0).l_edge) |         st.set_break(st.B_(0).l_edge) | ||||||
|         st.fast_forward() |         st.fast_forward() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) |         return Break.move_cost(s, gold) + Break.label_cost(s, gold, label) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|  | @ -281,7 +281,7 @@ cdef class Break: | ||||||
|             return cost + 1 |             return cost + 1 | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return 0 |         return 0 | ||||||
| 
 | 
 | ||||||
| cdef int _get_root(int word, const GoldParseC* gold) nogil: | cdef int _get_root(int word, const GoldParseC* gold) nogil: | ||||||
|  | @ -369,7 +369,7 @@ cdef class ArcEager(TransitionSystem): | ||||||
|                 if label.upper() == 'ROOT': |                 if label.upper() == 'ROOT': | ||||||
|                     label = 'ROOT' |                     label = 'ROOT' | ||||||
|                 gold.c.heads[i] = gold.heads[i] |                 gold.c.heads[i] = gold.heads[i] | ||||||
|                 gold.c.labels[i] = self.strings[label] |                 gold.c.labels[i] = self.strings.add(label) | ||||||
|         return gold |         return gold | ||||||
| 
 | 
 | ||||||
|     cdef Transition lookup_transition(self, object name) except *: |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|  | @ -384,14 +384,14 @@ cdef class ArcEager(TransitionSystem): | ||||||
|             if self.c[i].move == move and self.c[i].label == label: |             if self.c[i].move == move and self.c[i].label == label: | ||||||
|                 return self.c[i] |                 return self.c[i] | ||||||
| 
 | 
 | ||||||
|     def move_name(self, int move, int label): |     def move_name(self, int move, attr_t label): | ||||||
|         label_str = self.strings[label] |         label_str = self.strings[label] | ||||||
|         if label_str: |         if label_str: | ||||||
|             return MOVE_NAMES[move] + '-' + label_str |             return MOVE_NAMES[move] + '-' + label_str | ||||||
|         else: |         else: | ||||||
|             return MOVE_NAMES[move] |             return MOVE_NAMES[move] | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|         # constructor with the function pointers |         # constructor with the function pointers | ||||||
|         cdef Transition t |         cdef Transition t | ||||||
|  | @ -469,7 +469,7 @@ cdef class ArcEager(TransitionSystem): | ||||||
|         label_cost_funcs[RIGHT] = RightArc.label_cost |         label_cost_funcs[RIGHT] = RightArc.label_cost | ||||||
|         label_cost_funcs[BREAK] = Break.label_cost |         label_cost_funcs[BREAK] = Break.label_cost | ||||||
| 
 | 
 | ||||||
|         cdef int* labels = gold.c.labels |         cdef attr_t* labels = gold.c.labels | ||||||
|         cdef int* heads = gold.c.heads |         cdef int* heads = gold.c.heads | ||||||
| 
 | 
 | ||||||
|         n_gold = 0 |         n_gold = 0 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| from .transition_system cimport TransitionSystem | from .transition_system cimport TransitionSystem | ||||||
| from .transition_system cimport Transition | from .transition_system cimport Transition | ||||||
| from ..gold cimport GoldParseC | from ..gold cimport GoldParseC | ||||||
|  | from ..typedefs cimport attr_t | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class BiluoPushDown(TransitionSystem): | cdef class BiluoPushDown(TransitionSystem): | ||||||
|  |  | ||||||
|  | @ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return (BEGIN, IN, LAST, UNIT, OUT) |             return (BEGIN, IN, LAST, UNIT, OUT) | ||||||
| 
 | 
 | ||||||
|     def move_name(self, int move, int label): |     def move_name(self, int move, attr_t label): | ||||||
|         if move == OUT: |         if move == OUT: | ||||||
|             return 'O' |             return 'O' | ||||||
|         elif move == MISSING: |         elif move == MISSING: | ||||||
|  | @ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             if label_str.startswith('!'): |             if label_str.startswith('!'): | ||||||
|                 label_str = label_str[1:] |                 label_str = label_str[1:] | ||||||
|                 move_str = 'x' |                 move_str = 'x' | ||||||
|             label = self.strings[label_str] |             label = self.strings.add(label_str) | ||||||
|         else: |         else: | ||||||
|             move_str = name |             move_str = name | ||||||
|             label = 0 |             label = 0 | ||||||
|  | @ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|         else: |         else: | ||||||
|             raise KeyError(name) |             raise KeyError(name) | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|         # constructor with the function pointers |         # constructor with the function pointers | ||||||
|         cdef Transition t |         cdef Transition t | ||||||
|  | @ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
| 
 | 
 | ||||||
| cdef class Missing: | cdef class Missing: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         return False |         return False | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* s, int label) nogil: |     cdef int transition(StateC* s, attr_t label) nogil: | ||||||
|         pass |         pass | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         return 9000 |         return 9000 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Begin: | cdef class Begin: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         # Ensure we don't clobber preset entities. If no entity preset, |         # Ensure we don't clobber preset entities. If no entity preset, | ||||||
|         # ent_iob is 0 |         # ent_iob is 0 | ||||||
|         cdef int preset_ent_iob = st.B_(0).ent_iob |         cdef int preset_ent_iob = st.B_(0).ent_iob | ||||||
|  | @ -232,14 +232,14 @@ cdef class Begin: | ||||||
|             return label != 0 and not st.entity_is_open() |             return label != 0 and not st.entity_is_open() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.open_ent(label) |         st.open_ent(label) | ||||||
|         st.set_ent_tag(st.B(0), 3, label) |         st.set_ent_tag(st.B(0), 3, label) | ||||||
|         st.push() |         st.push() | ||||||
|         st.pop() |         st.pop() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         cdef int g_act = gold.ner[s.B(0)].move |         cdef int g_act = gold.ner[s.B(0)].move | ||||||
|         cdef int g_tag = gold.ner[s.B(0)].label |         cdef int g_tag = gold.ner[s.B(0)].label | ||||||
| 
 | 
 | ||||||
|  | @ -261,7 +261,7 @@ cdef class Begin: | ||||||
| 
 | 
 | ||||||
| cdef class In: | cdef class In: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         cdef int preset_ent_iob = st.B_(0).ent_iob |         cdef int preset_ent_iob = st.B_(0).ent_iob | ||||||
|         if preset_ent_iob == 2: |         if preset_ent_iob == 2: | ||||||
|             return False |             return False | ||||||
|  | @ -277,17 +277,17 @@ cdef class In: | ||||||
|         return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label |         return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.set_ent_tag(st.B(0), 1, label) |         st.set_ent_tag(st.B(0), 1, label) | ||||||
|         st.push() |         st.push() | ||||||
|         st.pop() |         st.pop() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         move = IN |         move = IN | ||||||
|         cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT |         cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT | ||||||
|         cdef int g_act = gold.ner[s.B(0)].move |         cdef int g_act = gold.ner[s.B(0)].move | ||||||
|         cdef int g_tag = gold.ner[s.B(0)].label |         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||||
|         cdef bint is_sunk = _entity_is_sunk(s, gold.ner) |         cdef bint is_sunk = _entity_is_sunk(s, gold.ner) | ||||||
| 
 | 
 | ||||||
|         if g_act == MISSING: |         if g_act == MISSING: | ||||||
|  | @ -313,24 +313,24 @@ cdef class In: | ||||||
| 
 | 
 | ||||||
| cdef class Last: | cdef class Last: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         if st.B_(1).ent_iob == 1: |         if st.B_(1).ent_iob == 1: | ||||||
|             return False |             return False | ||||||
|         return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label |         return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.close_ent() |         st.close_ent() | ||||||
|         st.set_ent_tag(st.B(0), 1, label) |         st.set_ent_tag(st.B(0), 1, label) | ||||||
|         st.push() |         st.push() | ||||||
|         st.pop() |         st.pop() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         move = LAST |         move = LAST | ||||||
| 
 | 
 | ||||||
|         cdef int g_act = gold.ner[s.B(0)].move |         cdef int g_act = gold.ner[s.B(0)].move | ||||||
|         cdef int g_tag = gold.ner[s.B(0)].label |         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||||
| 
 | 
 | ||||||
|         if g_act == MISSING: |         if g_act == MISSING: | ||||||
|             return 0 |             return 0 | ||||||
|  | @ -355,7 +355,7 @@ cdef class Last: | ||||||
| 
 | 
 | ||||||
| cdef class Unit: | cdef class Unit: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         cdef int preset_ent_iob = st.B_(0).ent_iob |         cdef int preset_ent_iob = st.B_(0).ent_iob | ||||||
|         if preset_ent_iob == 2: |         if preset_ent_iob == 2: | ||||||
|             return False |             return False | ||||||
|  | @ -368,7 +368,7 @@ cdef class Unit: | ||||||
|         return label != 0 and not st.entity_is_open() |         return label != 0 and not st.entity_is_open() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.open_ent(label) |         st.open_ent(label) | ||||||
|         st.close_ent() |         st.close_ent() | ||||||
|         st.set_ent_tag(st.B(0), 3, label) |         st.set_ent_tag(st.B(0), 3, label) | ||||||
|  | @ -376,9 +376,9 @@ cdef class Unit: | ||||||
|         st.pop() |         st.pop() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         cdef int g_act = gold.ner[s.B(0)].move |         cdef int g_act = gold.ner[s.B(0)].move | ||||||
|         cdef int g_tag = gold.ner[s.B(0)].label |         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||||
| 
 | 
 | ||||||
|         if g_act == MISSING: |         if g_act == MISSING: | ||||||
|             return 0 |             return 0 | ||||||
|  | @ -398,7 +398,7 @@ cdef class Unit: | ||||||
| 
 | 
 | ||||||
| cdef class Out: | cdef class Out: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef bint is_valid(const StateC* st, int label) nogil: |     cdef bint is_valid(const StateC* st, attr_t label) nogil: | ||||||
|         cdef int preset_ent_iob = st.B_(0).ent_iob |         cdef int preset_ent_iob = st.B_(0).ent_iob | ||||||
|         if preset_ent_iob == 3: |         if preset_ent_iob == 3: | ||||||
|             return False |             return False | ||||||
|  | @ -407,15 +407,15 @@ cdef class Out: | ||||||
|         return not st.entity_is_open() |         return not st.entity_is_open() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef int transition(StateC* st, int label) nogil: |     cdef int transition(StateC* st, attr_t label) nogil: | ||||||
|         st.set_ent_tag(st.B(0), 2, 0) |         st.set_ent_tag(st.B(0), 2, 0) | ||||||
|         st.push() |         st.push() | ||||||
|         st.pop() |         st.pop() | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil: |     cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil: | ||||||
|         cdef int g_act = gold.ner[s.B(0)].move |         cdef int g_act = gold.ner[s.B(0)].move | ||||||
|         cdef int g_tag = gold.ner[s.B(0)].label |         cdef attr_t g_tag = gold.ner[s.B(0)].label | ||||||
| 
 | 
 | ||||||
|         if g_act == MISSING or g_act == ISNT: |         if g_act == MISSING or g_act == ISNT: | ||||||
|             return 0 |             return 0 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| from thinc.typedefs cimport weight_t | from thinc.typedefs cimport weight_t | ||||||
| 
 | 
 | ||||||
|  | from ..typedefs cimport attr_t | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
| from ..gold cimport GoldParse | from ..gold cimport GoldParse | ||||||
| from ..gold cimport GoldParseC | from ..gold cimport GoldParseC | ||||||
|  | @ -13,20 +14,22 @@ from ._state cimport StateC | ||||||
| cdef struct Transition: | cdef struct Transition: | ||||||
|     int clas |     int clas | ||||||
|     int move |     int move | ||||||
|     int label |     attr_t label | ||||||
| 
 | 
 | ||||||
|     weight_t score |     weight_t score | ||||||
| 
 | 
 | ||||||
|     bint (*is_valid)(const StateC* state, int label) nogil |     bint (*is_valid)(const StateC* state, attr_t label) nogil | ||||||
|     weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil |     weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil | ||||||
|     int (*do)(StateC* state, int label) nogil |     int (*do)(StateC* state, attr_t label) nogil | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil | ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, | ||||||
|  |         attr_t label) nogil | ||||||
| ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil | ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil | ||||||
| ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil | ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* | ||||||
|  |         gold, attr_t label) nogil | ||||||
| 
 | 
 | ||||||
| ctypedef int (*do_func_t)(StateC* state, int label) nogil | ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil | ||||||
| 
 | 
 | ||||||
| ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL | ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL | ||||||
| 
 | 
 | ||||||
|  | @ -36,7 +39,7 @@ cdef class TransitionSystem: | ||||||
|     cdef Transition* c |     cdef Transition* c | ||||||
|     cdef readonly int n_moves |     cdef readonly int n_moves | ||||||
|     cdef int _size |     cdef int _size | ||||||
|     cdef public int root_label |     cdef public attr_t root_label | ||||||
|     cdef public freqs |     cdef public freqs | ||||||
|     cdef init_state_t init_beam_state |     cdef init_state_t init_beam_state | ||||||
| 
 | 
 | ||||||
|  | @ -45,7 +48,7 @@ cdef class TransitionSystem: | ||||||
| 
 | 
 | ||||||
|     cdef Transition lookup_transition(self, object name) except * |     cdef Transition lookup_transition(self, object name) except * | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except * |     cdef Transition init_transition(self, int clas, int move, attr_t label) except * | ||||||
| 
 | 
 | ||||||
|     cdef int set_valid(self, int* output, const StateC* st) nogil |     cdef int set_valid(self, int* output, const StateC* st) nogil | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -99,7 +99,7 @@ cdef class TransitionSystem: | ||||||
|     cdef Transition lookup_transition(self, object name) except *: |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|     def is_valid(self, StateClass stcls, move_name): |     def is_valid(self, StateClass stcls, move_name): | ||||||
|  |  | ||||||
|  | @ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer): | ||||||
|     assert doc[6].right_edge.text == ',' |     assert doc[6].right_edge.text == ',' | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.xfail | ||||||
| @pytest.mark.parametrize('text,vectors', [ | @pytest.mark.parametrize('text,vectors', [ | ||||||
|     ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) |     ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) | ||||||
| ]) | ]) | ||||||
|  |  | ||||||
|  | @ -11,7 +11,6 @@ import struct | ||||||
| import dill | import dill | ||||||
| 
 | 
 | ||||||
| from libc.string cimport memcpy, memset | from libc.string cimport memcpy, memset | ||||||
| from libc.stdint cimport uint32_t |  | ||||||
| from libc.math cimport sqrt | from libc.math cimport sqrt | ||||||
| 
 | 
 | ||||||
| from .span cimport Span | from .span cimport Span | ||||||
|  | @ -21,6 +20,7 @@ from .token cimport Token | ||||||
| from .printers import parse_tree | from .printers import parse_tree | ||||||
| from ..lexeme cimport Lexeme, EMPTY_LEXEME | from ..lexeme cimport Lexeme, EMPTY_LEXEME | ||||||
| from ..typedefs cimport attr_t, flags_t | from ..typedefs cimport attr_t, flags_t | ||||||
|  | from ..attrs import intify_attrs | ||||||
| from ..attrs cimport attr_id_t | from ..attrs cimport attr_id_t | ||||||
| from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER | ||||||
| from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE | from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE | ||||||
|  | @ -494,8 +494,8 @@ cdef class Doc: | ||||||
|         cdef np.ndarray[attr_t, ndim=2] output |         cdef np.ndarray[attr_t, ndim=2] output | ||||||
|         # Make an array from the attributes --- otherwise our inner loop is Python |         # Make an array from the attributes --- otherwise our inner loop is Python | ||||||
|         # dict iteration. |         # dict iteration. | ||||||
|         cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32) |         cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) | ||||||
|         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32) |         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) | ||||||
|         for i in range(self.length): |         for i in range(self.length): | ||||||
|             for j, feature in enumerate(attr_ids): |             for j, feature in enumerate(attr_ids): | ||||||
|                 output[i, j] = get_token_attr(&self.c[i], feature) |                 output[i, j] = get_token_attr(&self.c[i], feature) | ||||||
|  | @ -640,7 +640,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         if self.length != 0: |         if self.length != 0: | ||||||
|             raise ValueError("Cannot load into non-empty Doc") |             raise ValueError("Cannot load into non-empty Doc") | ||||||
|         cdef int[:, :] attrs |         cdef attr_t[:, :] attrs | ||||||
|         cdef int i, start, end, has_space |         cdef int i, start, end, has_space | ||||||
|         fields = dill.loads(data) |         fields = dill.loads(data) | ||||||
|         text, attrs = fields[:2] |         text, attrs = fields[:2] | ||||||
|  | @ -679,17 +679,15 @@ cdef class Doc: | ||||||
|         if len(args) == 3: |         if len(args) == 3: | ||||||
|             # TODO: Warn deprecation |             # TODO: Warn deprecation | ||||||
|             tag, lemma, ent_type = args |             tag, lemma, ent_type = args | ||||||
|             attributes[TAG] = self.vocab.strings[tag] |             attributes[TAG] = tag | ||||||
|             attributes[LEMMA] = self.vocab.strings[lemma] |             attributes[LEMMA] = lemma | ||||||
|             attributes[ENT_TYPE] = self.vocab.strings[ent_type] |             attributes[ENT_TYPE] = ent_type | ||||||
|         elif not args: |         elif not args: | ||||||
|             # TODO: This code makes little sense overall. We're still |  | ||||||
|             # ignoring most of the attributes? |  | ||||||
|             if "label" in attributes and 'ent_type' not in attributes: |             if "label" in attributes and 'ent_type' not in attributes: | ||||||
|                 if type(attributes["label"]) == int: |                 if type(attributes["label"]) == int: | ||||||
|                     attributes[ENT_TYPE] = attributes["label"] |                     attributes[ENT_TYPE] = attributes["label"] | ||||||
|                 else: |                 else: | ||||||
|                     attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]] |                     attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"]) | ||||||
|             if 'ent_type' in attributes: |             if 'ent_type' in attributes: | ||||||
|                 attributes[ENT_TYPE] = attributes['ent_type'] |                 attributes[ENT_TYPE] = attributes['ent_type'] | ||||||
|         elif args: |         elif args: | ||||||
|  | @ -699,6 +697,8 @@ cdef class Doc: | ||||||
|                 "Arguments supplied:\n%s\n" |                 "Arguments supplied:\n%s\n" | ||||||
|                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) |                 "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes))) | ||||||
| 
 | 
 | ||||||
|  |         attributes = intify_attrs(attributes, strings_map=self.vocab.strings) | ||||||
|  | 
 | ||||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) |         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||||
|         if start == -1: |         if start == -1: | ||||||
|             return None |             return None | ||||||
|  | @ -708,13 +708,6 @@ cdef class Doc: | ||||||
|         # Currently we have the token index, we want the range-end index |         # Currently we have the token index, we want the range-end index | ||||||
|         end += 1 |         end += 1 | ||||||
|         cdef Span span = self[start:end] |         cdef Span span = self[start:end] | ||||||
|         tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] |  | ||||||
|         lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] |  | ||||||
|         ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] |  | ||||||
|         ent_id = attributes.get('ent_id', span.root.ent_id) |  | ||||||
|         if isinstance(ent_id, basestring): |  | ||||||
|             ent_id = self.vocab.strings[ent_id] |  | ||||||
| 
 |  | ||||||
|         # Get LexemeC for newly merged token |         # Get LexemeC for newly merged token | ||||||
|         new_orth = ''.join([t.text_with_ws for t in span]) |         new_orth = ''.join([t.text_with_ws for t in span]) | ||||||
|         if span[-1].whitespace_: |         if span[-1].whitespace_: | ||||||
|  | @ -723,18 +716,11 @@ cdef class Doc: | ||||||
|         # House the new merged token where it starts |         # House the new merged token where it starts | ||||||
|         cdef TokenC* token = &self.c[start] |         cdef TokenC* token = &self.c[start] | ||||||
|         token.spacy = self.c[end-1].spacy |         token.spacy = self.c[end-1].spacy | ||||||
|         if tag in self.vocab.morphology.tag_map: |         for attr_name, attr_value in attributes.items(): | ||||||
|             self.vocab.morphology.assign_tag(token, tag) |             if attr_name == TAG: | ||||||
|         else: |                 self.vocab.morphology.assign_tag(token, attr_value)  | ||||||
|             token.tag = self.vocab.strings[tag] |             else: | ||||||
|         token.lemma = self.vocab.strings[lemma] |                 Token.set_struct_attr(token, attr_name, attr_value) | ||||||
|         if ent_type == 'O': |  | ||||||
|             token.ent_iob = 2 |  | ||||||
|             token.ent_type = 0 |  | ||||||
|         else: |  | ||||||
|             token.ent_iob = 3 |  | ||||||
|             token.ent_type = self.vocab.strings[ent_type] |  | ||||||
|         token.ent_id = ent_id |  | ||||||
|         # Begin by setting all the head indices to absolute token positions |         # Begin by setting all the head indices to absolute token positions | ||||||
|         # This is easier to work with for now than the offsets |         # This is easier to work with for now than the offsets | ||||||
|         # Before thinking of something simpler, beware the case where a dependency |         # Before thinking of something simpler, beware the case where a dependency | ||||||
|  |  | ||||||
|  | @ -21,14 +21,14 @@ from .. import about | ||||||
| 
 | 
 | ||||||
| cdef class Span: | cdef class Span: | ||||||
|     """A slice from a Doc object.""" |     """A slice from a Doc object.""" | ||||||
|     def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, |     def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None, | ||||||
|                   vector_norm=None): |                   vector_norm=None): | ||||||
|         """Create a `Span` object from the slice `doc[start : end]`. |         """Create a `Span` object from the slice `doc[start : end]`. | ||||||
| 
 | 
 | ||||||
|         doc (Doc): The parent document. |         doc (Doc): The parent document. | ||||||
|         start (int): The index of the first token of the span. |         start (int): The index of the first token of the span. | ||||||
|         end (int): The index of the first token after the span. |         end (int): The index of the first token after the span. | ||||||
|         label (int): A label to attach to the Span, e.g. for named entities. |         label (uint64): A label to attach to the Span, e.g. for named entities. | ||||||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. |         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. | ||||||
|         RETURNS (Span): The newly constructed object. |         RETURNS (Span): The newly constructed object. | ||||||
|         """ |         """ | ||||||
|  | @ -377,7 +377,7 @@ cdef class Span: | ||||||
|     property ent_id: |     property ent_id: | ||||||
|         """An (integer) entity ID. Usually assigned by patterns in the `Matcher`. |         """An (integer) entity ID. Usually assigned by patterns in the `Matcher`. | ||||||
| 
 | 
 | ||||||
|         RETURNS (int): The entity ID. |         RETURNS (uint64): The entity ID. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return self.root.ent_id |             return self.root.ent_id | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user