mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Add assign_untagged method in Morphology
This commit is contained in:
		
							parent
							
								
									2c118ab3a6
								
							
						
					
					
						commit
						d528b6e36d
					
				| 
						 | 
					@ -35,6 +35,8 @@ cdef class Morphology:
 | 
				
			||||||
    cdef RichTagC* rich_tags
 | 
					    cdef RichTagC* rich_tags
 | 
				
			||||||
    cdef PreshMapArray _cache
 | 
					    cdef PreshMapArray _cache
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int assign_untagged(self, TokenC* token) except -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int assign_tag(self, TokenC* token, tag) except -1
 | 
					    cdef int assign_tag(self, TokenC* token, tag) except -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
 | 
					    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -42,7 +42,7 @@ cdef class Morphology:
 | 
				
			||||||
        self.tag_names = tuple(sorted(tag_map.keys()))
 | 
					        self.tag_names = tuple(sorted(tag_map.keys()))
 | 
				
			||||||
        self.reverse_index = {}
 | 
					        self.reverse_index = {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
 | 
					        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
 | 
				
			||||||
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
 | 
					        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
 | 
				
			||||||
            self.tag_map[tag_str] = dict(attrs)
 | 
					            self.tag_map[tag_str] = dict(attrs)
 | 
				
			||||||
            attrs = _normalize_props(attrs)
 | 
					            attrs = _normalize_props(attrs)
 | 
				
			||||||
| 
						 | 
					@ -52,6 +52,10 @@ cdef class Morphology:
 | 
				
			||||||
            self.rich_tags[i].morph = 0
 | 
					            self.rich_tags[i].morph = 0
 | 
				
			||||||
            self.rich_tags[i].pos = attrs[POS]
 | 
					            self.rich_tags[i].pos = attrs[POS]
 | 
				
			||||||
            self.reverse_index[self.rich_tags[i].name] = i
 | 
					            self.reverse_index[self.rich_tags[i].name] = i
 | 
				
			||||||
 | 
					        # Add a 'null' tag, which we can reference when assign morphology to
 | 
				
			||||||
 | 
					        # untagged tokens.
 | 
				
			||||||
 | 
					        self.rich_tags[self.n_tags].id = self.n_tags
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
        self._cache = PreshMapArray(self.n_tags)
 | 
					        self._cache = PreshMapArray(self.n_tags)
 | 
				
			||||||
        self.exc = {}
 | 
					        self.exc = {}
 | 
				
			||||||
        if exc is not None:
 | 
					        if exc is not None:
 | 
				
			||||||
| 
						 | 
					@ -62,6 +66,10 @@ cdef class Morphology:
 | 
				
			||||||
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
 | 
					        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
 | 
				
			||||||
                             self.exc), None, None)
 | 
					                             self.exc), None, None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int assign_untagged(self, TokenC* token) except -1:
 | 
				
			||||||
 | 
					        '''Set morphological attributes on a token without a POS tag.'''
 | 
				
			||||||
 | 
					        token.lemma = self.lemmatize(0, token.lex.orth, {})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int assign_tag(self, TokenC* token, tag) except -1:
 | 
					    cdef int assign_tag(self, TokenC* token, tag) except -1:
 | 
				
			||||||
        if isinstance(tag, basestring):
 | 
					        if isinstance(tag, basestring):
 | 
				
			||||||
            tag = self.strings.add(tag)
 | 
					            tag = self.strings.add(tag)
 | 
				
			||||||
| 
						 | 
					@ -72,7 +80,7 @@ cdef class Morphology:
 | 
				
			||||||
            token.tag = tag
 | 
					            token.tag = tag
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
 | 
					    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
 | 
				
			||||||
        if tag_id >= self.n_tags:
 | 
					        if tag_id > self.n_tags:
 | 
				
			||||||
            raise ValueError("Unknown tag ID: %s" % tag_id)
 | 
					            raise ValueError("Unknown tag ID: %s" % tag_id)
 | 
				
			||||||
        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
 | 
					        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
 | 
				
			||||||
        # is that this is where the specific word and the tag interact. Still,
 | 
					        # is that this is where the specific word and the tag interact. Still,
 | 
				
			||||||
| 
						 | 
					@ -151,8 +159,6 @@ cdef class Morphology:
 | 
				
			||||||
        cdef unicode py_string = self.strings[orth]
 | 
					        cdef unicode py_string = self.strings[orth]
 | 
				
			||||||
        if self.lemmatizer is None:
 | 
					        if self.lemmatizer is None:
 | 
				
			||||||
            return self.strings.add(py_string.lower())
 | 
					            return self.strings.add(py_string.lower())
 | 
				
			||||||
        if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
 | 
					 | 
				
			||||||
            return self.strings.add(py_string.lower())
 | 
					 | 
				
			||||||
        cdef set lemma_strings
 | 
					        cdef set lemma_strings
 | 
				
			||||||
        cdef unicode lemma_string
 | 
					        cdef unicode lemma_string
 | 
				
			||||||
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
 | 
					        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user