mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
Pass pos_tags into Tokenizer.from_dir

This commit is contained in:
    parent 6788c86b2f
    commit 2d0e99a096
@@ -31,14 +31,12 @@ cdef class Tokenizer:
         self._load_special_tokenization(rules, pos_tags)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, directory):
-        data_dir = path.join(data_dir, 'tokenizer')
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
+    def from_dir(cls, Vocab vocab, data_dir, pos_tags):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
-        return cls(vocab, tok_rules, prefix_re, suffix_re, infix_re,
-                   pos_tags)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re, pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
@@ -257,7 +255,6 @@ cdef class Tokenizer:
                 else:
                     tokens[i].lemma = 0
                 if 'pos' in props:
-                    # TODO: Clean up this mess...
                     tokens[i].tag = self.vocab.strings[props['pos']]
                     tokens[i].pos = tag_map[props['pos']][0]
                     # These are defaults, which can be over-ridden by the
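The new signature drops the implicit path.join(data_dir, 'tokenizer') and makes the caller supply the tag definitions; it also fixes the old body, which referred to names (data_dir, tok_data_dir, tok_rules) that never matched its `directory` parameter. A minimal sketch of the new call pattern follows — the data layout under data/en, the tag_map.json file name, and the bare Vocab() construction are illustrative assumptions, not part of this commit; only from_dir() and tokens_from_list() appear in the diff above.

    import json
    from os import path

    from spacy.vocab import Vocab          # module paths assumed; not shown in this diff
    from spacy.tokenizer import Tokenizer

    # Hypothetical data layout. After this commit the caller joins the
    # 'tokenizer' subdirectory itself instead of from_dir() doing it.
    data_dir = 'data/en'
    tok_dir = path.join(data_dir, 'tokenizer')

    # Hypothetical tag-definition file: any mapping of tag name to
    # attributes in the shape the Tokenizer expects would do here.
    with open(path.join(data_dir, 'pos', 'tag_map.json')) as file_:
        pos_tags = json.load(file_)

    vocab = Vocab()                        # assumes Vocab() is constructible bare
    tokenizer = Tokenizer.from_dir(vocab, tok_dir, pos_tags)
    tokens = tokenizer.tokens_from_list(['Hello', 'world', '.'])

Moving the 'tokenizer' join out of from_dir() means the method now reads from exactly the directory it is given, which keeps the path logic in one place (the caller) rather than split between the two.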
				