mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	Update train_tagger script
This commit is contained in:
		
							parent
							
								
									a81c5a7abf
								
							
						
					
					
						commit
						01b42c531f
					
				
							
								
								
									
										79
									
								
								examples/training/train_tagger.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								examples/training/train_tagger.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,79 @@ | |||
| """A quick example for training a part-of-speech tagger, without worrying | ||||
| about the tokenization, or other language-specific customizations.""" | ||||
| 
 | ||||
| from __future__ import unicode_literals | ||||
| from __future__ import print_function | ||||
| 
 | ||||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from spacy.vocab import Vocab | ||||
| from spacy.tagger import Tagger | ||||
| from spacy.tokens import Doc | ||||
| import random | ||||
| 
 | ||||
| 
 | ||||
| # You need to define a mapping from your data's part-of-speech tag names to the | ||||
| # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. | ||||
| # See here for the Universal Tag Set: | ||||
| # http://universaldependencies.github.io/docs/u/pos/index.html | ||||
| # You may also specify morphological features for your tags, from the universal | ||||
| # scheme. | ||||
# Each key is a tag name used in the training data; each value gives the
# Universal POS tag ("pos") that the tag corresponds to.
TAG_MAP = {
    'N': {"pos": "NOUN"},
    'V': {"pos": "VERB"},
    'J': {"pos": "ADJ"},
}
| 
 | ||||
| # Usually you'll read this in, of course. Data formats vary. | ||||
| # Ensure your strings are unicode. | ||||
# One (words, tags) pair per sentence; tags[i] labels words[i] and must be a
# key of TAG_MAP.
DATA = [
    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
    (["Eat", "blue", "ham"], ["V", "J", "N"]),
]
|      | ||||
def ensure_dir(path):
    """Create the directory `path` if nothing exists at that location yet.

    Missing parent directories are created as well, so a nested output
    location (e.g. "models/en/tagger") works even when none of its
    ancestors exist. An already existing path is left untouched.

    path (pathlib.Path): The directory to create.
    """
    if not path.exists():
        # parents=True: a bare mkdir() raises when the parent is missing.
        path.mkdir(parents=True)
| 
 | ||||
| 
 | ||||
def main(output_dir=None):
    """Train a toy part-of-speech tagger on DATA and print its predictions.

    A blank tagger is updated on every example in DATA for five passes, then
    applied to one test sentence, printing the text, tag and coarse
    part-of-speech of each token.

    output_dir: Optional directory path. When given, the directory and its
        "pos" and "vocab" subdirectories are created if missing, and the
        trained model and the vocabulary's strings are written into them.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    # Shuffle a private copy: random.shuffle works in place, and reordering
    # the module-level DATA would leak a side effect to any other user of it.
    train_data = list(DATA)
    for _ in range(5):
        for words, tags in train_data:
            # Every token is marked as followed by a space.
            doc = Doc(vocab, orths_and_spaces=zip(words, [True] * len(words)))
            tagger.update(doc, tags)
        random.shuffle(train_data)
    tagger.model.end_training()

    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True]*4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)

    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        # NOTE(review): binary mode is kept from the original; confirm that
        # strings.dump() writes bytes rather than text on Python 3.
        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
            tagger.vocab.strings.dump(file_)
| 
 | ||||
| 
 | ||||
# Sample output, one "text tag pos" line per token as printed by main()
# (presumably captured from one run of this script):
#
# I V VERB
# like V VERB
# blue N NOUN
# eggs N NOUN
if __name__ == '__main__':
    # Hand main to plac only when this file is executed as a script.
    plac.call(main)
		Loading…
	
		Reference in New Issue
	
	Block a user