mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Fix conllu script (#4579)
* force extensions to avoid clash between example scripts
* fix arg order and default file encoding
* add example config for conllu script
* newline
* move extension definitions to main function
* a few more encoding fixes
This commit is contained in:
		
							parent
							
								
									4e43c0ba93
								
							
						
					
					
						commit
						4ec7623288
					
				|  | @ -7,7 +7,6 @@ from __future__ import unicode_literals | |||
| import plac | ||||
| from pathlib import Path | ||||
| import re | ||||
| import sys | ||||
| import json | ||||
| 
 | ||||
| import spacy | ||||
|  | @ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words | |||
| from spacy.syntax.nonproj import projectivize | ||||
| from spacy.matcher import Matcher | ||||
| from spacy import displacy | ||||
| from collections import defaultdict, Counter | ||||
| from timeit import default_timer as timer | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| import itertools | ||||
| import random | ||||
| import numpy.random | ||||
| 
 | ||||
| from spacy import lang | ||||
| from spacy.lang import zh | ||||
|  | @ -323,10 +319,6 @@ def get_token_conllu(token, i): | |||
|     return "\n".join(lines) | ||||
| 
 | ||||
| 
 | ||||
| Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True) | ||||
| Token.set_extension("begins_fused", default=False, force=True) | ||||
| Token.set_extension("inside_fused", default=False, force=True) | ||||
| 
 | ||||
| 
 | ||||
| ################## | ||||
| # Initialization # | ||||
|  | @ -459,13 +451,13 @@ class TreebankPaths(object): | |||
| 
 | ||||
| @plac.annotations( | ||||
|     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), | ||||
|     parses_dir=("Directory to write the development parses", "positional", None, Path), | ||||
|     corpus=( | ||||
|         "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", | ||||
|         "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", | ||||
|         "positional", | ||||
|         None, | ||||
|         str, | ||||
|     ), | ||||
|     parses_dir=("Directory to write the development parses", "positional", None, Path), | ||||
|     config=("Path to json formatted config file", "option", "C", Path), | ||||
|     limit=("Size limit", "option", "n", int), | ||||
|     gpu_device=("Use GPU", "option", "g", int), | ||||
|  | @ -490,6 +482,10 @@ def main( | |||
|     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 | ||||
|     import tqdm | ||||
| 
 | ||||
|     Token.set_extension("get_conllu_lines", method=get_token_conllu) | ||||
|     Token.set_extension("begins_fused", default=False) | ||||
|     Token.set_extension("inside_fused", default=False) | ||||
| 
 | ||||
|     spacy.util.fix_random_seed() | ||||
|     lang.zh.Chinese.Defaults.use_jieba = False | ||||
|     lang.ja.Japanese.Defaults.use_janome = False | ||||
|  | @ -506,8 +502,8 @@ def main( | |||
| 
 | ||||
|     docs, golds = read_data( | ||||
|         nlp, | ||||
|         paths.train.conllu.open(), | ||||
|         paths.train.text.open(), | ||||
|         paths.train.conllu.open(encoding="utf8"), | ||||
|         paths.train.text.open(encoding="utf8"), | ||||
|         max_doc_length=config.max_doc_length, | ||||
|         limit=limit, | ||||
|     ) | ||||
|  |  | |||
							
								
								
									
										1
									
								
								examples/training/conllu-config.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								examples/training/conllu-config.json
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| {"nr_epoch": 3, "batch_size": 24, "dropout":  0.001, "vectors":  0, "multitask_tag":  0, "multitask_sent":  0} | ||||
|  | @ -13,8 +13,7 @@ import spacy.util | |||
| from spacy.tokens import Token, Doc | ||||
| from spacy.gold import GoldParse | ||||
| from spacy.syntax.nonproj import projectivize | ||||
| from collections import defaultdict, Counter | ||||
| from timeit import default_timer as timer | ||||
| from collections import defaultdict | ||||
| from spacy.matcher import Matcher | ||||
| 
 | ||||
| import itertools | ||||
|  | @ -290,11 +289,6 @@ def get_token_conllu(token, i): | |||
|     return "\n".join(lines) | ||||
| 
 | ||||
| 
 | ||||
| Token.set_extension("get_conllu_lines", method=get_token_conllu) | ||||
| Token.set_extension("begins_fused", default=False) | ||||
| Token.set_extension("inside_fused", default=False) | ||||
| 
 | ||||
| 
 | ||||
| ################## | ||||
| # Initialization # | ||||
| ################## | ||||
|  | @ -381,20 +375,24 @@ class TreebankPaths(object): | |||
| 
 | ||||
| @plac.annotations( | ||||
|     ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), | ||||
|     parses_dir=("Directory to write the development parses", "positional", None, Path), | ||||
|     config=("Path to json formatted config file", "positional", None, Config.load), | ||||
|     corpus=( | ||||
|         "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", | ||||
|         "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora", | ||||
|         "positional", | ||||
|         None, | ||||
|         str, | ||||
|     ), | ||||
|     parses_dir=("Directory to write the development parses", "positional", None, Path), | ||||
|     config=("Path to json formatted config file", "positional", None, Config.load), | ||||
|     limit=("Size limit", "option", "n", int), | ||||
| ) | ||||
| def main(ud_dir, parses_dir, config, corpus, limit=0): | ||||
|     # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200 | ||||
|     import tqdm | ||||
| 
 | ||||
|     Token.set_extension("get_conllu_lines", method=get_token_conllu) | ||||
|     Token.set_extension("begins_fused", default=False) | ||||
|     Token.set_extension("inside_fused", default=False) | ||||
| 
 | ||||
|     paths = TreebankPaths(ud_dir, corpus) | ||||
|     if not (parses_dir / corpus).exists(): | ||||
|         (parses_dir / corpus).mkdir() | ||||
|  | @ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0): | |||
| 
 | ||||
|     docs, golds = read_data( | ||||
|         nlp, | ||||
|         paths.train.conllu.open(), | ||||
|         paths.train.text.open(), | ||||
|         paths.train.conllu.open(encoding="utf8"), | ||||
|         paths.train.text.open(encoding="utf8"), | ||||
|         max_doc_length=config.max_doc_length, | ||||
|         limit=limit, | ||||
|     ) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user