mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Fix conllu script (#4579)
* force extensions to avoid clash between example scripts * fix arg order and default file encoding * add example config for conllu script * newline * move extension definitions to main function * few more encodings fixes
This commit is contained in:
parent
4e43c0ba93
commit
4ec7623288
|
@ -7,7 +7,6 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
|
||||
import spacy
|
||||
|
@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
|
|||
from spacy.syntax.nonproj import projectivize
|
||||
from spacy.matcher import Matcher
|
||||
from spacy import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
from collections import defaultdict
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from spacy import lang
|
||||
from spacy.lang import zh
|
||||
|
@ -323,10 +319,6 @@ def get_token_conllu(token, i):
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
|
||||
Token.set_extension("begins_fused", default=False, force=True)
|
||||
Token.set_extension("inside_fused", default=False, force=True)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
|
@ -459,13 +451,13 @@ class TreebankPaths(object):
|
|||
|
||||
@plac.annotations(
|
||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
corpus=(
|
||||
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "option", "C", Path),
|
||||
limit=("Size limit", "option", "n", int),
|
||||
gpu_device=("Use GPU", "option", "g", int),
|
||||
|
@ -490,6 +482,10 @@ def main(
|
|||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
spacy.util.fix_random_seed()
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
@ -506,8 +502,8 @@ def main(
|
|||
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
)
|
||||
|
|
1
examples/training/conllu-config.json
Normal file
1
examples/training/conllu-config.json
Normal file
|
@ -0,0 +1 @@
|
|||
{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
|
|
@ -13,8 +13,7 @@ import spacy.util
|
|||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
from collections import defaultdict
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
import itertools
|
||||
|
@ -290,11 +289,6 @@ def get_token_conllu(token, i):
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
##################
|
||||
|
@ -381,20 +375,24 @@ class TreebankPaths(object):
|
|||
|
||||
@plac.annotations(
|
||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "positional", None, Config.load),
|
||||
corpus=(
|
||||
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "positional", None, Config.load),
|
||||
limit=("Size limit", "option", "n", int),
|
||||
)
|
||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||
import tqdm
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
paths = TreebankPaths(ud_dir, corpus)
|
||||
if not (parses_dir / corpus).exists():
|
||||
(parses_dir / corpus).mkdir()
|
||||
|
@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|||
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue
Block a user