mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
This commit is contained in:
parent
0369db75c1
commit
37c7c85a86
|
@ -11,6 +11,8 @@ ujson>=1.35
|
|||
dill>=0.2,<0.3
|
||||
regex==2018.01.10
|
||||
requests>=2.13.0,<3.0.0
|
||||
jsonschema>=2.6.0,<3.0.0
|
||||
wasabi>=0.0.8,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
# Development dependencies
|
||||
pytest>=4.0.0,<5.0.0
|
||||
|
|
2
setup.py
2
setup.py
|
@ -207,6 +207,8 @@ def setup_package():
|
|||
"regex==2018.01.10",
|
||||
"dill>=0.2,<0.3",
|
||||
"requests>=2.13.0,<3.0.0",
|
||||
"jsonschema>=2.6.0,<3.0.0",
|
||||
"wasabi>=0.0.8,<1.1.0",
|
||||
'pathlib==1.0.1; python_version < "3.4"',
|
||||
],
|
||||
setup_requires=["wheel"],
|
||||
|
|
|
@ -1,40 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
|
||||
# NB! This breaks in plac on Python 2!!
|
||||
# from __future__ import unicode_literals
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
||||
from spacy.cli import vocab, init_model, profile, evaluate, validate
|
||||
from spacy.cli import ud_train, ud_evaluate
|
||||
from spacy.util import prints
|
||||
from spacy.cli import init_model, profile, evaluate, validate
|
||||
from spacy.cli import ud_train, ud_evaluate, debug_data
|
||||
|
||||
msg = Printer()
|
||||
|
||||
commands = {
|
||||
'download': download,
|
||||
'link': link,
|
||||
'info': info,
|
||||
'train': train,
|
||||
'pretrain': pretrain,
|
||||
'ud-train': ud_train,
|
||||
'evaluate': evaluate,
|
||||
'ud-evaluate': ud_evaluate,
|
||||
'convert': convert,
|
||||
'package': package,
|
||||
'vocab': vocab,
|
||||
'init-model': init_model,
|
||||
'profile': profile,
|
||||
'validate': validate
|
||||
"download": download,
|
||||
"link": link,
|
||||
"info": info,
|
||||
"train": train,
|
||||
"pretrain": pretrain,
|
||||
"debug-data": debug_data,
|
||||
"ud-train": ud_train,
|
||||
"evaluate": evaluate,
|
||||
"ud-evaluate": ud_evaluate,
|
||||
"convert": convert,
|
||||
"package": package,
|
||||
"init-model": init_model,
|
||||
"profile": profile,
|
||||
"validate": validate,
|
||||
}
|
||||
if len(sys.argv) == 1:
|
||||
prints(', '.join(commands), title="Available commands", exits=1)
|
||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
sys.argv[0] = 'spacy %s' % command
|
||||
sys.argv[0] = "spacy %s" % command
|
||||
if command in commands:
|
||||
plac.call(commands[command], sys.argv[1:])
|
||||
else:
|
||||
prints(
|
||||
"Available: %s" % ', '.join(commands),
|
||||
title="Unknown command: %s" % command,
|
||||
exits=1)
|
||||
available = "Available: {}".format(", ".join(commands))
|
||||
msg.fail("Unknown command: {}".format(command), available, exits=1)
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
from .download import download
|
||||
from .info import info
|
||||
from .link import link
|
||||
from .package import package
|
||||
from .profile import profile
|
||||
from .train import train
|
||||
from .pretrain import pretrain
|
||||
from .evaluate import evaluate
|
||||
from .convert import convert
|
||||
from .vocab import make_vocab as vocab
|
||||
from .init_model import init_model
|
||||
from .validate import validate
|
||||
from .ud_train import main as ud_train
|
||||
from .conll17_ud_eval import main as ud_evaluate
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .link import link # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .train import train # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .ud import ud_train, ud_evaluate # noqa: F401
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# fmt: off
|
||||
|
||||
class Messages(object):
|
||||
M001 = ("Download successful but linking failed")
|
||||
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
|
@ -73,3 +75,31 @@ class Messages(object):
|
|||
M052 = ("Not a valid meta.json format")
|
||||
M053 = ("Expected dict but got: {meta_type}")
|
||||
M054 = ("No --lang specified, but tokenization required.")
|
||||
M055 = ("Training pipeline: {pipeline}")
|
||||
M056 = ("Starting with base model '{model}'")
|
||||
M057 = ("Starting with blank model '{model}'")
|
||||
M058 = ("Loading vector from model '{model}'")
|
||||
M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
|
||||
M060 = ("Counting training words (limit={limit})")
|
||||
M061 = ("\nSaving model...")
|
||||
M062 = ("Output directory is not empty.")
|
||||
M063 = ("Incompatible arguments")
|
||||
M064 = ("The -f and -c arguments are deprecated, and not compatible with "
|
||||
"the -j argument, which should specify the same information. "
|
||||
"Either merge the frequencies and clusters data into the "
|
||||
"JSONL-formatted file (recommended), or use only the -f and -c "
|
||||
"files, without the other lexical attributes.")
|
||||
M065 = ("This can lead to unintended side effects when saving the model. "
|
||||
"Please use an empty directory or a different path instead. If "
|
||||
"the specified output path doesn't exist, the directory will be "
|
||||
"created for you.")
|
||||
M066 = ("Saved model to output directory")
|
||||
M067 = ("Can't find lexical data")
|
||||
M068 = ("Sucessfully compiled vocab and vectors, and saved model")
|
||||
M069 = ("Unknown file type: '{name}'")
|
||||
M070 = ("Supported file types: '{options}'")
|
||||
M071 = ("Loaded pretrained tok2vec for: {components}")
|
||||
M072 = ("Model language ('{model_lang}') doesn't match language specified "
|
||||
"as `lang` argument ('{lang}') ")
|
||||
|
||||
# fmt: on
|
||||
|
|
|
@ -3,49 +3,91 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ..util import write_jsonl, write_json
|
||||
from ..compat import json_dumps, path2str
|
||||
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
||||
from .converters import ner_jsonl2json
|
||||
from ._messages import Messages
|
||||
from ..util import prints
|
||||
|
||||
|
||||
# Converters are matched by file extension. To add a converter, add a new
|
||||
# entry to this dict with the file extension mapped to the converter function
|
||||
# imported from /converters.
|
||||
CONVERTERS = {
|
||||
'conllubio': conllubio2json,
|
||||
'conllu': conllu2json,
|
||||
'conll': conllu2json,
|
||||
'ner': conll_ner2json,
|
||||
'iob': iob2json,
|
||||
'jsonl': ner_jsonl2json
|
||||
"conllubio": conllubio2json,
|
||||
"conllu": conllu2json,
|
||||
"conll": conllu2json,
|
||||
"ner": conll_ner2json,
|
||||
"iob": iob2json,
|
||||
"jsonl": ner_jsonl2json,
|
||||
}
|
||||
|
||||
# File types
|
||||
FILE_TYPES = ("json", "jsonl")
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("input file", "positional", None, str),
|
||||
output_dir=("output directory for converted file", "positional", None, str),
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory for converted file", "positional", None, str),
|
||||
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool))
|
||||
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
|
||||
lang=None):
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
output_dir="-",
|
||||
file_type="jsonl",
|
||||
n_sents=1,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
lang=None,
|
||||
):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
experiment management functions. If no output_dir is specified, the data
|
||||
is written to stdout, so you can pipe them forward to a JSONL file:
|
||||
$ spacy convert some_file.conllu > some_file.jsonl
|
||||
"""
|
||||
msg = Printer()
|
||||
input_path = Path(input_file)
|
||||
output_path = Path(output_dir)
|
||||
if file_type not in FILE_TYPES:
|
||||
msg.fail(
|
||||
Messages.M069.format(name=file_type),
|
||||
Messages.M070.format(options=", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if not input_path.exists():
|
||||
prints(input_path, title=Messages.M028, exits=1)
|
||||
if not output_path.exists():
|
||||
prints(output_path, title=Messages.M029, exits=1)
|
||||
if converter == 'auto':
|
||||
msg.fail(Messages.M028, input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
msg.fail(Messages.M029, output_dir, exits=1)
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter not in CONVERTERS:
|
||||
prints(Messages.M031.format(converter=converter),
|
||||
title=Messages.M030, exits=1)
|
||||
msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
func(input_path, output_path,
|
||||
n_sents=n_sents, use_morphology=morphology, lang=lang)
|
||||
input_data = input_path.open("r", encoding="utf-8").read()
|
||||
data = func(input_data, nsents=n_sents, use_morphology=morphology, lang=lang)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||
if file_type == "json":
|
||||
write_json(output_file, data)
|
||||
elif file_type == "jsonl":
|
||||
write_jsonl(output_file, data)
|
||||
msg.good(
|
||||
Messages.M032.format(name=path2str(output_file)),
|
||||
Messages.M033.format(n_docs=len(data)),
|
||||
)
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
print(json_dumps(data))
|
||||
elif file_type == "jsonl":
|
||||
for line in data:
|
||||
print(json_dumps(line))
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from .conllu2json import conllu2json
|
||||
from .conllubio2json import conllubio2json
|
||||
from .iob2json import iob2json
|
||||
from .conll_ner2json import conll_ner2json
|
||||
from .jsonl2json import ner_jsonl2json
|
||||
from .conllu2json import conllu2json # noqa: F401
|
||||
from .conllubio2json import conllubio2json # noqa: F401
|
||||
from .iob2json import iob2json # noqa: F401
|
||||
from .conll_ner2json import conll_ner2json # noqa: F401
|
||||
from .jsonl2json import ner_jsonl2json # noqa: F401
|
||||
|
|
|
@ -1,52 +1,38 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
def conll_ner2json(input_data, **kwargs):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format into JSON format for use with
|
||||
train cli.
|
||||
"""
|
||||
docs = read_conll_ner(input_path)
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
|
||||
|
||||
def read_conll_ner(input_path):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
i = 0
|
||||
delimit_docs = '-DOCSTART- -X- O O'
|
||||
delimit_docs = "-DOCSTART- -X- O O"
|
||||
output_docs = []
|
||||
for doc in text.strip().split(delimit_docs):
|
||||
for doc in input_data.strip().split(delimit_docs):
|
||||
doc = doc.strip()
|
||||
if not doc:
|
||||
continue
|
||||
output_doc = []
|
||||
for sent in doc.split('\n\n'):
|
||||
for sent in doc.split("\n\n"):
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
continue
|
||||
lines = [line.strip() for line in sent.split('\n') if line.strip()]
|
||||
lines = [line.strip() for line in sent.split("\n") if line.strip()]
|
||||
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
|
||||
biluo_ents = iob_to_biluo(iob_ents)
|
||||
output_doc.append({'tokens': [
|
||||
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
|
||||
zip(words, tags, biluo_ents)
|
||||
]})
|
||||
output_docs.append({
|
||||
'id': len(output_docs),
|
||||
'paragraphs': [{'sentences': output_doc}]
|
||||
})
|
||||
output_doc.append(
|
||||
{
|
||||
"tokens": [
|
||||
{"orth": w, "tag": tag, "ner": ent}
|
||||
for (w, tag, ent) in zip(words, tags, biluo_ents)
|
||||
]
|
||||
}
|
||||
)
|
||||
output_docs.append(
|
||||
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
|
||||
)
|
||||
output_doc = []
|
||||
return output_docs
|
||||
|
|
|
@ -1,34 +1,27 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
import re
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
|
||||
"""
|
||||
Extract NER tags if available and convert them so that they follow
|
||||
BILUO and the Wikipedia scheme
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
# by @katarkor
|
||||
|
||||
docs = []
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
checked_for_ner = False
|
||||
has_ner_tags = False
|
||||
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
if not checked_for_ner:
|
||||
|
@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
|
|||
sentences.append(generate_sentence(sentence, has_ner_tags))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
|
||||
if(len(sentences) % n_sents == 0):
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
sentences = []
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
return docs
|
||||
|
||||
|
||||
def is_ner(tag):
|
||||
|
||||
"""
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
"""
|
||||
|
||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
"""
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
return True
|
||||
elif tag == "O":
|
||||
|
@ -67,29 +50,29 @@ def is_ner(tag):
|
|||
else:
|
||||
return False
|
||||
|
||||
def read_conllx(input_path, use_morphology=False, n=0):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith('#'):
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
||||
if '-' in id_ or '.' in id_:
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tag = pos if tag == '_' else tag
|
||||
tag = tag+'__'+morph if use_morphology else tag
|
||||
head = (int(head) - 1) if head != "0" else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
tokens.append((id_, word, tag, head, dep, iob))
|
||||
except:
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
|
@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
|||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def simplify_tags(iob):
|
||||
|
||||
"""
|
||||
Simplify tags obtained from the dataset in order to follow Wikipedia
|
||||
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
||||
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
||||
'MISC'.
|
||||
'MISC'.
|
||||
"""
|
||||
|
||||
new_iob = []
|
||||
for tag in iob:
|
||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
prefix = tag_match.group(1)
|
||||
suffix = tag_match.group(2)
|
||||
if suffix == 'GPE_LOC':
|
||||
suffix = 'LOC'
|
||||
elif suffix == 'GPE_ORG':
|
||||
suffix = 'ORG'
|
||||
elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
|
||||
suffix = 'MISC'
|
||||
tag = prefix + '-' + suffix
|
||||
if suffix == "GPE_LOC":
|
||||
suffix = "LOC"
|
||||
elif suffix == "GPE_ORG":
|
||||
suffix = "ORG"
|
||||
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
|
||||
suffix = "MISC"
|
||||
tag = prefix + "-" + suffix
|
||||
new_iob.append(tag)
|
||||
return new_iob
|
||||
|
||||
|
||||
def generate_sentence(sent, has_ner_tags):
|
||||
(id_, word, tag, head, dep, iob) = sent
|
||||
sentence = {}
|
||||
|
@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
|
|||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences,id):
|
||||
def create_doc(sentences, id):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
|
|
|
@ -1,65 +1,54 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
||||
|
||||
def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
|
||||
docs = []
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
||||
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
sentences.append(generate_sentence(sentence))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conllu document
|
||||
if(len(sentences) % n_sents == 0):
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
sentences = []
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
return docs
|
||||
|
||||
|
||||
def read_conllx(input_path, use_morphology=False, n=0):
|
||||
text = input_path.open('r', encoding='utf-8').read()
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in text.strip().split('\n\n'):
|
||||
lines = sent.strip().split('\n')
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith('#'):
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split('\t')
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
|
||||
if '-' in id_ or '.' in id_:
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tag = pos if tag == '_' else tag
|
||||
tag = tag+'__'+morph if use_morphology else tag
|
||||
ner = ner if ner else 'O'
|
||||
head = (int(head) - 1) if head != "0" else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
ner = ner if ner else "O"
|
||||
tokens.append((id_, word, tag, head, dep, ner))
|
||||
except:
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
|
@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
|||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def generate_sentence(sent):
|
||||
(id_, word, tag, head, dep, ner) = sent
|
||||
sentence = {}
|
||||
|
@ -85,7 +75,7 @@ def generate_sentence(sent):
|
|||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences,id):
|
||||
def create_doc(sentences, id):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
|
|
|
@ -1,26 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from cytoolz import partition_all, concat
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from cytoolz import partition_all
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
||||
def iob2json(input_data, n_sents=10, *args, **kwargs):
|
||||
"""
|
||||
Convert IOB files into JSON format for use with train cli.
|
||||
"""
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
sentences = read_iob(file_)
|
||||
docs = merge_sentences(sentences, n_sents)
|
||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
docs = []
|
||||
for group in partition_all(n_sents, docs):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first["paragraphs"][0]["sentences"]
|
||||
for sent in group[1:]:
|
||||
to_extend.extend(sent["paragraphs"][0]["sentences"])
|
||||
docs.append(first)
|
||||
return docs
|
||||
|
||||
|
||||
def read_iob(raw_sents):
|
||||
|
@ -28,30 +26,20 @@ def read_iob(raw_sents):
|
|||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split('|') for t in line.split()]
|
||||
tokens = [t.split("|") for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
else:
|
||||
words, iob = zip(*tokens)
|
||||
pos = ['-'] * len(words)
|
||||
pos = ["-"] * len(words)
|
||||
biluo = iob_to_biluo(iob)
|
||||
sentences.append([
|
||||
{'orth': w, 'tag': p, 'ner': ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
])
|
||||
sentences = [{'tokens': sent} for sent in sentences]
|
||||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
||||
sentences.append(
|
||||
[
|
||||
{"orth": w, "tag": p, "ner": ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
]
|
||||
)
|
||||
sentences = [{"tokens": sent} for sent in sentences]
|
||||
paragraphs = [{"sentences": [sent]} for sent in sentences]
|
||||
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
|
||||
return docs
|
||||
|
||||
def merge_sentences(docs, n_sents):
|
||||
counter = 0
|
||||
merged = []
|
||||
for group in partition_all(n_sents, docs):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first['paragraphs'][0]['sentences']
|
||||
for sent in group[1:]:
|
||||
to_extend.extend(sent['paragraphs'][0]['sentences'])
|
||||
merged.append(first)
|
||||
return merged
|
||||
|
|
|
@ -1,33 +1,21 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import ujson as json
|
||||
|
||||
import ujson
|
||||
|
||||
from ...util import get_lang_class
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints, get_lang_class
|
||||
from ...gold import docs_to_json
|
||||
|
||||
|
||||
def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
|
||||
if lang is None:
|
||||
prints(Messages.M054, exits=True)
|
||||
raise ValueError(Messages.M054)
|
||||
json_docs = []
|
||||
input_tuples = list(read_jsonl(input_path))
|
||||
input_tuples = [ujson.loads(line) for line in input_data]
|
||||
nlp = get_lang_class(lang)()
|
||||
for i, (raw_text, ents) in enumerate(input_tuples):
|
||||
doc = nlp.make_doc(raw_text)
|
||||
doc[0].is_sent_start = True
|
||||
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
|
||||
json_docs.append(docs_to_json(i, [doc]))
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".jsonl", ".json")
|
||||
output_loc = output_path / output_filename
|
||||
with (output_loc).open('w', encoding='utf8') as file_:
|
||||
file_.write(json_dumps(json_docs))
|
||||
prints(Messages.M033.format(n_docs=len(json_docs)),
|
||||
title=Messages.M032.format(name=path2str(output_loc)))
|
||||
|
||||
def read_jsonl(input_path):
|
||||
with input_path.open('r', encoding='utf8') as file_:
|
||||
for line in file_:
|
||||
yield json.loads(line)
|
||||
doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
|
||||
json_docs.append(doc.to_json())
|
||||
return json_docs
|
||||
|
|
398
spacy/cli/debug_data.py
Normal file
398
spacy/cli/debug_data.py
Normal file
|
@ -0,0 +1,398 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import Printer, MESSAGES
|
||||
|
||||
from ..gold import GoldCorpus, read_json_object
|
||||
from ..util import load_model, get_lang_class, read_json, read_jsonl
|
||||
|
||||
# from .schemas import get_schema, validate_json
|
||||
from ._messages import Messages
|
||||
|
||||
|
||||
# Minimum number of expected occurrences of label in data to train new label
|
||||
NEW_LABEL_THRESHOLD = 50
|
||||
# Minimum number of expected examples to train a blank model
|
||||
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||
BLANK_MODEL_THRESHOLD = 2000
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("model language", "positional", None, str),
    train_path=("location of JSON-formatted training data", "positional", None, Path),
    dev_path=("location of JSON-formatted development data", "positional", None, Path),
    base_model=("name of model to update (optional)", "option", "b", str),
    pipeline=(
        "Comma-separated names of pipeline components to train",
        "option",
        "p",
        str,
    ),
    ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
    ignore_validation=(
        "Don't exit if JSON format validation fails",
        "flag",
        "IV",
        bool,
    ),
    verbose=("Print additional information and explanations", "flag", "V", bool),
    no_format=("Don't pretty-print the results", "flag", "NF", bool),
)
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    """
    Analyze training and development data and report statistics, warnings
    and errors for the pipeline components that should be trained.

    lang (unicode): Language code, used to create a blank model if no
        base model is given.
    train_path (Path): Location of the .json or .jsonl training data.
    dev_path (Path): Location of the .json or .jsonl development data.
    base_model (unicode): Optional name of a model to load and update.
    pipeline (unicode): Comma-separated names of the components to check.
    ignore_warnings (bool): Passed on to the printer to hide warnings.
    ignore_validation (bool): Don't exit if format validation fails.
    verbose (bool): Also show the additional explanatory messages.
    no_format (bool): Disable pretty-printing of the output.

    Exits with status 1 if a path is missing, if validation fails (unless
    ignore_validation is set) or if any failure message was recorded.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exist if they are needed
    if not train_path.exists():
        msg.fail(Messages.M050, train_path, exits=1)
    if not dev_path.exists():
        msg.fail(Messages.M051, dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")
    # Load the data in one – might take a while but okay in this case
    with msg.loading("Loading {}...".format(train_path.parts[-1])):
        train_data = _load_file(train_path, msg)
    with msg.loading("Loading {}...".format(dev_path.parts[-1])):
        dev_data = _load_file(dev_path, msg)

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # schema = get_schema("training")
    train_data_errors = []  # TODO: validate_json(train_data, schema)
    dev_data_errors = []  # TODO: validate_json(dev_data, schema)
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Development data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    with msg.loading("Analyzing corpus..."):
        train_data = read_json_object(train_data)
        dev_data = read_json_object(dev_data)
        corpus = GoldCorpus(train_data, dev_data)
        train_docs = list(corpus.train_docs(nlp))
        dev_docs = list(corpus.dev_docs(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_data = _compile_gold(train_docs, pipeline)
    train_texts = gold_data["texts"]
    dev_texts = set([doc.text for doc, gold in dev_docs])

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        # Below the hard minimum this is an error, otherwise only a warning
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
        )
    )
    most_common_words = gold_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
        label_counts = gold_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            # Only list the labels that are actually new; existing ones are
            # reported separately below ("-" is never part of new_labels)
            labels_with_counts = [
                (name, count)
                for name, count in label_counts.most_common()
                if name in new_labels
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                # NOTE(review): nesting of this check under the low-count
                # branch is reconstructed – confirm against the original file
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        # _compile_gold collects the text categories under the "cats" key
        cat_counts = gold_data["cats"]
        labels = [label for label in cat_counts]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                [
                    (name, count)
                    for name, count in cat_counts.most_common()
                    if name in new_labels
                ],
                counts=True,
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")
        labels = [label for label in gold_data["deps"]]
        msg.info(
            "{} {} in data".format(
                len(labels), "label" if len(labels) == 1 else "labels"
            )
        )
        labels_with_counts = _format_labels(
            gold_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    msg.divider("Summary")
    # Read all counters before printing the summary lines, so the summary
    # messages themselves are not included in the totals
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
        # Signal the failed checks to the calling shell
        sys.exit(1)
|
||||
|
||||
|
||||
def _load_file(file_path, msg):
|
||||
file_name = file_path.parts[-1]
|
||||
if file_path.suffix == ".json":
|
||||
data = read_json(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
return data
|
||||
elif file_path.suffix == ".jsonl":
|
||||
data = read_jsonl(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
return data
|
||||
msg.fail(
|
||||
"Can't load file extension {}".format(file_path.suffix),
|
||||
"Expected .json or .jsonl",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def _compile_gold(train_docs, pipeline):
|
||||
data = {
|
||||
"ner": Counter(),
|
||||
"cats": Counter(),
|
||||
"tags": Counter(),
|
||||
"deps": Counter(),
|
||||
"words": Counter(),
|
||||
"n_words": 0,
|
||||
"texts": set(),
|
||||
}
|
||||
for doc, gold in train_docs:
|
||||
data["words"].update(gold.words)
|
||||
data["n_words"] += len(gold.words)
|
||||
data["texts"].add(doc.text)
|
||||
if "ner" in pipeline:
|
||||
for label in gold.ner:
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
data["ner"][combined_label] += 1
|
||||
elif label == "-":
|
||||
data["ner"]["-"] += 1
|
||||
if "textcat" in pipeline:
|
||||
data["cats"].update(gold.cats)
|
||||
if "tagger" in pipeline:
|
||||
data["tags"].update(gold.tags)
|
||||
if "parser" in pipeline:
|
||||
data["deps"].update(gold.labels)
|
||||
return data
|
||||
|
||||
|
||||
def _format_labels(labels, counts=False):
|
||||
if counts:
|
||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||
return ", ".join(["'{}'".format(l) for l in labels])
|
||||
|
||||
|
||||
def _get_ner_counts(data):
|
||||
counter = Counter()
|
||||
for doc, gold in data:
|
||||
for label in gold.ner:
|
||||
if label.startswith(("B-", "U-")):
|
||||
combined_label = label.split("-")[1]
|
||||
counter[combined_label] += 1
|
||||
elif label == "-":
|
||||
counter["-"] += 1
|
||||
return counter
|
||||
|
||||
|
||||
def _get_examples_without_label(data, label):
|
||||
count = 0
|
||||
for doc, gold in data:
|
||||
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def _get_labels_from_model(nlp, pipe_name):
|
||||
if pipe_name not in nlp.pipe_names:
|
||||
return set()
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
return pipe.labels
|
|
@ -6,34 +6,37 @@ import requests
|
|||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from .link import link
|
||||
from ..util import prints, get_package_path
|
||||
from ..util import get_package_path
|
||||
from .. import about
|
||||
|
||||
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model to download, shortcut or name", "positional", None, str),
|
||||
direct=("force direct download. Needs model name with version and won't "
|
||||
"perform compatibility check", "flag", "d", bool),
|
||||
pip_args=("additional arguments to be passed to `pip install` when "
|
||||
"installing the model"))
|
||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||
pip_args=("additional arguments to be passed to `pip install` on model install"),
|
||||
)
|
||||
def download(model, direct=False, *pip_args):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
with version. For direct downloads, the compatibility check will be skipped.
|
||||
"""
|
||||
if direct:
|
||||
dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
|
||||
dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
|
||||
else:
|
||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||
model_name = shortcuts.get(model, model)
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
|
||||
.format(m=model_name, v=version), pip_args)
|
||||
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
|
||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
||||
sys.exit(dl)
|
||||
try:
|
||||
|
@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
|
|||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(model_name, model, force=True, model_path=package_path)
|
||||
except:
|
||||
except: # noqa: E722
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
prints(Messages.M001, title=Messages.M002.format(name=model_name))
|
||||
msg.warn(Messages.M002.format(name=model_name), Messages.M001)
|
||||
|
||||
|
||||
def get_json(url, desc):
    """Fetch a URL and return its decoded JSON body.

    url (unicode): The URL to request.
    desc (unicode): Description of the resource, used in the error message.
    RETURNS: The parsed JSON response. Any status other than 200 is
        reported as a failure with exits=1.
    """
    r = requests.get(url)
    if r.status_code != 200:
        msg.fail(
            Messages.M003.format(code=r.status_code),
            Messages.M004.format(desc=desc, version=about.__version__),
            exits=1,
        )
    return r.json()
|
||||
|
||||
|
||||
def get_compatibility():
    """Return the compatibility table entry for the installed spaCy version.

    RETURNS: The entry of the table's "spacy" section for the current
        version. A missing version is reported as a failure with exits=1.
    """
    # Drop a trailing ".dev..." suffix before looking up the version
    version = about.__version__.rsplit(".dev", 1)[0]
    comp_table = get_json(about.__compatibility__, "compatibility table")
    comp = comp_table["spacy"]
    if version not in comp:
        msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
    return comp[version]
|
||||
|
||||
|
||||
def get_version(model, comp):
    """Return the first version listed for a model in a compatibility entry.

    model (unicode): Model name; a trailing ".dev..." suffix is ignored.
    comp (dict): Maps model names to lists of versions.
    RETURNS: The first version listed for the model. An unknown model is
        reported as a failure with exits=1.
    """
    model = model.rsplit(".dev", 1)[0]
    if model not in comp:
        msg.fail(
            Messages.M005,
            Messages.M007.format(name=model, version=about.__version__),
            exits=1,
        )
    return comp[model][0]
|
||||
|
||||
|
||||
def download_model(filename, user_pip_args=None):
    """Install a model archive with pip, run in a subprocess.

    filename (unicode): Path of the archive, relative to the download URL.
    user_pip_args (iterable): Additional arguments for `pip install`.
    RETURNS (int): The return code of the pip subprocess.
    """
    download_url = about.__download_url__ + "/" + filename
    pip_args = ["--no-cache-dir", "--no-deps"]
    if user_pip_args:
        pip_args.extend(user_pip_args)
    # Call pip through the running interpreter
    cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
    return subprocess.call(cmd, env=os.environ.copy())
|
||||
|
|
|
@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function
|
|||
|
||||
import plac
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("model name or path", "positional", None, str),
|
||||
data_path=("location of JSON-formatted evaluation data", "positional",
|
||||
None, str),
|
||||
gold_preproc=("use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("use GPU", "option", "g", int),
|
||||
displacy_path=("directory to output rendered parses as HTML", "option",
|
||||
"dp", str),
|
||||
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
|
||||
def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
|
||||
displacy_limit=25):
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
||||
)
|
||||
def evaluate(
|
||||
model,
|
||||
data_path,
|
||||
gpu_id=-1,
|
||||
gold_preproc=False,
|
||||
displacy_path=None,
|
||||
displacy_limit=25,
|
||||
):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
|
@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
data_path = util.ensure_path(data_path)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
if not data_path.exists():
|
||||
prints(data_path, title=Messages.M034, exits=1)
|
||||
msg.fail(Messages.M034, data_path, exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title=Messages.M035, exits=1)
|
||||
msg.fail(Messages.M035, displacy_path, exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
|
@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
print_results(scorer, time=end - begin, words=nwords,
|
||||
wps=nwords / (end - begin))
|
||||
# The arithmetic has to be parenthesized: "%" binds tighter than "-" and has
# the same precedence as "/", so without the parentheses the string would be
# formatted first and then used as an operand, raising a TypeError.
results = {
    "Time": "%.2f s" % (end - begin),
    "Words": nwords,
    "Words/s": "%.0f" % (nwords / (end - begin)),
    "TOK": "%.2f" % scorer.token_acc,
    "POS": "%.2f" % scorer.tags_acc,
    "UAS": "%.2f" % scorer.uas,
    "LAS": "%.2f" % scorer.las,
    "NER P": "%.2f" % scorer.ents_p,
    "NER R": "%.2f" % scorer.ents_r,
    "NER F": "%.2f" % scorer.ents_f,
}
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if displacy_path:
|
||||
docs, golds = zip(*dev_docs)
|
||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model,
|
||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
|
||||
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
model_name=model,
|
||||
limit=displacy_limit,
|
||||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
)
|
||||
msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
    """Render docs with displaCy and write the HTML pages to a directory.

    docs (sequence): The docs to render; the first doc's user_data receives
        the model name as its "title".
    output_path (Path): Directory for entities.html and parses.html.
    model_name (unicode): Stored as the title on the first doc.
    limit (int): Maximum number of docs to render.
    deps (bool): Write the dependency parses to parses.html.
    ents (bool): Write the named entities to entities.html.
    """
    docs[0].user_data["title"] = model_name
    if ents:
        html = displacy.render(docs[:limit], style="ent", page=True)
        # Write as UTF-8 explicitly, so the result doesn't depend on the
        # platform's default encoding
        with (output_path / "entities.html").open("w", encoding="utf8") as file_:
            file_.write(html)
    if deps:
        html = displacy.render(
            docs[:limit], style="dep", page=True, options={"compact": True}
        )
        with (output_path / "parses.html").open("w", encoding="utf8") as file_:
            file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, wps=0.0):
    """Print one tab-separated row of losses and scores for an iteration.

    itn (int): Iteration number.
    losses (dict): Losses keyed by "parser", "ner" and "tagger".
    dev_scores (dict): Scores that override the 0.0 defaults.
    wps (float): Words per second.
    """
    scores = {}
    # Default every column to 0.0 so the template can always be filled
    for col in [
        "dep_loss",
        "tag_loss",
        "uas",
        "tags_acc",
        "token_acc",
        "ents_p",
        "ents_r",
        "ents_f",
        "wps",
    ]:
        scores[col] = 0.0
    scores["dep_loss"] = losses.get("parser", 0.0)
    scores["ner_loss"] = losses.get("ner", 0.0)
    scores["tag_loss"] = losses.get("tagger", 0.0)
    scores.update(dev_scores)
    scores["wps"] = wps
    tpl = "\t".join(
        (
            "{:d}",
            "{dep_loss:.3f}",
            "{ner_loss:.3f}",
            "{uas:.3f}",
            "{ents_p:.3f}",
            "{ents_r:.3f}",
            "{ents_f:.3f}",
            "{tags_acc:.3f}",
            "{token_acc:.3f}",
            "{wps:.1f}",
        )
    )
    print(tpl.format(itn, **scores))
|
||||
|
||||
|
||||
def print_results(scorer, time, words, wps):
    """Print timing, throughput and accuracy scores as a "Results" table.

    scorer: Object exposing token_acc, tags_acc, uas, las, ents_p, ents_r
        and ents_f.
    time (float): Elapsed time in seconds.
    words (int): Number of words processed.
    wps (float): Words per second.
    """
    results = {"Time": "%.2f s" % time, "Words": words, "Words/s": "%.0f" % wps}
    score_columns = (
        ("TOK", "token_acc"),
        ("POS", "tags_acc"),
        ("UAS", "uas"),
        ("LAS", "las"),
        ("NER P", "ents_p"),
        ("NER R", "ents_r"),
        ("NER F", "ents_f"),
    )
    for heading, attr in score_columns:
        results[heading] = "%.2f" % getattr(scorer, attr)
    util.print_table(results, title="Results")
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str
|
||||
|
@ -12,56 +13,65 @@ from .. import about
|
|||
|
||||
|
||||
@plac.annotations(
    model=("Optional shortcut link of model", "positional", None, str),
    markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
    silent=("Don't print anything (just return)", "flag", "s"),
)
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. Flag --markdown
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    msg = Printer()
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail(Messages.M020, meta_path, exits=1)
        meta = util.read_json(meta_path)
        # If resolving changes the path, report both the link and its target
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            # Accuracy and speed are left out of the printed overview, but
            # stay part of the returned meta
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                util.print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            util.print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
|
||||
|
||||
|
||||
def print_info(data, title, markdown):
    """Print data under an "Info about ..." title.

    data (dict): The values to print.
    title (unicode): What the info is about; inserted into the title.
    markdown (bool): Print as Markdown instead of as a table.
    """
    full_title = "Info about %s" % title
    printer = util.print_markdown if markdown else util.print_table
    printer(data, title=full_title)
|
||||
|
||||
|
||||
def list_models():
    """List the directories found in spaCy's data path.

    RETURNS (unicode): Comma-separated directory names, without cache and
        hidden directories, or "-" if there is no data path.
    """

    def exclude_dir(dir_name):
        # exclude common cache directories and hidden directories
        exclude = ("cache", "pycache", "__pycache__")
        return dir_name in exclude or dir_name.startswith(".")

    data_path = util.get_data_path()
    if data_path:
        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
        return ", ".join([m for m in models if not exclude_dir(m)])
    return "-"
|
||||
|
|
|
@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
|
|||
import tarfile
|
||||
import gzip
|
||||
import zipfile
|
||||
import ujson as json
|
||||
from spacy.lexeme import intify_attrs
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
from ..util import prints, ensure_path, get_lang_class
|
||||
from ..util import ensure_path, get_lang_class, read_jsonl
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
|
@ -25,121 +24,133 @@ except ImportError:
|
|||
ftfy = None
|
||||
|
||||
|
||||
msg = Printer()
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("Model language", "positional", None, str),
    output_dir=("Model output directory", "positional", None, Path),
    freqs_loc=("Location of words frequencies file", "option", "f", Path),
    jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
    clusters_loc=("Optional location of brown clusters data", "option", "c", str),
    vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
    prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
)
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    prune_vectors=-1,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.
    """
    if jsonl_loc is not None:
        # The JSONL file takes precedence; warn if the deprecated frequency
        # or cluster files were passed in as well
        if freqs_loc is not None or clusters_loc is not None:
            msg.warn(Messages.M063, Messages.M064)
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail(Messages.M037, freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs)
    msg.good("Successfully created model")
    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, prune_vectors)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    return nlp
|
||||
|
||||
|
||||
def open_file(loc):
|
||||
'''Handle .gz, .tar.gz or unzipped files'''
|
||||
"""Handle .gz, .tar.gz or unzipped files"""
|
||||
loc = ensure_path(loc)
|
||||
print("Open loc")
|
||||
if tarfile.is_tarfile(str(loc)):
|
||||
return tarfile.open(str(loc), 'r:gz')
|
||||
elif loc.parts[-1].endswith('gz'):
|
||||
return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
|
||||
elif loc.parts[-1].endswith('zip'):
|
||||
return tarfile.open(str(loc), "r:gz")
|
||||
elif loc.parts[-1].endswith("gz"):
|
||||
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
|
||||
elif loc.parts[-1].endswith("zip"):
|
||||
zip_file = zipfile.ZipFile(str(loc))
|
||||
names = zip_file.namelist()
|
||||
file_ = zip_file.open(names[0])
|
||||
return (line.decode('utf8') for line in file_)
|
||||
return (line.decode("utf8") for line in file_)
|
||||
else:
|
||||
return loc.open('r', encoding='utf8')
|
||||
return loc.open("r", encoding="utf8")
|
||||
|
||||
|
||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||
with msg.loading("Counting frequencies..."):
|
||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||
msg.good("Counted frequencies")
|
||||
with msg.loading("Reading clusters..."):
|
||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||
msg.good("Read clusters")
|
||||
lex_attrs = []
|
||||
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
||||
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
||||
attrs = {'orth': word, 'id': i, 'prob': prob}
|
||||
attrs = {"orth": word, "id": i, "prob": prob}
|
||||
# Decode as a little-endian string, so that we can do & 15 to get
|
||||
# the first 4 bits. See _parse_features.pyx
|
||||
if word in clusters:
|
||||
attrs['cluster'] = int(clusters[word][::-1], 2)
|
||||
attrs["cluster"] = int(clusters[word][::-1], 2)
|
||||
else:
|
||||
attrs['cluster'] = 0
|
||||
attrs["cluster"] = 0
|
||||
lex_attrs.append(attrs)
|
||||
return lex_attrs
|
||||
|
||||
|
||||
def create_model(lang, lex_attrs):
|
||||
print("Creating model...")
|
||||
lang_class = get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
for lexeme in nlp.vocab:
|
||||
lexeme.rank = 0
|
||||
lex_added = 0
|
||||
for attrs in lex_attrs:
|
||||
if 'settings' in attrs:
|
||||
if "settings" in attrs:
|
||||
continue
|
||||
lexeme = nlp.vocab[attrs['orth']]
|
||||
lexeme = nlp.vocab[attrs["orth"]]
|
||||
lexeme.set_attrs(**attrs)
|
||||
lexeme.is_oov = False
|
||||
lex_added += 1
|
||||
lex_added += 1
|
||||
oov_prob = min(lex.prob for lex in nlp.vocab)
|
||||
nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
|
||||
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
|
||||
return nlp
|
||||
|
||||
|
||||
def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||
for lex in nlp.vocab:
|
||||
if lex.rank:
|
||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
|
||||
if vectors_loc:
|
||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
for word in vector_keys:
|
||||
if word not in nlp.vocab:
|
||||
|
@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
|||
lexeme.is_oov = False
|
||||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
|
||||
nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
if prune_vectors >= 1:
|
||||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
||||
|
||||
def read_vectors(vectors_loc):
|
||||
print("Reading vectors from %s" % vectors_loc)
|
||||
f = open_file(vectors_loc)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
vectors_data = numpy.zeros(shape=shape, dtype='f')
|
||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||
vectors_keys = []
|
||||
for i, line in enumerate(tqdm(f)):
|
||||
line = line.rstrip()
|
||||
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
|
||||
pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
|
||||
word = pieces.pop(0)
|
||||
if len(pieces) != vectors_data.shape[1]:
|
||||
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
||||
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||
vectors_keys.append(word)
|
||||
return vectors_data, vectors_keys
|
||||
|
||||
|
||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||
print("Counting frequencies...")
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
with freqs_loc.open() as f:
|
||||
for i, line in enumerate(f):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
freq = int(freq)
|
||||
counts.inc(i + 1, freq)
|
||||
total += freq
|
||||
|
@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
probs = {}
|
||||
with freqs_loc.open() as f:
|
||||
for line in tqdm(f):
|
||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
||||
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||
doc_freq = int(doc_freq)
|
||||
freq = int(freq)
|
||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||
|
@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
|
||||
|
||||
def read_clusters(clusters_loc):
|
||||
print("Reading clusters...")
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
user_warning(Warnings.W004)
|
||||
|
@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
|
|||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
else:
|
||||
clusters[word] = '0'
|
||||
clusters[word] = "0"
|
||||
# Expand clusters with re-casing
|
||||
for word, cluster in list(clusters.items()):
|
||||
if word.lower() not in clusters:
|
||||
|
|
|
@ -3,51 +3,54 @@ from __future__ import unicode_literals
|
|||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import symlink_to, path2str
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool))
|
||||
force=("force overwriting of existing link", "flag", "f", bool),
|
||||
)
|
||||
def link(origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
msg = Printer()
|
||||
if util.is_package(origin):
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||
if not model_path.exists():
|
||||
prints(Messages.M009.format(path=path2str(model_path)),
|
||||
title=Messages.M008, exits=1)
|
||||
msg.fail(
|
||||
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
|
||||
)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
||||
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.is_symlink() and not force:
|
||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
||||
exits=1)
|
||||
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
|
||||
elif link_path.is_symlink(): # does a symlink exist?
|
||||
# NB: It's important to check for is_symlink here and not for exists,
|
||||
# because invalid/outdated symlinks would return False otherwise.
|
||||
link_path.unlink()
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
# NB: Check this last because valid symlinks also "exist".
|
||||
prints(Messages.M015, link_path,
|
||||
title=Messages.M014.format(name=link_name), exits=1)
|
||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
|
||||
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
try:
|
||||
symlink_to(link_path, model_path)
|
||||
except:
|
||||
except: # noqa: E722
|
||||
# This is quite dirty, but just making sure other errors are caught.
|
||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
||||
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
|
||||
msg.text(details)
|
||||
raise
|
||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
||||
msg.good(Messages.M018, details)
|
||||
msg.text(Messages.M019.format(name=link_name))
|
||||
|
|
|
@ -4,109 +4,106 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, get_raw_input
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, json_dumps
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_dir=("directory with model data", "positional", None, str),
|
||||
output_dir=("output parent directory", "positional", None, str),
|
||||
meta_path=("path to meta.json", "option", "m", str),
|
||||
create_meta=("create meta.json, even if one exists in directory – if "
|
||||
"existing meta is found, entries are shown as defaults in "
|
||||
"the command line prompt", "flag", "c", bool),
|
||||
force=("force overwriting of existing model directory in output directory",
|
||||
"flag", "f", bool))
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
||||
force=False):
|
||||
input_dir=("Directory with model data", "positional", None, str),
|
||||
output_dir=("Output parent directory", "positional", None, str),
|
||||
meta_path=("Path to meta.json", "option", "m", str),
|
||||
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||
)
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
output directory, and model data will be copied over. If --create-meta is
|
||||
set and a meta.json already exists in the output directory, the existing
|
||||
values will be used as the defaults in the command-line prompt.
|
||||
"""
|
||||
msg = Printer()
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not input_path or not input_path.exists():
|
||||
prints(input_path, title=Messages.M008, exits=1)
|
||||
msg.fail(Messages.M008, input_path, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
prints(output_path, title=Messages.M040, exits=1)
|
||||
msg.fail(Messages.M040, output_path, exits=1)
|
||||
if meta_path and not meta_path.exists():
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
msg.fail(Messages.M020, meta_path, exits=1)
|
||||
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
meta_path = meta_path or input_path / "meta.json"
|
||||
if meta_path.is_file():
|
||||
meta = util.read_json(meta_path)
|
||||
if not create_meta: # only print this if user doesn't want to overwrite
|
||||
prints(meta_path, title=Messages.M041)
|
||||
if not create_meta: # only print if user doesn't want to overwrite
|
||||
msg.good(Messages.M041, meta_path)
|
||||
else:
|
||||
meta = generate_meta(input_dir, meta)
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
model_name = meta['lang'] + '_' + meta['name']
|
||||
model_name_v = model_name + '-' + meta['version']
|
||||
meta = generate_meta(input_dir, meta, msg)
|
||||
for key in ("lang", "name", "version"):
|
||||
if key not in meta or meta[key] == "":
|
||||
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
|
||||
model_name = meta["lang"] + "_" + meta["name"]
|
||||
model_name_v = model_name + "-" + meta["version"]
|
||||
main_path = output_path / model_name_v
|
||||
package_path = main_path / model_name
|
||||
|
||||
create_dirs(package_path, force)
|
||||
shutil.copytree(path2str(input_path),
|
||||
path2str(package_path / model_name_v))
|
||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
||||
prints(main_path, Messages.M043,
|
||||
title=Messages.M042.format(name=model_name_v))
|
||||
|
||||
|
||||
def create_dirs(package_path, force):
|
||||
if package_path.exists():
|
||||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
else:
|
||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
||||
msg.fail(
|
||||
Messages.M044,
|
||||
Messages.M045.format(path=path2str(package_path)),
|
||||
exits=1,
|
||||
)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
create_file(main_path / "meta.json", json_dumps(meta))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
msg.good(Messages.M042.format(name=model_name_v), main_path)
|
||||
msg.text(Messages.M043)
|
||||
|
||||
|
||||
def create_file(file_path, contents):
|
||||
file_path.touch()
|
||||
file_path.open('w', encoding='utf-8').write(contents)
|
||||
file_path.open("w", encoding="utf-8").write(contents)
|
||||
|
||||
|
||||
def generate_meta(model_path, existing_meta):
|
||||
def generate_meta(model_path, existing_meta, msg):
|
||||
meta = existing_meta or {}
|
||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
||||
('name', 'Model name', meta.get('name', 'model')),
|
||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
||||
('spacy_version', 'Required spaCy version',
|
||||
'>=%s,<3.0.0' % about.__version__),
|
||||
('description', 'Model description',
|
||||
meta.get('description', False)),
|
||||
('author', 'Author', meta.get('author', False)),
|
||||
('email', 'Author email', meta.get('email', False)),
|
||||
('url', 'Author website', meta.get('url', False)),
|
||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
||||
settings = [
|
||||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta['pipeline'] = nlp.pipe_names
|
||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
'vectors': len(nlp.vocab.vectors),
|
||||
'keys': nlp.vocab.vectors.n_keys}
|
||||
prints(Messages.M047, title=Messages.M046)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
"keys": nlp.vocab.vectors.n_keys,
|
||||
}
|
||||
msg.divider(Messages.M046)
|
||||
msg.text(Messages.M047)
|
||||
for setting, desc, default in settings:
|
||||
response = util.get_raw_input(desc, default)
|
||||
meta[setting] = default if response == '' and default else response
|
||||
if about.__title__ != 'spacy':
|
||||
meta['parent_package'] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
def validate_meta(meta, keys):
|
||||
for key in keys:
|
||||
if key not in meta or meta[key] == '':
|
||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
||||
response = get_raw_input(desc, default)
|
||||
meta[setting] = default if response == "" and default else response
|
||||
if about.__title__ != "spacy":
|
||||
meta["parent_package"] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
|
|
|
@ -1,66 +1,148 @@
|
|||
'''This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
'''
|
||||
# coding: utf8
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import plac
|
||||
import random
|
||||
import numpy
|
||||
import time
|
||||
import ujson as json
|
||||
from pathlib import Path
|
||||
import ujson
|
||||
import sys
|
||||
from collections import Counter
|
||||
|
||||
import spacy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.attrs import ID, HEAD
|
||||
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
|
||||
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||
from pathlib import Path
|
||||
from thinc.v2v import Affine, Maxout
|
||||
from thinc.api import wrap
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.neural.util import prefer_gpu
|
||||
from wasabi import Printer
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from ..compat import json_dumps
|
||||
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||
from .. import util
|
||||
|
||||
|
||||
def prefer_gpu():
|
||||
used = spacy.util.use_gpu(0)
|
||||
if used is None:
|
||||
return False
|
||||
else:
|
||||
import cupy.random
|
||||
cupy.random.seed(0)
|
||||
return True
|
||||
@plac.annotations(
|
||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
depth=("Depth of CNN layers", "option", "cd", int),
|
||||
embed_rows=("Embedding rows", "option", "er", int),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout", "option", "d", float),
|
||||
seed=("Seed for random number generators", "option", "s", float),
|
||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
output_dir,
|
||||
width=96,
|
||||
depth=4,
|
||||
embed_rows=2000,
|
||||
use_vectors=False,
|
||||
dropout=0.2,
|
||||
nr_iter=1000,
|
||||
seed=0,
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
using an approximate language-modelling objective. Specifically, we load
|
||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||
files to the 'spacy train' command.
|
||||
|
||||
This technique may be especially helpful if you have little labelled data.
|
||||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
def load_texts(path):
|
||||
'''Load inputs from a jsonl file.
|
||||
|
||||
Each line should be a dict like {"text": "..."}
|
||||
'''
|
||||
path = ensure_path(path)
|
||||
with path.open('r', encoding='utf8') as file_:
|
||||
texts = [json.loads(line) for line in file_]
|
||||
random.shuffle(texts)
|
||||
return texts
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
"""
|
||||
config = dict(locals())
|
||||
msg = Printer()
|
||||
util.fix_random_seed(seed)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory")
|
||||
util.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
texts_loc = Path(texts_loc)
|
||||
if not texts_loc.exists():
|
||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||
with msg.loading("Loading input texts..."):
|
||||
texts = list(util.read_jsonl(texts_loc))
|
||||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
texts = stream_texts()
|
||||
|
||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good("Loaded model '{}'".format(vectors_model))
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
Tok2Vec(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||
cnn_maxout_pieces=2, # You can try setting this higher
|
||||
subword_features=True,
|
||||
),
|
||||
) # Set to False for character models, e.g. Chinese
|
||||
optimizer = create_default_optimizer(model.ops)
|
||||
tracker = ProgressTracker()
|
||||
msg.divider("Pre-training tok2vec layer")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
for epoch in range(nr_iter):
|
||||
for batch in util.minibatch_by_words(
|
||||
((text, None) for text in texts), size=5000
|
||||
):
|
||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
|
||||
file_.write(model.tok2vec.to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
"loss": tracker.loss,
|
||||
"epoch_loss": tracker.epoch_loss,
|
||||
"epoch": epoch,
|
||||
}
|
||||
with (output_dir / "log.jsonl").open("a") as file_:
|
||||
file_.write(json_dumps(log) + "\n")
|
||||
tracker.epoch_loss = 0.0
|
||||
if texts_loc != "-":
|
||||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
|
||||
|
||||
def stream_texts():
|
||||
for line in sys.stdin:
|
||||
yield json.loads(line)
|
||||
yield ujson.loads(line)
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.):
|
||||
def make_update(model, docs, optimizer, drop=0.0):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
|
@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
|
|||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
loss = float((gradients**2).mean())
|
||||
loss = float((gradients ** 2).mean())
|
||||
return loss
|
||||
|
||||
|
||||
|
@ -98,7 +180,7 @@ def make_docs(nlp, batch):
|
|||
|
||||
def get_vectors_loss(ops, docs, prediction):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
the prediction.
|
||||
|
||||
Note that this is ripe for customization! We could compute the vectors
|
||||
in some other way, e.g. with an LSTM language model, or use some other
|
||||
|
@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
|
|||
|
||||
|
||||
def create_pretraining_model(nlp, tok2vec):
|
||||
'''Define a network for the pretraining. We simply add an output layer onto
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
Each array in the output needs to have one row per token in the doc.
|
||||
'''
|
||||
"""
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
LN(Maxout(300, pieces=3)),
|
||||
zero_init(Affine(output_size, drop_factor=0.0))
|
||||
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
# the shape of the models' components exactly. So what we call
|
||||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, flatten)
|
||||
model = chain(
|
||||
tok2vec,
|
||||
output_layer
|
||||
)
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = masked_language_model(nlp.vocab, model)
|
||||
model.tok2vec = tok2vec
|
||||
model.output_layer = output_layer
|
||||
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
|
||||
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return model
|
||||
|
||||
|
||||
def masked_language_model(vocab, model, mask_prob=0.15):
|
||||
'''Convert a model into a BERT-style masked language model'''
|
||||
"""Convert a model into a BERT-style masked language model"""
|
||||
|
||||
random_words = RandomWords(vocab)
|
||||
def mlm_forward(docs, drop=0.):
|
||||
|
||||
def mlm_forward(docs, drop=0.0):
|
||||
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||
output, backprop = model.begin_update(docs, drop=drop)
|
||||
|
||||
def mlm_backward(d_output, sgd=None):
|
||||
d_output *= 1-mask
|
||||
d_output *= 1 - mask
|
||||
return backprop(d_output, sgd=sgd)
|
||||
|
||||
return output, mlm_backward
|
||||
|
@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
|||
|
||||
def apply_mask(docs, random_words, mask_prob=0.15):
|
||||
N = sum(len(doc) for doc in docs)
|
||||
mask = numpy.random.uniform(0., 1.0, (N,))
|
||||
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||
mask = mask >= mask_prob
|
||||
i = 0
|
||||
masked_docs = []
|
||||
|
@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
|
|||
return mask, masked_docs
|
||||
|
||||
|
||||
def replace_word(word, random_words, mask='[MASK]'):
|
||||
def replace_word(word, random_words, mask="[MASK]"):
|
||||
roll = random.random()
|
||||
if roll < 0.8:
|
||||
return mask
|
||||
|
@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
|
|||
else:
|
||||
return word
|
||||
|
||||
|
||||
class RandomWords(object):
|
||||
def __init__(self, vocab):
|
||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||
self.words = self.words[:10000]
|
||||
self.probs = self.probs[:10000]
|
||||
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
|
||||
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||
self.probs /= self.probs.sum()
|
||||
self._cache = []
|
||||
|
||||
def next(self):
|
||||
if not self._cache:
|
||||
self._cache.extend(numpy.random.choice(len(self.words), 10000,
|
||||
p=self.probs))
|
||||
self._cache.extend(
|
||||
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||
)
|
||||
index = self._cache.pop()
|
||||
return self.words[index]
|
||||
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
def __init__(self, frequency=1000000):
|
||||
|
@ -245,76 +326,3 @@ class ProgressTracker(object):
|
|||
return status
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
@plac.annotations(
    texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
    vectors_model=("Name or path to vectors model to learn from"),
    output_dir=("Directory to write models each epoch", "positional", None, str),
    width=("Width of CNN layers", "option", "cw", int),
    depth=("Depth of CNN layers", "option", "cd", int),
    embed_rows=("Embedding rows", "option", "er", int),
    use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
    dropout=("Dropout", "option", "d", float),
    # numpy.random.seed only accepts integer seeds, so parse the option as int.
    seed=("Seed for random number generators", "option", "s", int),
    nr_iter=("Number of iterations to pretrain", "option", "i", int),
)
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
             embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.

    texts_loc (unicode): Path to a JSONL file with texts, or '-' to stream
        texts instead of loading them from a file.
    vectors_model (unicode): Name or path of the model to load vectors from.
    output_dir (unicode): Directory receiving config.json, log.jsonl and one
        model<N>.bin file per epoch.
    width (int): Width of the CNN layers.
    depth (int): Depth of the CNN layers.
    embed_rows (int): Number of embedding rows.
    use_vectors (bool): Whether to use the static vectors as input features.
    dropout (float): Dropout rate passed to each update.
    nr_iter (int): Number of epochs to run.
    seed (int): Seed for the `random` and `numpy.random` generators.
    """
    # Snapshot the arguments first, while they are still plain JSON-friendly
    # values and no other locals exist.
    config = dict(locals())
    output_dir = ensure_path(output_dir)
    random.seed(seed)
    numpy.random.seed(seed)
    if not output_dir.exists():
        # parents=True so a nested output path does not fail on a missing
        # parent directory.
        output_dir.mkdir(parents=True)
    with (output_dir / 'config.json').open('w') as file_:
        file_.write(json.dumps(config))
    has_gpu = prefer_gpu()
    print("Use GPU?", has_gpu)
    nlp = spacy.load(vectors_model)
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=2,  # You can try setting this higher
            subword_features=True,  # Set to False for character models, e.g. Chinese
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker()
    print('Epoch', '#Words', 'Loss', 'w/s')
    texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
    for epoch in range(nr_iter):
        for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
            docs = make_docs(nlp, [text for (text, _) in batch])
            loss = make_update(model, docs, optimizer, drop=dropout)
            progress = tracker.update(epoch, loss, docs)
            if progress:
                print(*progress)
                # A stream has no natural end, so cut the epoch off once
                # 10**7 words have been seen.
                if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
                    break
        # Save the averaged parameters and append one log record per epoch.
        with model.use_params(optimizer.averages):
            with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
                file_.write(model.tok2vec.to_bytes())
            with (output_dir / 'log.jsonl').open('a') as file_:
                file_.write(json.dumps({'nr_word': tracker.nr_word,
                                        'loss': tracker.loss,
                                        'epoch_loss': tracker.epoch_loss,
                                        'epoch': epoch}) + '\n')
        tracker.epoch_loss = 0.0
        if texts_loc != '-':
            # Re-read the texts from file for the next epoch.
            texts = load_texts(texts_loc)
|
||||
|
|
|
@ -6,45 +6,64 @@ from pathlib import Path
|
|||
import ujson
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
import spacy
|
||||
import sys
|
||||
import tqdm
|
||||
import cytoolz
|
||||
import thinc.extra.datasets
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
def read_inputs(loc):
    """Yield the "text" value of every JSON line in a file or on stdin.

    This is a generator: nothing is read until it is iterated.

    loc (unicode): Path to a JSONL file, or None to read from sys.stdin.
    YIELDS: The value stored under the "text" key of each parsed line.
    """
    if loc is None:
        for line in sys.stdin:
            yield ujson.loads(line.encode("utf8"))["text"]
        return
    # The context manager closes the file once the generator is exhausted or
    # closed; the encoding is explicit so the result does not depend on the
    # platform default.
    with Path(loc).open("r", encoding="utf8") as file_:
        for line in file_:
            yield ujson.loads(line)["text"]
|
||||
from ..util import load_model
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model/language", "positional", None, str),
|
||||
inputs=("Location of input file", "positional", None, read_inputs))
|
||||
def profile(lang, inputs=None):
|
||||
model=("Model to load", "positional", None, str),
|
||||
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||
)
|
||||
def profile(model, inputs=None, n_texts=10000):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
Input should be formatted as one JSON object per line with a key "text".
|
||||
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
"""
|
||||
msg = Printer()
|
||||
if inputs is not None:
|
||||
inputs = _read_inputs(inputs, msg)
|
||||
if inputs is None:
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
inputs = inputs[:25000]
|
||||
nlp = spacy.load(lang)
|
||||
texts = list(cytoolz.take(10000, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
||||
"Profile.prof")
|
||||
n_inputs = 25000
|
||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading("Loading model '{}'...".format(model)):
|
||||
nlp = load_model(model)
|
||||
msg.good("Loaded model '{}'".format(model))
|
||||
texts = list(cytoolz.take(n_texts, inputs))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
msg.divider("Profile stats")
|
||||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
||||
def parse_texts(nlp, texts):
    """Push all texts through the pipeline and throw the results away.

    The texts are wrapped in a tqdm progress bar and processed with
    nlp.pipe in batches of 16. Nothing is returned.

    nlp: The loaded pipeline object; only its `pipe` method is used.
    texts (iterable): The texts to process.
    """
    processed = nlp.pipe(tqdm.tqdm(texts), batch_size=16)
    for _ in processed:
        pass
|
||||
|
||||
|
||||
def _read_inputs(loc, msg):
    """Yield the "text" value of every JSON line in a file or on stdin.

    This is a generator, so the path check and the status messages only run
    once iteration starts.

    loc (unicode): Path to a JSONL file, or "-" to read from sys.stdin.
    msg: Printer used for status output; `msg.fail` is called with exits=1
        if loc is not an existing file.
    YIELDS: The value stored under the "text" key of each parsed line.
    """
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        for line in sys.stdin:
            yield ujson.loads(line.encode("utf8"))["text"]
        return
    input_path = Path(loc)
    if not input_path.exists() or not input_path.is_file():
        msg.fail("Not a valid input data file", loc, exits=1)
    msg.info("Using data from {}".format(input_path.parts[-1]))
    # The context manager closes the file once the generator is exhausted or
    # closed; the encoding is explicit so the result does not depend on the
    # platform default.
    with input_path.open("r", encoding="utf8") as file_:
        for line in file_:
            yield ujson.loads(line)["text"]
|
||||
|
|
51
spacy/cli/schemas/__init__.py
Normal file
51
spacy/cli/schemas/__init__.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from jsonschema import Draft4Validator
|
||||
|
||||
from ...errors import Errors
|
||||
from ...util import read_json
|
||||
|
||||
|
||||
SCHEMAS = {}
|
||||
|
||||
|
||||
def get_schema(name):
    """Return the JSON schema registered under a name, loading it on first use.

    The schema is read from "<name>.json" next to this module, checked with
    Draft4Validator.check_schema and then cached in SCHEMAS, so later calls
    for the same name return the cached dict.

    EXAMPLE:
        >>> schema = get_schema('training')

    name (unicode): The name of the schema.
    RETURNS (dict): The JSON schema.
    RAISES (ValueError): If no matching .json file exists.
    """
    if name in SCHEMAS:
        return SCHEMAS[name]
    schema_file = Path(__file__).parent / "{}.json".format(name)
    if not schema_file.exists():
        raise ValueError(Errors.E104.format(name=name))
    loaded = read_json(schema_file)
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(loaded).check_schema(loaded)
    SCHEMAS[name] = loaded
    return loaded
|
||||
|
||||
|
||||
def validate_json(data, schema):
    """Check data against a JSON schema (see https://json-schema.org) and
    collect the problems found.

    Errors are sorted by their path. Each message is the validator's message,
    a space, and the path rendered as "[a -> b -> c]" (nothing after the
    space if the path is empty).

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, empty if none were found.
    """

    def _render(error):
        # Format one validation error together with its location.
        location = ""
        if error.path:
            location = "[{}]".format(" -> ".join(str(part) for part in error.path))
        return error.message + " " + location

    found = sorted(Draft4Validator(schema).iter_errors(data), key=lambda e: e.path)
    return [_render(error) for error in found]
|
128
spacy/cli/schemas/meta.json
Normal file
128
spacy/cli/schemas/meta.json
Normal file
|
@ -0,0 +1,128 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lang": {
|
||||
"title": "Two-letter language code, e.g. 'en'",
|
||||
"type": "string",
|
||||
"minLength": 2,
|
||||
"maxLength": 2,
|
||||
"pattern": "^[a-z]*$"
|
||||
},
|
||||
"name": {
|
||||
"title": "Model name",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[a-z_]*$"
|
||||
},
|
||||
"version": {
|
||||
"title": "Model version",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-]*$"
|
||||
},
|
||||
"spacy_version": {
|
||||
"title": "Compatible spaCy version identifier",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-><=]*$"
|
||||
},
|
||||
"parent_package": {
|
||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"default": "spacy"
|
||||
},
|
||||
"pipeline": {
|
||||
"title": "Names of pipeline components",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"description": {
|
||||
"title": "Model description",
|
||||
"type": "string"
|
||||
},
|
||||
"license": {
|
||||
"title": "Model license",
|
||||
"type": "string"
|
||||
},
|
||||
"author": {
|
||||
"title": "Model author name",
|
||||
"type": "string"
|
||||
},
|
||||
"email": {
|
||||
"title": "Model author email",
|
||||
"type": "string",
|
||||
"format": "email"
|
||||
},
|
||||
"url": {
|
||||
"title": "Model author URL",
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"sources": {
|
||||
"title": "Training data sources",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Included word vectors",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"keys": {
|
||||
"title": "Number of unique keys",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Number of unique vectors",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"width": {
|
||||
"title": "Number of dimensions",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"accuracy": {
|
||||
"title": "Accuracy numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
".*": {
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
}
|
||||
}
|
||||
},
|
||||
"speed": {
|
||||
"title": "Speed evaluation numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
".*": {
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0.0
|
||||
},
|
||||
{
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"lang",
|
||||
"name",
|
||||
"version"
|
||||
]
|
||||
}
|
146
spacy/cli/schemas/training.json
Normal file
146
spacy/cli/schemas/training.json
Normal file
|
@ -0,0 +1,146 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"title": "Training data for spaCy models",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"title": "The text of the training example",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"ents": {
|
||||
"title": "Named entity spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"label": {
|
||||
"title": "Entity label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Z0-9]*$"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end",
|
||||
"label"
|
||||
]
|
||||
}
|
||||
},
|
||||
"sents": {
|
||||
"title": "Sentence spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"cats": {
|
||||
"title": "Text categories for the text classifier",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
".*": {
|
||||
"title": "A text category",
|
||||
"oneOf": [
|
||||
{
|
||||
"type": "boolean"
|
||||
},
|
||||
{
|
||||
"type": "number",
|
||||
"minimum": 0
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"propertyNames": {
|
||||
"pattern": "^[A-Z0-9]*$",
|
||||
"minLength": 1
|
||||
}
|
||||
},
|
||||
"tokens": {
|
||||
"title": "The tokens in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"minProperties": 1,
|
||||
"properties": {
|
||||
"id": {
|
||||
"title": "Token ID, usually token index",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"start": {
|
||||
"title": "Start character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
},
|
||||
"pos": {
|
||||
"title": "Coarse-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"tag": {
|
||||
"title": "Fine-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"dep": {
|
||||
"title": "Dependency label",
|
||||
"type": "string",
|
||||
"minLength": 1
|
||||
},
|
||||
"head": {
|
||||
"title": "Index of the token's head",
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"start",
|
||||
"end"
|
||||
]
|
||||
}
|
||||
},
|
||||
"_": {
|
||||
"title": "Custom user space",
|
||||
"type": "object"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"text"
|
||||
]
|
||||
}
|
||||
}
|
|
@ -6,213 +6,296 @@ from pathlib import Path
|
|||
import tqdm
|
||||
from thinc.neural._classes.model import Model
|
||||
from timeit import default_timer as timer
|
||||
import json
|
||||
import shutil
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from .._ml import create_default_optimizer
|
||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints, minibatch, minibatch_by_words
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .. import displacy
|
||||
from ..compat import json_dumps
|
||||
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||
# at the beginning of training.
|
||||
dropout_rates = util.decaying(
|
||||
util.env_opt("dropout_from", 0.2),
|
||||
util.env_opt("dropout_to", 0.2),
|
||||
util.env_opt("dropout_decay", 0.0),
|
||||
)
|
||||
batch_sizes = util.compounding(
|
||||
util.env_opt("batch_from", 1000),
|
||||
util.env_opt("batch_to", 1000),
|
||||
util.env_opt("batch_compound", 1.001),
|
||||
)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("model language", "positional", None, str),
|
||||
output_dir=("output directory to store model in", "positional", None, str),
|
||||
train_data=("location of JSON-formatted training data", "positional",
|
||||
None, str),
|
||||
dev_data=("location of JSON-formatted development data (optional)",
|
||||
"positional", None, str),
|
||||
n_iter=("number of iterations", "option", "n", int),
|
||||
n_sents=("number of sentences", "option", "ns", int),
|
||||
lang=("Model language", "positional", None, str),
|
||||
output_path=("Output directory to store model in", "positional", None, Path),
|
||||
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||
base_model=("Name of model to update (optional)", "option", "b", str),
|
||||
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||
vectors=("Model to load vectors from", "option", "v", str),
|
||||
n_iter=("Number of iterations", "option", "n", int),
|
||||
n_examples=("Number of examples", "option", "ns", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
vectors=("Model to load vectors from", "option", "v"),
|
||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||
no_parser=("Don't train parser", "flag", "P", bool),
|
||||
no_entities=("Don't train NER", "flag", "N", bool),
|
||||
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
|
||||
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
|
||||
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
version=("Model version", "option", "V", str),
|
||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||
"overwritten.", "option", "m", Path),
|
||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
|
||||
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
||||
verbose=("Display more information for debug", "option", None, bool))
|
||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
|
||||
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
|
||||
no_parser=False, no_entities=False, gold_preproc=False,
|
||||
version="0.0.0", meta_path=None, verbose=False):
|
||||
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
||||
init_tok2vec=(
|
||||
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||
"option",
|
||||
"t2v",
|
||||
Path,
|
||||
),
|
||||
parser_multitasks=(
|
||||
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
|
||||
"option",
|
||||
"pt",
|
||||
str,
|
||||
),
|
||||
entity_multitasks=(
|
||||
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
|
||||
"option",
|
||||
"et",
|
||||
str,
|
||||
),
|
||||
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
||||
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||
)
|
||||
def train(
|
||||
lang,
|
||||
output_path,
|
||||
train_path,
|
||||
dev_path,
|
||||
base_model=None,
|
||||
pipeline="tagger,parser,ner",
|
||||
vectors=None,
|
||||
n_iter=30,
|
||||
n_examples=0,
|
||||
use_gpu=-1,
|
||||
version="0.0.0",
|
||||
meta_path=None,
|
||||
init_tok2vec=None,
|
||||
parser_multitasks="",
|
||||
entity_multitasks="",
|
||||
noise_level=0.0,
|
||||
gold_preproc=False,
|
||||
learn_tokens=False,
|
||||
verbose=False,
|
||||
debug=False,
|
||||
):
|
||||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||
JSON format. To convert data from other formats, use the `spacy convert`
|
||||
command.
|
||||
"""
|
||||
msg = Printer()
|
||||
util.fix_random_seed()
|
||||
util.set_env_log(True)
|
||||
n_sents = n_sents or None
|
||||
output_path = util.ensure_path(output_dir)
|
||||
train_path = util.ensure_path(train_data)
|
||||
dev_path = util.ensure_path(dev_data)
|
||||
util.set_env_log(verbose)
|
||||
|
||||
# Make sure all files and paths exists if they are needed
|
||||
train_path = util.ensure_path(train_path)
|
||||
dev_path = util.ensure_path(dev_path)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not train_path.exists():
|
||||
prints(train_path, title=Messages.M050, exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title=Messages.M051, exits=1)
|
||||
if not train_path or not train_path.exists():
|
||||
msg.fail(Messages.M050, train_path, exits=1)
|
||||
if not dev_path or not dev_path.exists():
|
||||
msg.fail(Messages.M051, dev_path, exits=1)
|
||||
if meta_path is not None and not meta_path.exists():
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
msg.fail(Messages.M020, meta_path, exits=1)
|
||||
meta = util.read_json(meta_path) if meta_path else {}
|
||||
if not isinstance(meta, dict):
|
||||
prints(Messages.M053.format(meta_type=type(meta)),
|
||||
title=Messages.M052, exits=1)
|
||||
meta.setdefault('lang', lang)
|
||||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||
msg.fail(Messages.M062, Messages.M065)
|
||||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
|
||||
print("Counting training words (limit=%s" % n_sents)
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||
n_train_words = corpus.count_train()
|
||||
print(n_train_words)
|
||||
pipeline = ['tagger', 'parser', 'ner']
|
||||
if no_tagger and 'tagger' in pipeline:
|
||||
pipeline.remove('tagger')
|
||||
if no_parser and 'parser' in pipeline:
|
||||
pipeline.remove('parser')
|
||||
if no_entities and 'ner' in pipeline:
|
||||
pipeline.remove('ner')
|
||||
# Set up the base model and pipeline. If a base model is specified, load
|
||||
# the model and make sure the pipeline matches the pipeline setting. If
|
||||
# training starts from a blank model, initialize the language class.
|
||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||
msg.text(Messages.M055.format(pipeline=pipeline))
|
||||
if base_model:
|
||||
msg.text(Messages.M056.format(model=base_model))
|
||||
nlp = util.load_model(base_model)
|
||||
if nlp.lang != lang:
|
||||
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
|
||||
nlp.disable_pipes(*other_pipes)
|
||||
for pipe in pipeline:
|
||||
if pipe not in nlp.pipe_names:
|
||||
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||
else:
|
||||
msg.text(Messages.M057.format(model=lang))
|
||||
lang_cls = util.get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
for pipe in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||
|
||||
if learn_tokens:
|
||||
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||
|
||||
# Take dropout and batch size as generators of values -- dropout
|
||||
# starts high and decays sharply, to force the optimizer to explore.
|
||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||
# at the beginning of training.
|
||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
|
||||
util.env_opt('dropout_to', 0.1),
|
||||
util.env_opt('dropout_decay', 0.0))
|
||||
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
|
||||
util.env_opt('batch_to', 750),
|
||||
util.env_opt('batch_compound', 1.001))
|
||||
dropout_rates = util.decaying(
|
||||
util.env_opt("dropout_from", 0.1),
|
||||
util.env_opt("dropout_to", 0.1),
|
||||
util.env_opt("dropout_decay", 0.0),
|
||||
)
|
||||
batch_sizes = util.compounding(
|
||||
util.env_opt("batch_from", 750),
|
||||
util.env_opt("batch_to", 750),
|
||||
util.env_opt("batch_compound", 1.001),
|
||||
)
|
||||
lang_class = util.get_lang_class(lang)
|
||||
nlp = lang_class()
|
||||
meta['pipeline'] = pipeline
|
||||
meta["pipeline"] = pipeline
|
||||
nlp.meta.update(meta)
|
||||
if vectors:
|
||||
print("Load vectors model", vectors)
|
||||
util.load_model(vectors, vocab=nlp.vocab)
|
||||
for lex in nlp.vocab:
|
||||
values = {}
|
||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||
# These attrs are expected to be set by data. Others should
|
||||
# be set by calling the language functions.
|
||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||
lex.set_attrs(**values)
|
||||
lex.is_oov = False
|
||||
for name in pipeline:
|
||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
||||
if parser_multitasks:
|
||||
for objective in parser_multitasks.split(','):
|
||||
nlp.parser.add_multitask_objective(objective)
|
||||
if entity_multitasks:
|
||||
for objective in entity_multitasks.split(','):
|
||||
nlp.entity.add_multitask_objective(objective)
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
if init_tok2vec is not None:
|
||||
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
print("Loaded pretrained tok2vec for:", loaded)
|
||||
msg.text(Messages.M058.format(model=vectors))
|
||||
_load_vectors(nlp, vectors)
|
||||
|
||||
# Multitask objectives
|
||||
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||
for pipe_name, multitasks in multitask_options:
|
||||
if multitasks:
|
||||
if pipe_name not in pipeline:
|
||||
msg.fail(Messages.M059.format(pipe=pipe_name))
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
for objective in multitasks.split(","):
|
||||
pipe.add_multitask_objective(objective)
|
||||
|
||||
# Prepare training corpus
|
||||
msg.text(Messages.M060.format(limit=n_examples))
|
||||
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
||||
n_train_words = corpus.count_train()
|
||||
|
||||
if base_model:
|
||||
# Start with an existing model, use default optimizer
|
||||
optimizer = create_default_optimizer(Model.ops)
|
||||
else:
|
||||
# Start with a blank model, call begin_training
|
||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||
nlp._optimizer = None
|
||||
|
||||
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
|
||||
# Load in pre-trained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text(Messages.M071.format(components=components))
|
||||
|
||||
print(
|
||||
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
|
||||
)
|
||||
try:
|
||||
for i in range(n_iter):
|
||||
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
|
||||
gold_preproc=gold_preproc, max_length=0)
|
||||
train_docs = corpus.train_docs(
|
||||
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
|
||||
)
|
||||
words_seen = 0
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
losses = {}
|
||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
||||
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
|
||||
if not batch:
|
||||
continue
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer,
|
||||
drop=next(dropout_rates), losses=losses)
|
||||
nlp.update(
|
||||
docs,
|
||||
golds,
|
||||
sgd=optimizer,
|
||||
drop=next(dropout_rates),
|
||||
losses=losses,
|
||||
)
|
||||
pbar.update(sum(len(doc) for doc in docs))
|
||||
words_seen += sum(len(doc) for doc in docs)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
util.set_env_log(False)
|
||||
epoch_model_path = output_path / ('model%d' % i)
|
||||
epoch_model_path = output_path / ("model%d" % i)
|
||||
nlp.to_disk(epoch_model_path)
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded,
|
||||
gold_preproc=gold_preproc))
|
||||
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
||||
scorer = nlp_loaded.evaluate(dev_docs, debug)
|
||||
end_time = timer()
|
||||
if use_gpu < 0:
|
||||
gpu_wps = None
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
cpu_wps = nwords / (end_time - start_time)
|
||||
else:
|
||||
gpu_wps = nwords/(end_time-start_time)
|
||||
with Model.use_device('cpu'):
|
||||
gpu_wps = nwords / (end_time - start_time)
|
||||
with Model.use_device("cpu"):
|
||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||
dev_docs = list(corpus.dev_docs(
|
||||
nlp_loaded, gold_preproc=gold_preproc))
|
||||
dev_docs = list(
|
||||
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
|
||||
)
|
||||
start_time = timer()
|
||||
scorer = nlp_loaded.evaluate(dev_docs)
|
||||
end_time = timer()
|
||||
cpu_wps = nwords/(end_time-start_time)
|
||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
||||
with acc_loc.open('w') as file_:
|
||||
file_.write(json_dumps(scorer.scores))
|
||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
||||
meta['accuracy'] = scorer.scores
|
||||
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
|
||||
'gpu': gpu_wps}
|
||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
'vectors': len(nlp.vocab.vectors),
|
||||
'keys': nlp.vocab.vectors.n_keys}
|
||||
meta['lang'] = nlp.lang
|
||||
meta['pipeline'] = pipeline
|
||||
meta['spacy_version'] = '>=%s' % about.__version__
|
||||
meta.setdefault('name', 'model%d' % i)
|
||||
meta.setdefault('version', version)
|
||||
cpu_wps = nwords / (end_time - start_time)
|
||||
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||
util.write_json(acc_loc, scorer.scores)
|
||||
|
||||
with meta_loc.open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
util.set_env_log(True)
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
||||
gpu_wps=gpu_wps)
|
||||
# Update model meta.json
|
||||
meta["lang"] = nlp.lang
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["spacy_version"] = ">=%s" % about.__version__
|
||||
meta["accuracy"] = scorer.scores
|
||||
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
"vectors": len(nlp.vocab.vectors),
|
||||
"keys": nlp.vocab.vectors.n_keys,
|
||||
}
|
||||
meta.setdefault("name", "model%d" % i)
|
||||
meta.setdefault("version", version)
|
||||
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||
util.write_json(meta_loc, meta)
|
||||
|
||||
util.set_env_log(verbose)
|
||||
|
||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||
finally:
|
||||
print("Saving model...")
|
||||
with nlp.use_params(optimizer.averages):
|
||||
final_model_path = output_path / 'model-final'
|
||||
nlp.to_disk(final_model_path)
|
||||
components = []
|
||||
if not no_parser:
|
||||
components.append('parser')
|
||||
if not no_tagger:
|
||||
components.append('tagger')
|
||||
if not no_entities:
|
||||
components.append('ner')
|
||||
_collate_best_model(meta, output_path, components)
|
||||
with msg.loading(Messages.M061):
|
||||
with nlp.use_params(optimizer.averages):
|
||||
final_model_path = output_path / "model-final"
|
||||
nlp.to_disk(final_model_path)
|
||||
msg.good(Messages.M066, util.path2str(final_model_path))
|
||||
|
||||
_collate_best_model(meta, output_path, nlp.pipe_names)
|
||||
|
||||
|
||||
def _load_vectors(nlp, vectors):
    """Load a vectors model into nlp.vocab and refresh the lexeme attributes.

    After loading, every lexeme in the vocab gets its attributes recomputed
    from the vocab's lex_attr_getters (except CLUSTER, PROB, IS_OOV and LANG)
    and is marked as not out-of-vocabulary.

    nlp: The pipeline object whose vocab receives the vectors.
    vectors (unicode): Name or path passed on to util.load_model.
    """
    util.load_model(vectors, vocab=nlp.vocab)
    # These attrs are expected to be set by data. Others should
    # be set by calling the language functions.
    data_attrs = (CLUSTER, PROB, IS_OOV, LANG)
    for lexeme in nlp.vocab:
        computed = {
            lexeme.vocab.strings[attr]: getter(lexeme.orth_)
            for attr, getter in nlp.vocab.lex_attr_getters.items()
            if attr not in data_attrs
        }
        lexeme.set_attrs(**computed)
        lexeme.is_oov = False
|
||||
|
||||
|
||||
def _load_pretrained_tok2vec(nlp, loc):
|
||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||
"""
|
||||
with loc.open('rb') as file_:
|
||||
with loc.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
|
|||
bests = {}
|
||||
for component in components:
|
||||
bests[component] = _find_best(output_path, component)
|
||||
best_dest = output_path / 'model-best'
|
||||
shutil.copytree(output_path / 'model-final', best_dest)
|
||||
best_dest = output_path / "model-best"
|
||||
shutil.copytree(output_path / "model-final", best_dest)
|
||||
for component, best_component_src in bests.items():
|
||||
shutil.rmtree(best_dest / component)
|
||||
shutil.copytree(best_component_src / component, best_dest / component)
|
||||
with (best_component_src / 'accuracy.json').open() as file_:
|
||||
accs = json.load(file_)
|
||||
accs = util.read_json(best_component_src / "accuracy.json")
|
||||
for metric in _get_metrics(component):
|
||||
meta['accuracy'][metric] = accs[metric]
|
||||
with (best_dest / 'meta.json').open('w') as file_:
|
||||
file_.write(json_dumps(meta))
|
||||
meta["accuracy"][metric] = accs[metric]
|
||||
util.write_json(best_dest / "meta.json", meta)
|
||||
|
||||
|
||||
def _find_best(experiment_dir, component):
|
||||
accuracies = []
|
||||
for epoch_model in experiment_dir.iterdir():
|
||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||
accs = json.load((epoch_model / "accuracy.json").open())
|
||||
accs = util.read_json(epoch_model / "accuracy.json")
|
||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||
accuracies.append((scores, epoch_model))
|
||||
if accuracies:
|
||||
|
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
|
|||
else:
|
||||
return None
|
||||
|
||||
|
||||
def _get_metrics(component):
|
||||
if component == "parser":
|
||||
return ("las", "uas", "token_acc")
|
||||
|
@ -257,50 +339,40 @@ def _get_metrics(component):
|
|||
return ("token_acc",)
|
||||
|
||||
|
||||
def _render_parses(i, to_render):
|
||||
to_render[0].user_data['title'] = "Batch %d" % i
|
||||
with Path('/tmp/entities.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
||||
file_.write(html)
|
||||
with Path('/tmp/parses.html').open('w') as file_:
|
||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
    """Print one fixed-width row of training progress to stdout.

    itn (int): Iteration number, printed in the first column.
    losses (dict): Losses keyed by component name ("parser", "ner", "tagger");
        missing entries are shown as 0.0.
    dev_scores (dict): Evaluation scores; these override the 0.0 defaults and
        the loss columns if the keys collide.
    cpu_wps (float): Words-per-second on CPU.
    gpu_wps (float): Words-per-second on GPU; any falsy value prints as 0.0.
    """
    defaults = (
        "dep_loss",
        "tag_loss",
        "uas",
        "tags_acc",
        "token_acc",
        "ents_p",
        "ents_r",
        "ents_f",
        "cpu_wps",
        "gpu_wps",
    )
    row = dict.fromkeys(defaults, 0.0)
    row["dep_loss"] = losses.get("parser", 0.0)
    row["ner_loss"] = losses.get("ner", 0.0)
    row["tag_loss"] = losses.get("tagger", 0.0)
    # Dev scores are applied after the losses but before the speed columns,
    # so the speed values passed as arguments always win.
    row.update(dev_scores)
    row["cpu_wps"] = cpu_wps
    row["gpu_wps"] = gpu_wps or 0.0
    template = (
        "{:<6d}"
        "{dep_loss:<10.3f}"
        "{ner_loss:<10.3f}"
        "{uas:<8.3f}"
        "{ents_p:<8.3f}"
        "{ents_r:<8.3f}"
        "{ents_f:<8.3f}"
        "{tags_acc:<8.3f}"
        "{token_acc:<9.3f}"
        "{cpu_wps:<9.1f}"
        "{gpu_wps:.1f}"
    )
    print(template.format(itn, **row))
|
||||
|
||||
|
||||
def print_results(scorer):
|
||||
results = {
|
||||
'TOK': '%.2f' % scorer.token_acc,
|
||||
'POS': '%.2f' % scorer.tags_acc,
|
||||
'UAS': '%.2f' % scorer.uas,
|
||||
'LAS': '%.2f' % scorer.las,
|
||||
'NER P': '%.2f' % scorer.ents_p,
|
||||
'NER R': '%.2f' % scorer.ents_r,
|
||||
'NER F': '%.2f' % scorer.ents_f}
|
||||
util.print_table(results, title="Results")
|
||||
|
|
2
spacy/cli/ud/__init__.py
Normal file
2
spacy/cli/ud/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
|||
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
||||
from .ud_train import main as ud_train # noqa: F401
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
# flake8: noqa
|
||||
|
||||
# CoNLL 2017 UD Parsing evaluation script.
|
||||
#
|
||||
|
@ -214,7 +215,7 @@ def load_conllu(file):
|
|||
start, end = map(int, columns[ID].split("-"))
|
||||
except:
|
||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||
|
||||
|
||||
for _ in range(start, end + 1):
|
||||
word_line = file.readline().rstrip("\r\n")
|
||||
word_columns = word_line.split("\t")
|
|
@ -1,7 +1,9 @@
|
|||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
# flake8: noqa
|
||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
.conllu format for development data, allowing the official scorer to be used.
|
||||
'''
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -11,15 +13,17 @@ import json
|
|||
|
||||
import spacy
|
||||
import spacy.util
|
||||
from ..tokens import Token, Doc
|
||||
from ..gold import GoldParse
|
||||
from ..util import compounding, minibatch_by_words
|
||||
from ..syntax.nonproj import projectivize
|
||||
from ..matcher import Matcher
|
||||
#from ..morphology import Fused_begin, Fused_inside
|
||||
from .. import displacy
|
||||
from ...tokens import Token, Doc
|
||||
from ...gold import GoldParse
|
||||
from ...util import compounding, minibatch_by_words
|
||||
from ...syntax.nonproj import projectivize
|
||||
from ...matcher import Matcher
|
||||
|
||||
# from ...morphology import Fused_begin, Fused_inside
|
||||
from ... import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
Fused_begin = None
|
||||
Fused_inside = None
|
||||
|
||||
|
@ -30,43 +34,45 @@ import cytoolz
|
|||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from .. import lang
|
||||
from .. import lang
|
||||
from ..lang import zh
|
||||
from ..lang import ja
|
||||
from ..lang import ru
|
||||
from ... import lang
|
||||
from ...lang import zh
|
||||
from ...lang import ja
|
||||
from ...lang import ru
|
||||
|
||||
|
||||
################
|
||||
# Data reading #
|
||||
################
|
||||
|
||||
# Raw string: "\s" is not a valid string escape, so the non-raw spelling only
# worked by accident and triggers an invalid-escape warning on newer Pythons.
space_re = re.compile(r"\s+")


def split_text(text):
    """Split raw text into paragraphs with normalised whitespace.

    text: The full text; paragraphs are separated by a blank line (two
        consecutive newline characters).
    RETURNS (list): One string per paragraph, stripped, with every run of
        whitespace collapsed into a single space.
    """
    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||
|
||||
|
||||
##############
|
||||
# Evaluation #
|
||||
##############
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
docs = []
|
||||
sent = []
|
||||
doc = []
|
||||
for line in file_:
|
||||
if line.startswith('# newdoc'):
|
||||
if line.startswith("# newdoc"):
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
doc = []
|
||||
elif line.startswith('#'):
|
||||
elif line.startswith("#"):
|
||||
continue
|
||||
elif not line.strip():
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
sent = []
|
||||
else:
|
||||
sent.append(list(line.strip().split('\t')))
|
||||
sent.append(list(line.strip().split("\t")))
|
||||
if len(sent[-1]) != 10:
|
||||
print(repr(line))
|
||||
raise ValueError
|
||||
|
@ -78,7 +84,7 @@ def read_conllu(file_):
|
|||
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
if text_loc.parts[-1].endswith('.conllu'):
|
||||
if text_loc.parts[-1].endswith(".conllu"):
|
||||
docs = []
|
||||
with text_loc.open() as file_:
|
||||
for conllu_doc in read_conllu(file_):
|
||||
|
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
for name, component in nlp.pipeline:
|
||||
docs = list(component.pipe(docs))
|
||||
else:
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
with text_loc.open("r", encoding="utf8") as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
||||
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||
return docs, scores
|
||||
|
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
|
||||
def write_conllu(docs, file_):
|
||||
merger = Matcher(docs[0].vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||
for i, doc in enumerate(docs):
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||
for word in doc:
|
||||
if word.i == word.head.i:
|
||||
word.dep_ = 'ROOT'
|
||||
word.dep_ = "ROOT"
|
||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||
for j, sent in enumerate(doc.sents):
|
||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||
file_.write("# text = {text}\n".format(text=sent.text))
|
||||
for k, token in enumerate(sent):
|
||||
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
|
||||
file_.write('\n')
|
||||
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
|
||||
file_.write("\n")
|
||||
for word in sent:
|
||||
if word.head.i == word.i and word.dep_ == 'ROOT':
|
||||
if word.head.i == word.i and word.dep_ == "ROOT":
|
||||
break
|
||||
else:
|
||||
print("Rootless sentence!")
|
||||
|
@ -134,24 +140,34 @@ def write_conllu(docs, file_):
|
|||
|
||||
|
||||
def _get_token_conllu(token, k, sent_len):
|
||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
||||
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||
n = 1
|
||||
text = [token.text]
|
||||
while token.nbor(n).check_morph(Fused_inside):
|
||||
text.append(token.nbor(n).text)
|
||||
n += 1
|
||||
id_ = '%d-%d' % (k+1, (k+n))
|
||||
fields = [id_, ''.join(text)] + ['_'] * 8
|
||||
lines = ['\t'.join(fields)]
|
||||
id_ = "%d-%d" % (k + 1, (k + n))
|
||||
fields = [id_, "".join(text)] + ["_"] * 8
|
||||
lines = ["\t".join(fields)]
|
||||
else:
|
||||
lines = []
|
||||
if token.head.i == token.i:
|
||||
head = 0
|
||||
else:
|
||||
head = k + (token.head.i - token.i) + 1
|
||||
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
||||
str(head), token.dep_.lower(), '_', '_']
|
||||
if token.check_morph(Fused_begin) and (k+1 < sent_len):
|
||||
fields = [
|
||||
str(k + 1),
|
||||
token.text,
|
||||
token.lemma_,
|
||||
token.pos_,
|
||||
token.tag_,
|
||||
"_",
|
||||
str(head),
|
||||
token.dep_.lower(),
|
||||
"_",
|
||||
"_",
|
||||
]
|
||||
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
|
||||
if k == 0:
|
||||
fields[1] = token.norm_[0].upper() + token.norm_[1:]
|
||||
else:
|
||||
|
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
|
|||
split_end = token._.split_end
|
||||
split_len = (split_end.i - split_start.i) + 1
|
||||
n_in_split = token.i - split_start.i
|
||||
subtokens = guess_fused_orths(split_start.text, [''] * split_len)
|
||||
subtokens = guess_fused_orths(split_start.text, [""] * split_len)
|
||||
fields[1] = subtokens[n_in_split]
|
||||
|
||||
lines.append('\t'.join(fields))
|
||||
return '\n'.join(lines)
|
||||
lines.append("\t".join(fields))
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def guess_fused_orths(word, ud_forms):
|
||||
'''The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||
"""The UD data 'fused tokens' don't necessarily expand to keys that match
|
||||
the form. We need orths that exact match the string. Here we make a best
|
||||
effort to divide up the word.'''
|
||||
if word == ''.join(ud_forms):
|
||||
effort to divide up the word."""
|
||||
if word == "".join(ud_forms):
|
||||
# Happy case: we get a perfect split, with each letter accounted for.
|
||||
return ud_forms
|
||||
elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
|
||||
|
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
|
|||
remain = word
|
||||
for subtoken in ud_forms:
|
||||
assert len(subtoken) >= 1
|
||||
output.append(remain[:len(subtoken)])
|
||||
remain = remain[len(subtoken):]
|
||||
output.append(remain[: len(subtoken)])
|
||||
remain = remain[len(subtoken) :]
|
||||
assert len(remain) == 0, (word, ud_forms, remain)
|
||||
return output
|
||||
else:
|
||||
# Let's say word is 6 long, and there are three subtokens. The orths
|
||||
# *must* equal the original string. Arbitrarily, split [4, 1, 1]
|
||||
first = word[:len(word)-(len(ud_forms)-1)]
|
||||
first = word[: len(word) - (len(ud_forms) - 1)]
|
||||
output = [first]
|
||||
remain = word[len(first):]
|
||||
remain = word[len(first) :]
|
||||
for i in range(1, len(ud_forms)):
|
||||
assert remain
|
||||
output.append(remain[:1])
|
||||
|
@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):
|
|||
return output
|
||||
|
||||
|
||||
|
||||
def print_results(name, ud_scores):
    """Print one tab-separated row of UD scores and return the values.

    name: Row label, printed verbatim in the first column.
    ud_scores: Mapping with "Words", "Sentences", "XPOS", "UAS" and "LAS"
        entries, each exposing an `f1` attribute, or None to report zeros.
    RETURNS (dict): The printed values keyed by "words", "sents", "tags",
        "uas" and "las" (F1 scaled by 100).
    """
    if ud_scores is None:
        fields = {"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0}
    else:
        fields = {
            "words": ud_scores["Words"].f1 * 100,
            "sents": ud_scores["Sentences"].f1 * 100,
            "tags": ud_scores["XPOS"].f1 * 100,
            "uas": ud_scores["UAS"].f1 * 100,
            "las": ud_scores["LAS"].f1 * 100,
        }
    # Only the score columns go through str.format. The label is joined on
    # afterwards so that braces in `name` are never parsed as replacement
    # fields.
    score_tpl = "\t".join(
        ("{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
    )
    print("\t".join((name, score_tpl.format(**fields))))
    return fields
|
||||
|
||||
|
||||
def get_token_split_start(token):
    """Getter returning the first token of the split group `token` is in.

    A group is a token with non-empty text followed by one or more tokens
    whose text is the empty string.

    token: Object exposing `text`, `i`, `doc` and `nbor(offset)`.
    RETURNS: The group's leading token, or None if `token` is not in a group.
    """
    if token.text != "":
        # A non-empty token starts a group only if the next token is empty.
        has_next = (token.i + 1) < len(token.doc)
        if has_next and token.nbor(1).text == "":
            return token
        return None
    # An empty token must have a non-empty predecessor somewhere to its left.
    assert token.i != 0
    offset = -1
    while token.nbor(offset).text == "":
        offset -= 1
    return token.nbor(offset)
|
||||
|
||||
|
||||
def get_token_split_end(token):
    """Getter returning the last token of the split group `token` is in.

    A group is a token with non-empty text followed by one or more tokens
    whose text is the empty string.

    token: Object exposing `text`, `i`, `doc` and `nbor(offset)`.
    RETURNS: The group's final (empty-text) token, or None if `token` is not
        in a group.
    """
    n_tokens = len(token.doc)
    if (token.i + 1) == n_tokens:
        # Last token of the doc: it ends a group only if it is itself empty.
        if token.text == "":
            return token
        return None
    if token.text != "" and token.nbor(1).text != "":
        return None
    # Walk right over the run of empty tokens; the last one seen is the end.
    step = 1
    while (token.i + step) < n_tokens and token.nbor(step).text == "":
        step += 1
    return token.nbor(step - 1)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
|
@ -262,54 +268,73 @@ def get_token_split_end(token):
|
|||
|
||||
|
||||
def load_nlp(experiments_dir, corpus):
    """Load the model stored at `<experiments_dir>/<corpus>/best-model`.

    experiments_dir: Path-like parent directory (must support the `/`
        operator).
    corpus: Name of the corpus sub-directory.
    RETURNS: Whatever `spacy.load` returns for that path.
    """
    model_path = experiments_dir / corpus / "best-model"
    return spacy.load(model_path)
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
    """Add a freshly created "parser" component to `nlp` and return it.

    The `docs`, `golds`, `config` and `device` arguments are accepted but not
    used by this implementation.
    """
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser)
    return nlp
|
||||
|
||||
|
||||
@plac.annotations(
    test_data_dir=(
        "Path to Universal Dependencies test data",
        "positional",
        None,
        Path,
    ),
    experiment_dir=("Parent directory with output model", "positional", None, Path),
    corpus=(
        "UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
        "positional",
        None,
        str,
    ),
)
def main(test_data_dir, experiment_dir, corpus):
    """Evaluate a trained model on the CoNLL 2017 test and dev sections.

    For each section, the model is run on the UDPipe-preprocessed input and
    on the raw text. A score row is printed for each, and the parsed output
    and the accuracy figures are written under `<experiment_dir>/<corpus>/`.
    Both input types write to the same per-section files, so the later run
    ("raw") overwrites the earlier one.
    """
    Token.set_extension("split_start", getter=get_token_split_start)
    Token.set_extension("split_end", getter=get_token_split_end)
    for flag_name in ("begins_fused", "inside_fused"):
        Token.set_extension(flag_name, default=False)
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
    lang.ru.Russian.Defaults.use_pymorphy2 = False

    nlp = load_nlp(experiment_dir, corpus)
    treebank_code = nlp.meta["treebank"]

    section_dirs = {
        "test": "conll17-ud-test-2017-05-09",
        "dev": "conll17-ud-development-2017-03-19",
    }
    corpus_dir = experiment_dir / corpus
    for section in ("test", "dev"):
        section_dir = section_dirs[section]
        input_dir = test_data_dir / "input" / section_dir
        text_path = input_dir / (treebank_code + ".txt")
        udpipe_path = input_dir / (treebank_code + "-udpipe.conllu")
        gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")

        print("\t".join([section, "LAS", "UAS", "TAG", "SENT", "WORD"]))
        inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
        for input_type in ("udp", "raw"):
            output_path = corpus_dir / "{section}.conllu".format(section=section)
            _docs, test_scores = evaluate(
                nlp, inputs[input_type], gold_path, output_path
            )
            accuracy = print_results(input_type, test_scores)
            acc_path = corpus_dir / "{section}-accuracy.json".format(section=section)
            with open(acc_path, "w") as file_:
                file_.write(json.dumps(accuracy, indent=2))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -1,7 +1,9 @@
|
|||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
# flake8: noqa
|
||||
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||
.conllu format for development data, allowing the official scorer to be used.
|
||||
'''
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -11,12 +13,12 @@ import json
|
|||
|
||||
import spacy
|
||||
import spacy.util
|
||||
from ..tokens import Token, Doc
|
||||
from ..gold import GoldParse
|
||||
from ..util import compounding, minibatch, minibatch_by_words
|
||||
from ..syntax.nonproj import projectivize
|
||||
from ..matcher import Matcher
|
||||
from .. import displacy
|
||||
from ...tokens import Token, Doc
|
||||
from ...gold import GoldParse
|
||||
from ...util import compounding, minibatch, minibatch_by_words
|
||||
from ...syntax.nonproj import projectivize
|
||||
from ...matcher import Matcher
|
||||
from ... import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
|
@ -27,10 +29,9 @@ import cytoolz
|
|||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from .. import lang
|
||||
from .. import lang
|
||||
from ..lang import zh
|
||||
from ..lang import ja
|
||||
from ... import lang
|
||||
from ...lang import zh
|
||||
from ...lang import ja
|
||||
|
||||
try:
|
||||
import torch
|
||||
|
@ -42,17 +43,26 @@ except ImportError:
|
|||
# Data reading #
|
||||
################
|
||||
|
||||
space_re = re.compile('\s+')
|
||||
def split_text(text):
|
||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
||||
|
||||
space_re = re.compile("\s+")
|
||||
|
||||
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
||||
max_doc_length=None, limit=None):
|
||||
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
|
||||
def split_text(text):
|
||||
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||
|
||||
|
||||
def read_data(
|
||||
nlp,
|
||||
conllu_file,
|
||||
text_file,
|
||||
raw_text=True,
|
||||
oracle_segments=False,
|
||||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True.'''
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
if not raw_text and not oracle_segments:
|
||||
raise ValueError("At least one of raw_text or oracle_segments must be True")
|
||||
paragraphs = split_text(text_file.read())
|
||||
|
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
|
|||
for cs in cd:
|
||||
sent = defaultdict(list)
|
||||
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
|
||||
if '.' in id_:
|
||||
if "." in id_:
|
||||
continue
|
||||
if '-' in id_:
|
||||
if "-" in id_:
|
||||
continue
|
||||
id_ = int(id_)-1
|
||||
head = int(head)-1 if head != '0' else id_
|
||||
sent['words'].append(word)
|
||||
sent['tags'].append(tag)
|
||||
sent['heads'].append(head)
|
||||
sent['deps'].append('ROOT' if dep == 'root' else dep)
|
||||
sent['spaces'].append(space_after == '_')
|
||||
sent['entities'] = ['-'] * len(sent['words'])
|
||||
sent['heads'], sent['deps'] = projectivize(sent['heads'],
|
||||
sent['deps'])
|
||||
id_ = int(id_) - 1
|
||||
head = int(head) - 1 if head != "0" else id_
|
||||
sent["words"].append(word)
|
||||
sent["tags"].append(tag)
|
||||
sent["heads"].append(head)
|
||||
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||
sent["spaces"].append(space_after == "_")
|
||||
sent["entities"] = ["-"] * len(sent["words"])
|
||||
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||
if oracle_segments:
|
||||
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
|
||||
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||
golds.append(GoldParse(docs[-1], **sent))
|
||||
|
||||
sent_annots.append(sent)
|
||||
|
@ -107,18 +116,18 @@ def read_conllu(file_):
|
|||
sent = []
|
||||
doc = []
|
||||
for line in file_:
|
||||
if line.startswith('# newdoc'):
|
||||
if line.startswith("# newdoc"):
|
||||
if doc:
|
||||
docs.append(doc)
|
||||
doc = []
|
||||
elif line.startswith('#'):
|
||||
elif line.startswith("#"):
|
||||
continue
|
||||
elif not line.strip():
|
||||
if sent:
|
||||
doc.append(sent)
|
||||
sent = []
|
||||
else:
|
||||
sent.append(list(line.strip().split('\t')))
|
||||
sent.append(list(line.strip().split("\t")))
|
||||
if len(sent[-1]) != 10:
|
||||
print(repr(line))
|
||||
raise ValueError
|
||||
|
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
flat = defaultdict(list)
|
||||
sent_starts = []
|
||||
for sent in sent_annots:
|
||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
||||
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||
flat[field].extend(sent[field])
|
||||
sent_starts.append(True)
|
||||
sent_starts.extend([False] * (len(sent['words'])-1))
|
||||
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||
# Construct text if necessary
|
||||
assert len(flat['words']) == len(flat['spaces'])
|
||||
assert len(flat["words"]) == len(flat["spaces"])
|
||||
if text is None:
|
||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
||||
text = "".join(
|
||||
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||
)
|
||||
doc = nlp.make_doc(text)
|
||||
flat.pop('spaces')
|
||||
flat.pop("spaces")
|
||||
gold = GoldParse(doc, **flat)
|
||||
gold.sent_starts = sent_starts
|
||||
for i in range(len(gold.heads)):
|
||||
|
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
|
||||
return doc, gold
|
||||
|
||||
|
||||
#############################
|
||||
# Data transforms for spaCy #
|
||||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects.'''
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
|
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
|
|||
# Evaluation #
|
||||
##############
|
||||
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
if text_loc.parts[-1].endswith('.conllu'):
|
||||
if text_loc.parts[-1].endswith(".conllu"):
|
||||
docs = []
|
||||
with text_loc.open() as file_:
|
||||
for conllu_doc in read_conllu(file_):
|
||||
|
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
for name, component in nlp.pipeline:
|
||||
docs = list(component.pipe(docs))
|
||||
else:
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
with text_loc.open("r", encoding="utf8") as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
||||
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||
return docs, scores
|
||||
|
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
|||
|
||||
def write_conllu(docs, file_):
|
||||
merger = Matcher(docs[0].vocab)
|
||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
||||
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||
for i, doc in enumerate(docs):
|
||||
matches = merger(doc)
|
||||
spans = [doc[start:end+1] for _, start, end in matches]
|
||||
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||
for start_char, end_char in offsets:
|
||||
doc.merge(start_char, end_char)
|
||||
|
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
|
|||
file_.write("# text = {text}\n".format(text=sent.text))
|
||||
for k, token in enumerate(sent):
|
||||
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
||||
for word in doc[sent[0].i-10 : sent[0].i]:
|
||||
for word in doc[sent[0].i - 10 : sent[0].i]:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
for word in sent:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
for word in doc[sent[-1].i : sent[-1].i+10]:
|
||||
for word in doc[sent[-1].i : sent[-1].i + 10]:
|
||||
print(word.i, word.head.i, word.text, word.dep_)
|
||||
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
|
||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
||||
file_.write('\n')
|
||||
raise ValueError(
|
||||
"Invalid parse: head outside sentence (%s)" % token.text
|
||||
)
|
||||
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||
file_.write("\n")
|
||||
|
||||
|
||||
def print_progress(itn, losses, ud_scores):
    """Print one tab-separated row of training progress to stdout.

    itn (int): Epoch number; the column header is printed first when it is 0.
    losses (dict): Losses keyed by component name; "parser" feeds the Loss
        column and defaults to 0.0 when missing.
    ud_scores: Mapping with "Words", "Sentences", "XPOS", "UAS" and "LAS"
        entries, each exposing an `f1` attribute (printed scaled by 100).
    """
    fields = {
        "dep_loss": losses.get("parser", 0.0),
        "tag_loss": losses.get("tagger", 0.0),
    }
    score_sources = (
        ("words", "Words"),
        ("sents", "Sentences"),
        ("tags", "XPOS"),
        ("uas", "UAS"),
        ("las", "LAS"),
    )
    for field_name, score_key in score_sources:
        fields[field_name] = ud_scores[score_key].f1 * 100
    if itn == 0:
        print("\t".join(["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]))
    columns = (
        "{:d}",
        "{dep_loss:.1f}",
        "{las:.1f}",
        "{uas:.1f}",
        "{tags:.1f}",
        "{sents:.1f}",
        "{words:.1f}",
    )
    print("\t".join(columns).format(itn, **fields))
|
||||
|
||||
#def get_sent_conllu(sent, sent_id):
|
||||
|
||||
# def get_sent_conllu(sent, sent_id):
|
||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||
|
||||
|
||||
def get_token_conllu(token, i):
    """Render a token as CoNLL-U text.

    token (Token): The token to render. Reads the `begins_fused` and
        `inside_fused` custom attributes from `token._`.
    i (int): Zero-based position of the token; the ID column is `i + 1`.
    RETURNS (unicode): One tab-separated CoNLL-U row for the token,
        preceded by a range row if the token begins a fused
        (multi-word) token. Rows are joined with newlines.
    """
    lines = []
    if token._.begins_fused:
        # Count the following tokens that belong to the same fused token.
        n = 1
        while token.nbor(n)._.inside_fused:
            n += 1
        # NOTE(review): the range starts at `i` while the token's own ID is
        # `i + 1` -- this looks off by one, confirm against the caller.
        id_ = "%d-%d" % (i, i + n)
        fused_fields = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
        # The range row must be a single tab-separated line. Storing the
        # fields directly in `lines` would emit each field on its own line.
        lines.append("\t".join(fused_fields))
    if token.head.i == token.i:
        # A token that is its own head is the root, written as head 0.
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    fields = [
        str(i + 1),
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        "_",
        str(head),
        token.dep_.lower(),
        "_",
        "_",
    ]
    lines.append("\t".join(fields))
    return "\n".join(lines)
|
||||
|
||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
|
||||
##################
|
||||
|
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
|
|||
|
||||
|
||||
def load_nlp(corpus, config, vectors=None):
    """Create a blank pipeline for a corpus, optionally loading a vocab.

    corpus (unicode): Corpus name. The part before the first underscore is
        used as the language code.
    config: Object with a `vectors` attribute saying whether vectors are
        expected.
    vectors: Directory containing one sub-directory per corpus. Required
        if `config.vectors` is truthy.
    RETURNS: The pipeline, with `meta["treebank"]` set to `corpus`.
    RAISES (ValueError): If `config.vectors` is set but `vectors` is not.
    """
    nlp = spacy.blank(corpus.split("_")[0])
    if config.vectors:
        if not vectors:
            raise ValueError(
                "config asks for vectors, but no vectors "
                "directory set on command line (use -v)"
            )
        corpus_dir = Path(vectors) / corpus
        # A missing corpus directory is not an error: the vocab stays blank.
        if corpus_dir.exists():
            nlp.vocab.from_disk(corpus_dir / "vocab")
    nlp.meta["treebank"] = corpus
    return nlp
|
||||
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
    """Add a tagger and parser to the pipeline and begin training.

    nlp: The pipeline to extend.
    docs (list): Training documents.
    golds (list): Gold annotations; their `tags` supply the tagger labels.
    config: Settings object (multitask flags, `subword_features`,
        `conv_depth`, `bilstm_depth`, `pretrained_tok2vec`).
    device (int): Device ID passed to `begin_training`; -1 skips the torch
        CUDA default.
    RETURNS: The optimizer returned by `nlp.begin_training`.
    """
    for component_name in ("tagger", "parser"):
        nlp.add_pipe(nlp.create_pipe(component_name))
    if config.multitask_tag:
        nlp.parser.add_multitask_objective("tag")
    if config.multitask_sent:
        nlp.parser.add_multitask_objective("sent_start")
    # Register every tag in the gold data, skipping missing (None) values.
    gold_tags = (tag for gold in golds for tag in gold.tags if tag is not None)
    for tag in gold_tags:
        nlp.tagger.add_label(tag)
    if torch is not None and device != -1:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")

    def get_gold_tuples():
        return golds_to_gold_tuples(docs, golds)

    training_settings = {
        "device": device,
        "subword_features": config.subword_features,
        "conv_depth": config.conv_depth,
        "bilstm_depth": config.bilstm_depth,
    }
    optimizer = nlp.begin_training(get_gold_tuples, **training_settings)
    if config.pretrained_tok2vec:
        _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
    return optimizer
|
||||
|
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
|
|||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||
"""
|
||||
with Path(loc).open('rb') as file_:
|
||||
with Path(loc).open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
||||
|
||||
|
||||
########################
|
||||
# Command line helpers #
|
||||
########################
|
||||
|
||||
|
||||
class Config(object):
|
||||
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
|
||||
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
|
||||
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
|
||||
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
|
||||
vectors_dir=None, pretrained_tok2vec=None):
|
||||
def __init__(
|
||||
self,
|
||||
vectors=None,
|
||||
max_doc_length=10,
|
||||
multitask_tag=False,
|
||||
multitask_sent=False,
|
||||
multitask_dep=False,
|
||||
multitask_vectors=None,
|
||||
bilstm_depth=0,
|
||||
nr_epoch=30,
|
||||
min_batch_size=100,
|
||||
max_batch_size=1000,
|
||||
batch_by_words=True,
|
||||
dropout=0.2,
|
||||
conv_depth=4,
|
||||
subword_features=True,
|
||||
vectors_dir=None,
|
||||
pretrained_tok2vec=None,
|
||||
):
|
||||
if vectors_dir is not None:
|
||||
if vectors is None:
|
||||
vectors = True
|
||||
|
@ -346,13 +396,13 @@ class Config(object):
|
|||
multitask_vectors = True
|
||||
for key, value in locals().items():
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
@classmethod
|
||||
def load(cls, loc, vectors_dir=None):
|
||||
with Path(loc).open('r', encoding='utf8') as file_:
|
||||
with Path(loc).open("r", encoding="utf8") as file_:
|
||||
cfg = json.load(file_)
|
||||
if vectors_dir is not None:
|
||||
cfg['vectors_dir'] = vectors_dir
|
||||
cfg["vectors_dir"] = vectors_dir
|
||||
return cls(**cfg)
|
||||
|
||||
|
||||
|
@ -364,43 +414,59 @@ class Dataset(object):
|
|||
self.text = None
|
||||
for file_path in self.path.iterdir():
|
||||
name = file_path.parts[-1]
|
||||
if section in name and name.endswith('conllu'):
|
||||
if section in name and name.endswith("conllu"):
|
||||
self.conllu = file_path
|
||||
elif section in name and name.endswith('txt'):
|
||||
elif section in name and name.endswith("txt"):
|
||||
self.text = file_path
|
||||
if self.conllu is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
raise IOError(msg.format(section=section, path=path))
|
||||
if self.text is None:
|
||||
msg = "Could not find .txt file in {path} for {section}"
|
||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
||||
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||
|
||||
|
||||
class TreebankPaths(object):
    """Train and dev datasets of a single treebank directory."""

    def __init__(self, ud_path, treebank, **cfg):
        """Locate the "train" and "dev" sections of a treebank.

        ud_path (Path): Directory containing the treebanks.
        treebank (unicode): Name of the treebank sub-directory.
        **cfg: Accepted but not used.
        """
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, "train")
        self.dev = Dataset(treebank_dir, "dev")
        # The language is taken from the training section.
        self.lang = self.train.lang
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"positional", None, str),
|
||||
corpus=(
|
||||
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||
config=("Path to json formatted config file", "option", "C", Path),
|
||||
limit=("Size limit", "option", "n", int),
|
||||
gpu_device=("Use GPU", "option", "g", int),
|
||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
||||
"option", "v", Path),
|
||||
vectors_dir=(
|
||||
"Path to directory with pre-trained vectors, named e.g. en/",
|
||||
"option",
|
||||
"v",
|
||||
Path,
|
||||
),
|
||||
)
|
||||
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
|
||||
use_oracle_segments=False):
|
||||
def main(
|
||||
ud_dir,
|
||||
parses_dir,
|
||||
corpus,
|
||||
config=None,
|
||||
limit=0,
|
||||
gpu_device=-1,
|
||||
vectors_dir=None,
|
||||
use_oracle_segments=False,
|
||||
):
|
||||
spacy.util.fix_random_seed()
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
||||
|
||||
if config is not None:
|
||||
config = Config.load(config, vectors_dir=vectors_dir)
|
||||
else:
|
||||
|
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
|||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit)
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||
|
||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||
for i in range(config.nr_epoch):
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length, limit=limit,
|
||||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments)
|
||||
docs, golds = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(),
|
||||
paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length,
|
||||
limit=limit,
|
||||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments,
|
||||
)
|
||||
Xs = list(zip(docs, golds))
|
||||
random.shuffle(Xs)
|
||||
if config.batch_by_words:
|
||||
|
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
|||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
|
||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
||||
drop=config.dropout, losses=losses)
|
||||
|
||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
)
|
||||
|
||||
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
if use_oracle_segments:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
||||
paths.dev.conllu, out_path)
|
||||
parsed_docs, scores = evaluate(
|
||||
nlp, paths.dev.conllu, paths.dev.conllu, out_path
|
||||
)
|
||||
else:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
||||
paths.dev.conllu, out_path)
|
||||
parsed_docs, scores = evaluate(
|
||||
nlp, paths.dev.text, paths.dev.conllu, out_path
|
||||
)
|
||||
print_progress(i, losses, scores)
|
||||
|
||||
|
||||
def _render_parses(i, to_render):
    """Write a displaCy dependency page for the first five docs.

    i (int): Batch number, used in the title of the first doc.
    to_render (list): Docs to visualise; only the first five are rendered.

    The page is written to the fixed path /tmp/parses.html.
    """
    first_doc = to_render[0]
    first_doc.user_data["title"] = "Batch %d" % i
    with Path("/tmp/parses.html").open("w") as html_file:
        page = displacy.render(to_render[:5], style="dep", page=True)
        html_file.write(page)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
|
|||
import pkg_resources
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import ujson
|
||||
import requests
|
||||
from wasabi import Printer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, locale_escape
|
||||
from ..util import prints, get_data_path, read_json
|
||||
from ..compat import path2str
|
||||
from ..util import get_data_path, read_json
|
||||
from .. import about
|
||||
|
||||
|
||||
def validate():
|
||||
"""Validate that the currently installed version of spaCy is compatible
|
||||
"""
|
||||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
|
||||
exits=1)
|
||||
compat = r.json()['spacy']
|
||||
msg = Printer()
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
current_compat = compat.get(about.__version__)
|
||||
if not current_compat:
|
||||
prints(about.__compatibility__, exits=1,
|
||||
title=Messages.M022.format(version=about.__version__))
|
||||
msg.fail(
|
||||
Messages.M022.format(version=about.__version__),
|
||||
about.__compatibility__,
|
||||
exits=1,
|
||||
)
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
|
@ -33,33 +39,38 @@ def validate():
|
|||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
||||
if not d['compat']}
|
||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
||||
if not d['compat']])
|
||||
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||
incompat_models.update(
|
||||
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||
)
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
spacy_dir = Path(__file__).parent.parent
|
||||
|
||||
msg.divider(Messages.M023.format(version=about.__version__))
|
||||
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||
|
||||
prints(path2str(Path(__file__).parent.parent),
|
||||
title=Messages.M023.format(version=about.__version__))
|
||||
if model_links or model_pkgs:
|
||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
||||
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
print(get_model_row(current_compat, name, data, 'package'))
|
||||
rows.append(get_model_row(current_compat, name, data, msg))
|
||||
for name, data in model_links.items():
|
||||
print(get_model_row(current_compat, name, data, 'link'))
|
||||
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
prints(Messages.M024, exits=0)
|
||||
msg.text(Messages.M024, exits=0)
|
||||
if update_models:
|
||||
cmd = ' python -m spacy download {}'
|
||||
print("\n " + Messages.M025)
|
||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
||||
msg.divider("Install updates")
|
||||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
prints(Messages.M025.format(version=about.__version__,
|
||||
models=', '.join(na_models)))
|
||||
msg.text(
|
||||
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
|
||||
)
|
||||
if incompat_links:
|
||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
||||
msg.text(Messages.M027.format(path=path2str(get_data_path())))
|
||||
if incompat_models or incompat_links:
|
||||
sys.exit(1)
|
||||
|
||||
|
@ -70,50 +81,48 @@ def get_model_links(compat):
|
|||
if data_path:
|
||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||
for model in models:
|
||||
meta_path = Path(model) / 'meta.json'
|
||||
meta_path = Path(model) / "meta.json"
|
||||
if not meta_path.exists():
|
||||
continue
|
||||
meta = read_json(meta_path)
|
||||
link = model.parts[-1]
|
||||
name = meta['lang'] + '_' + meta['name']
|
||||
links[link] = {'name': name, 'version': meta['version'],
|
||||
'compat': is_compat(compat, name, meta['version'])}
|
||||
name = meta["lang"] + "_" + meta["name"]
|
||||
links[link] = {
|
||||
"name": name,
|
||||
"version": meta["version"],
|
||||
"compat": is_compat(compat, name, meta["version"]),
|
||||
}
|
||||
return links
|
||||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
    """Collect installed packages whose names match known models.

    compat (dict): Compatibility table, passed on to `is_compat`.
    all_models (set): Names of all known models.
    RETURNS (dict): Maps each matching installed package name to a dict
        with the keys "name", "version" and "compat".
    """
    found = {}
    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
        # Dashes are replaced with underscores before the lookup.
        package = pkg_name.replace("-", "_")
        if package not in all_models:
            continue
        version = pkg_data.version
        found[pkg_name] = dict(
            name=package,
            version=version,
            compat=is_compat(compat, package, version),
        )
    return found
|
||||
|
||||
|
||||
def get_model_row(compat, name, data, msg, model_type="package"):
    """Build one table row describing an installed model.

    compat (dict): Maps model names to lists of versions.
    name (unicode): Display name of the package or link.
    data (dict): Model info with the keys "name", "version" and "compat".
    msg: Printer whose `text` method returns the formatted string when
        called with `no_print=True`.
    model_type (unicode): Value of the first column, e.g. "package" or
        "link".
    RETURNS (tuple): (model_type, name, model name, version, status).
    """
    if not data["compat"]:
        version = msg.text(data["version"], color="red", no_print=True)
        # Point at the first version listed for the model, or "n/a".
        suggested = compat.get(data["name"], ["n/a"])[0]
        return (model_type, name, data["name"], version, "--> {}".format(suggested))
    status = msg.text("", color="green", icon="good", no_print=True)
    version = msg.text(data["version"], color="green", no_print=True)
    return (model_type, name, data["name"], version, status)
|
||||
|
||||
|
||||
def is_model_path(model_path):
    """Check whether a path looks like a model directory.

    model_path (Path): The path to check.
    RETURNS (bool): True if the path is a directory that is neither a cache
        directory nor hidden (name starting with a dot).
    """
    name = model_path.parts[-1]
    if not model_path.is_dir():
        return False
    if name in ("cache", "pycache", "__pycache__"):
        return False
    return not name.startswith(".")
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
|
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
|
|||
|
||||
def reformat_version(version):
    """Hack to reformat old '-alpha' versions to match the pip format.

    version (unicode): The version string.
    RETURNS (unicode): The version with '-alpha' replaced by 'a0' if the
        string ends in '-alpha', and by 'a' otherwise.
    """
    # A bare trailing '-alpha' carries no number, so '0' is appended.
    replacement = "a0" if version.endswith("-alpha") else "a"
    return version.replace("-alpha", replacement)
|
||||
|
|
|
@ -1,59 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import json
|
||||
import spacy
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
|
||||
from ..vectors import Vectors
|
||||
from ..util import prints, ensure_path
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("model output directory", "positional", None, Path),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, Path),
    vectors_loc=("optional: location of vectors data, as numpy .npz",
                 "positional", None, str),
    prune_vectors=("optional: number of vectors to prune to.",
                   "option", "V", int)
)
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
    """Compile a vocabulary from a lexicon jsonl file and word vectors.

    lang (unicode): Language code of the blank model to create.
    output_dir (Path): Directory the model is saved to. Created if missing.
    lexemes_loc (Path): JSONL file with one lexeme entry (or one "settings"
        entry) per non-empty line.
    vectors_loc (unicode): Optional location of the vectors data.
    prune_vectors (int): Number of vectors to prune to. Values below 1
        disable pruning.
    RETURNS: The compiled pipeline, after it was written to `output_dir`.
    """
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    nlp = spacy.blank(lang)
    # Reset the ranks of pre-existing lexemes before loading the lexicon.
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    with lexemes_loc.open() as file_:
        for line in file_:
            if line.strip():
                attrs = json.loads(line)
                if 'settings' in attrs:
                    nlp.vocab.cfg.update(attrs['settings'])
                else:
                    lex = nlp.vocab[attrs['orth']]
                    lex.set_attrs(**attrs)
                    # Sanity check: the rank set from attrs matches the ID.
                    assert lex.rank == attrs['id']
                lex_added += 1
    if vectors_loc is not None:
        vector_data = numpy.load(vectors_loc.open('rb'))
        nlp.vocab.vectors = Vectors(data=vector_data)
        # Only lexemes with a non-zero rank are mapped to a vector row.
        for word in nlp.vocab:
            if word.rank:
                nlp.vocab.vectors.add(word.orth, row=word.rank)

    if prune_vectors >= 1:
        # The returned remapping is not needed here.
        nlp.vocab.prune_vectors(prune_vectors)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    vec_added = len(nlp.vocab.vectors)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Successfully compiled vocab and vectors, and saved model")
    return nlp
|
|
@ -5,7 +5,6 @@ import os
|
|||
import sys
|
||||
import ujson
|
||||
import itertools
|
||||
import locale
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
|
@ -136,12 +135,3 @@ def import_file(name, loc):
|
|||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def locale_escape(string, errors="replace"):
    """
    Mangle characters the locale's preferred encoding can't represent, so
    the text can be printed on terminals with a limited character set.

    string (unicode): The text to escape.
    errors (unicode): Error handler passed to `encode`, e.g. "replace" or
        "ignore".
    RETURNS (unicode): The text restricted to the preferred encoding.
    """
    encoding = locale.getpreferredencoding()
    # Decode with the same encoding that was used to encode. Decoding as
    # UTF-8 can raise UnicodeDecodeError when the locale encoding is a
    # non-UTF-8 codec such as latin-1.
    return string.encode(encoding, errors).decode(encoding)
|
||||
|
|
|
@ -5,7 +5,7 @@ from .render import DependencyRenderer, EntityRenderer
|
|||
from ..tokens import Doc, Span
|
||||
from ..compat import b_to_str
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
from ..util import prints, is_in_jupyter
|
||||
from ..util import is_in_jupyter
|
||||
|
||||
|
||||
_html = {}
|
||||
|
@ -72,14 +72,12 @@ def serve(
|
|||
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server("0.0.0.0", port, app)
|
||||
prints(
|
||||
"Using the '{}' visualizer".format(style),
|
||||
title="Serving on port {}...".format(port),
|
||||
)
|
||||
print("\nUsing the '{}' visualizer".format(style))
|
||||
print("Serving on port {}...\n".format(port))
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port {}.".format(port))
|
||||
print("Shutting down server on port {}.".format(port))
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
|
|
@ -278,6 +278,12 @@ class Errors(object):
|
|||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
||||
" can only be part of one entity, so make sure the entities you're "
|
||||
"setting don't overlap.")
|
||||
E104 = ("Can't find JSON schema for '{name}'.")
|
||||
# The replacement method is Doc.to_json(); there is no Doc.json().
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
        "Doc.to_json() instead.")
|
||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||
"settings: {opts}")
|
||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
113
spacy/gold.pyx
113
spacy/gold.pyx
|
@ -15,7 +15,7 @@ import json
|
|||
|
||||
import ujson
|
||||
|
||||
from . import _align
|
||||
from . import _align
|
||||
from .syntax import nonproj
|
||||
from .tokens import Doc
|
||||
from .errors import Errors
|
||||
|
@ -172,7 +172,7 @@ class GoldCorpus(object):
|
|||
def dev_tuples(self):
|
||||
locs = (self.tmp_dir / 'dev').iterdir()
|
||||
yield from self.read_tuples(locs, limit=self.limit)
|
||||
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
locs = (self.tmp_dir / 'train').iterdir()
|
||||
|
@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
|
|||
return c.lower()
|
||||
|
||||
|
||||
def read_json_object(json_corpus_section):
    """Yield GoldParse-style paragraphs from already loaded JSON documents
    (e.g. the parsed content of a training data file).

    json_corpus_section (list): The JSON-formatted documents.
    YIELDS (tuple): One reformatted paragraph at a time, across all
        documents and in their original order.
    """
    for json_doc in json_corpus_section:
        yield from json_to_tuple(json_doc)
|
||||
|
||||
|
||||
def json_to_tuple(doc):
    """Convert an item in the JSON-formatted training data to the tuple format
    used by GoldParse.

    doc (dict): One entry in the training data. Must have a 'paragraphs'
        list, each paragraph with a 'sentences' list, each sentence with a
        'tokens' list.
    YIELDS (list): One `[raw, sents]` pair per paragraph that has at least
        one sentence. `raw` is the paragraph's 'raw' text or None. Each item
        of `sents` is `[[ids, words, tags, heads, labels, ner], brackets]`.
    """
    for paragraph in doc['paragraphs']:
        sents = []
        for sent in paragraph['sentences']:
            words = []
            ids = []
            tags = []
            heads = []
            labels = []
            ner = []
            for i, token in enumerate(sent['tokens']):
                words.append(token['orth'])
                ids.append(i)
                tags.append(token.get('tag', '-'))
                # 'head' is an offset relative to the token, so adding the
                # token's index gives the head's index in the sentence.
                heads.append(token.get('head', 0) + i)
                labels.append(token.get('dep', ''))
                # Ensure ROOT label is case-insensitive
                if labels[-1].lower() == 'root':
                    labels[-1] = 'ROOT'
                ner.append(token.get('ner', '-'))
            sents.append([
                [ids, words, tags, heads, labels, ner],
                sent.get('brackets', [])])
        # Paragraphs without sentences are skipped.
        if sents:
            yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
|
@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
for doc in _json_iterate(loc):
|
||||
if docs_filter is not None and not docs_filter(doc):
|
||||
continue
|
||||
paragraphs = []
|
||||
for paragraph in doc['paragraphs']:
|
||||
sents = []
|
||||
for sent in paragraph['sentences']:
|
||||
words = []
|
||||
ids = []
|
||||
tags = []
|
||||
heads = []
|
||||
labels = []
|
||||
ner = []
|
||||
for i, token in enumerate(sent['tokens']):
|
||||
words.append(token['orth'])
|
||||
ids.append(i)
|
||||
tags.append(token.get('tag', '-'))
|
||||
heads.append(token.get('head', 0) + i)
|
||||
labels.append(token.get('dep', ''))
|
||||
# Ensure ROOT label is case-insensitive
|
||||
if labels[-1].lower() == 'root':
|
||||
labels[-1] = 'ROOT'
|
||||
ner.append(token.get('ner', '-'))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
if sents:
|
||||
yield [paragraph.get('raw', None), sents]
|
||||
for json_tuple in json_to_tuple(doc):
|
||||
yield json_tuple
|
||||
|
||||
|
||||
def _json_iterate(loc):
|
||||
|
@ -573,32 +597,19 @@ cdef class GoldParse:
|
|||
self.c.sent_start[i] = 0
|
||||
|
||||
|
||||
def docs_to_json(docs, underscore=None):
    """Serialize one or more Doc objects to the JSON-serializable format used
    by the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert. A single Doc is
        treated as a one-item list.
    underscore (list): Optional list of string names of custom doc._.
        attributes, passed on to Doc.to_json. Attribute values need to be
        JSON-serializable. Values will be added to an "_" key in the data,
        e.g. "_": {"foo": "bar"}.
    RETURNS (list): The data in spaCy's JSON format, one item per Doc.
    """
    doc_list = [docs] if isinstance(docs, Doc) else docs
    return [doc.to_json(underscore=underscore) for doc in doc_list]
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing='O'):
|
||||
|
|
|
@ -341,21 +341,3 @@ def test_lowest_common_ancestor(en_tokenizer):
|
|||
assert lca[1, 1] == 1
|
||||
assert lca[0, 1] == 2
|
||||
assert lca[1, 2] == 2
|
||||
|
||||
|
||||
def test_parse_tree(en_tokenizer):
    """Tests doc.print_tree() method."""
    tokens = en_tokenizer("I like New York in Autumn.")
    doc = get_doc(
        tokens.vocab,
        words=[t.text for t in tokens],
        heads=[1, 0, 1, -2, -3, -1, -5],
        tags=["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."],
    )
    # full method parse_tree(text) is a trivial composition
    trees = doc.print_tree()
    assert len(trees) > 0
    tree = trees[0]
    expected_keys = ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
    assert set(expected_keys).issubset(tree.keys())
    assert tree["word"] == "like"  # check root is correct
|
||||
|
|
65
spacy/tests/doc/test_to_json.py
Normal file
65
spacy/tests/doc/test_to_json.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.cli.schemas import get_schema, validate_json
|
||||
from spacy.tokens import Doc
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture()
def doc(en_vocab):
    """Three-token Doc with POS tags, fine-grained tags, heads, dependency
    labels and one ORG entity."""
    annotations = dict(
        words=["c", "d", "e"],
        pos=["VERB", "NOUN", "NOUN"],
        tags=["VBP", "NN", "NN"],
        heads=[0, -1, -2],
        deps=["ROOT", "dobj", "dobj"],
        ents=[(1, 2, "ORG")],
    )
    return get_doc(en_vocab, **annotations)
|
||||
|
||||
|
||||
def test_doc_to_json(doc):
    """Test the text, token and entity data produced by Doc.to_json()."""
    json_doc = doc.to_json()
    assert json_doc["text"] == "c d e "
    tokens = json_doc["tokens"]
    assert len(tokens) == 3
    first_token = tokens[0]
    assert first_token["pos"] == "VERB"
    assert first_token["tag"] == "VBP"
    assert first_token["dep"] == "ROOT"
    ents = json_doc["ents"]
    assert len(ents) == 1
    ent = ents[0]
    # Entity boundaries are character offsets, not token indices.
    assert ent["start"] == 2
    assert ent["end"] == 3
    assert ent["label"] == "ORG"
|
||||
|
||||
|
||||
def test_doc_to_json_underscore(doc):
|
||||
Doc.set_extension("json_test1", default=False)
|
||||
Doc.set_extension("json_test2", default=False)
|
||||
doc._.json_test1 = "hello world"
|
||||
doc._.json_test2 = [1, 2, 3]
|
||||
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
||||
assert "_" in json_doc
|
||||
assert json_doc["_"]["json_test1"] == "hello world"
|
||||
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
||||
|
||||
|
||||
def test_doc_to_json_underscore_error_attr(doc):
|
||||
"""Test that Doc.to_json() raises an error if a custom attribute doesn't
|
||||
exist in the ._ space."""
|
||||
with pytest.raises(ValueError):
|
||||
doc.to_json(underscore=["json_test3"])
|
||||
|
||||
|
||||
def test_doc_to_json_underscore_error_serialize(doc):
|
||||
"""Test that Doc.to_json() raises an error if a custom attribute value
|
||||
isn't JSON-serializable."""
|
||||
Doc.set_extension("json_test4", method=lambda doc: doc.text)
|
||||
with pytest.raises(ValueError):
|
||||
doc.to_json(underscore=["json_test4"])
|
||||
|
||||
|
||||
def test_doc_to_json_valid_training(doc):
|
||||
json_doc = doc.to_json()
|
||||
errors = validate_json([json_doc], get_schema("training"))
|
||||
assert not errors
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
|
|
|
@ -2,9 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||
from spacy.gold import docs_to_json
|
||||
from spacy.tokens import Doc
|
||||
from .util import get_doc
|
||||
|
||||
|
||||
def test_gold_biluo_U(en_vocab):
|
||||
|
@ -52,34 +50,3 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
|
|||
assert biluo_tags_converted == biluo_tags
|
||||
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
|
||||
assert offsets_converted == offsets
|
||||
|
||||
|
||||
def test_docs_to_json(en_vocab):
|
||||
"""Test we can convert a list of Doc objects into the JSON-serializable
|
||||
format we use for training.
|
||||
"""
|
||||
docs = [
|
||||
get_doc(
|
||||
en_vocab,
|
||||
words=["a", "b"],
|
||||
pos=["VBP", "NN"],
|
||||
heads=[0, -1],
|
||||
deps=["ROOT", "dobj"],
|
||||
ents=[],
|
||||
),
|
||||
get_doc(
|
||||
en_vocab,
|
||||
words=["c", "d", "e"],
|
||||
pos=["VBP", "NN", "NN"],
|
||||
heads=[0, -1, -2],
|
||||
deps=["ROOT", "dobj", "dobj"],
|
||||
ents=[(1, 2, "ORG")],
|
||||
),
|
||||
]
|
||||
json_doc = docs_to_json(0, docs)
|
||||
assert json_doc["id"] == 0
|
||||
assert len(json_doc["paragraphs"]) == 2
|
||||
assert len(json_doc["paragraphs"][0]["sentences"]) == 1
|
||||
assert len(json_doc["paragraphs"][1]["sentences"]) == 1
|
||||
assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
|
||||
assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3
|
||||
|
|
44
spacy/tests/test_json_schemas.py
Normal file
44
spacy/tests/test_json_schemas.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.cli.schemas import validate_json, get_schema
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def training_schema():
|
||||
return get_schema("training")
|
||||
|
||||
|
||||
def test_json_schema_get():
|
||||
schema = get_schema("training")
|
||||
assert schema
|
||||
with pytest.raises(ValueError):
|
||||
schema = get_schema("xxx")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
{"text": "Hello world"},
|
||||
{"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
|
||||
],
|
||||
)
|
||||
def test_json_schema_training_valid(data, training_schema):
|
||||
errors = validate_json([data], training_schema)
|
||||
assert not errors
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data,n_errors",
|
||||
[
|
||||
({"spans": []}, 1),
|
||||
({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2),
|
||||
({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1),
|
||||
({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1),
|
||||
({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
|
||||
],
|
||||
)
|
||||
def test_json_schema_training_invalid(data, n_errors, training_schema):
|
||||
errors = validate_json([data], training_schema)
|
||||
assert len(errors) == n_errors
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from spacy import util
|
||||
|
|
|
@ -20,7 +20,6 @@ from .span cimport Span
|
|||
from .token cimport Token
|
||||
from .span cimport Span
|
||||
from .token cimport Token
|
||||
from .printers import parse_tree
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs import intify_attrs, IDS
|
||||
|
@ -29,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
|||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
||||
from ..attrs cimport ENT_TYPE, SENT_START
|
||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
from ..util import normalize_slice
|
||||
from ..util import normalize_slice, is_json_serializable
|
||||
from ..compat import is_config, copy_reg, pickle, basestring_
|
||||
from ..errors import deprecation_warning, models_warning, user_warning
|
||||
from ..errors import Errors, Warnings
|
||||
|
@ -959,31 +958,48 @@ cdef class Doc:
|
|||
return self[start]
|
||||
|
||||
def print_tree(self, light=False, flat=False):
|
||||
"""Returns the parse trees in JSON (dict) format.
|
||||
raise ValueError(Errors.E105)
|
||||
|
||||
light (bool): Don't include lemmas or entities.
|
||||
flat (bool): Don't include arcs or modifiers.
|
||||
RETURNS (dict): Parse tree as dict.
|
||||
def to_json(self, underscore=None):
|
||||
"""Convert a Doc to JSON. Produces the same format used by the spacy
|
||||
train command.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
|
||||
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
|
||||
'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
underscore (list): Optional list of string names of custom doc._.
|
||||
attributes. Attribute values need to be JSON-serializable. Values will
|
||||
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
|
||||
RETURNS (dict): The data in spaCy's JSON format.
|
||||
"""
|
||||
return parse_tree(self, light=light, flat=flat)
|
||||
data = {'text': self.text}
|
||||
data['ents'] = [{'start': ent.start_char, 'end': ent.end_char,
|
||||
'label': ent.label_} for ent in self.ents]
|
||||
sents = list(self.sents)
|
||||
if sents:
|
||||
data['sents'] = [{'start': sent.start_char, 'end': sent.end_char}
|
||||
for sent in sents]
|
||||
if self.cats:
|
||||
data['cats'] = self.cats
|
||||
data['tokens'] = []
|
||||
for token in self:
|
||||
token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)}
|
||||
if token.pos_:
|
||||
token_data['pos'] = token.pos_
|
||||
if token.tag_:
|
||||
token_data['tag'] = token.tag_
|
||||
if token.dep_:
|
||||
token_data['dep'] = token.dep_
|
||||
if token.head:
|
||||
token_data['head'] = token.head.i
|
||||
data['tokens'].append(token_data)
|
||||
if underscore:
|
||||
data['_'] = {}
|
||||
for attr in underscore:
|
||||
if not self.has_extension(attr):
|
||||
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
||||
value = self._.get(attr)
|
||||
if not is_json_serializable(value):
|
||||
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
||||
data['_'][attr] = value
|
||||
return data
|
||||
|
||||
|
||||
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .doc import Doc
|
||||
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
|
||||
|
||||
|
||||
def merge_ents(doc):
|
||||
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
|
||||
for ent in doc.ents:
|
||||
ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
|
||||
return doc
|
||||
|
||||
|
||||
def format_POS(token, light, flat):
    """Helper: form the POS output for a token.

    token: The token to describe. Its text, lemma_, ent_type_, tag_, pos_ and
        dep_ attributes are read.
    light (bool): Don't include the lemma or entity type.
    flat (bool): Don't include the arc or the (empty) list of modifiers.
    RETURNS (dict): The token's entry for the parse tree.
    """
    subtree = {
        "word": token.text,
        "lemma": token.lemma_,  # trigger
        "NE": token.ent_type_,  # trigger
        "POS_fine": token.tag_,
        "POS_coarse": token.pos_,
        "arc": token.dep_,
        "modifiers": [],
    }
    if light:
        subtree.pop("lemma")
        subtree.pop("NE")
    if flat:
        subtree.pop("arc")
        subtree.pop("modifiers")
    return subtree


def POS_tree(root, light=False, flat=False):
    """Helper: generate a POS tree for a root token. The doc must have
    `merge_ents(doc)` ran on it.

    root: The token to start from. Its children attribute is iterated.
    light (bool): Don't include lemmas or entities, on any level of the tree.
    flat (bool): Don't include arcs or modifiers, i.e. only describe the root.
    RETURNS (dict): The (sub)tree rooted at the given token.
    """
    subtree = format_POS(root, light=light, flat=flat)
    if flat:
        # format_POS() dropped the "modifiers" list, so there is nothing to
        # attach children to. Previously this raised a KeyError for any root
        # that had children.
        return subtree
    for child in root.children:
        # Pass the settings down so the children use the same format as the
        # root. Previously they always fell back to the defaults.
        subtree["modifiers"].append(POS_tree(child, light=light, flat=flat))
    return subtree
|
||||
|
||||
|
||||
def parse_tree(doc, light=False, flat=False):
|
||||
"""Make a copy of the doc and construct a syntactic parse tree similar to
|
||||
displaCy. Generates the POS tree for all sentences in a doc.
|
||||
|
||||
doc (Doc): The doc for parsing.
|
||||
RETURNS (dict): The parse tree.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
|
||||
merge_ents(doc_clone) # merge the entities into single tokens first
|
||||
return [POS_tree(sent.root, light=light, flat=flat)
|
||||
for sent in doc_clone.sents]
|
|
@ -7,8 +7,6 @@ import pkg_resources
|
|||
import importlib
|
||||
import regex as re
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import textwrap
|
||||
import random
|
||||
from collections import OrderedDict
|
||||
from thinc.neural._classes.model import Model
|
||||
|
@ -18,9 +16,10 @@ import cytoolz
|
|||
import itertools
|
||||
import numpy.random
|
||||
|
||||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||
from .compat import import_file
|
||||
from .compat import import_file, json_dumps
|
||||
from .errors import Errors
|
||||
|
||||
# Import these directly from Thinc, so that we're sure we always have the
|
||||
|
@ -541,6 +540,16 @@ def read_json(location):
|
|||
return ujson.load(f)
|
||||
|
||||
|
||||
def write_json(file_path, contents):
    """Dump JSON-serializable contents to a .json file, creating or
    overwriting it.

    file_path (unicode / Path): The path to the output file.
    contents: The JSON-serializable contents to output.
    """
    output_path = Path(file_path)
    with output_path.open("w", encoding="utf8") as file_:
        serialized = json_dumps(contents)
        file_.write(serialized)
|
||||
|
||||
|
||||
def read_jsonl(file_path):
|
||||
"""Read a .jsonl file and yield its contents line by line.
|
||||
|
||||
|
@ -555,6 +564,29 @@ def read_jsonl(file_path):
|
|||
continue
|
||||
|
||||
|
||||
def write_jsonl(file_path, lines):
    """Dump a sequence of JSON-serializable objects to a .jsonl file, one
    object per line. Lines are separated by newlines; no newline is written
    after the last one.

    file_path (unicode / Path): The path to the output file.
    lines (list): The JSON-serializable contents of each line.
    """
    # Serialize everything up front, before the file is opened for writing
    serialized = list(map(json_dumps, lines))
    output_path = Path(file_path)
    with output_path.open("w", encoding="utf-8") as file_:
        file_.write("\n".join(serialized))
|
||||
|
||||
|
||||
def is_json_serializable(obj):
    """Check if a Python object is JSON-serializable.

    obj: The object to check.
    RETURNS (bool): True if the object can be dumped with ujson, False if
        it's callable or if dumping it fails.
    """
    if hasattr(obj, "__call__"):
        # Check this separately here to prevent infinite recursions
        return False
    try:
        ujson.dumps(obj)
    except (TypeError, OverflowError):
        # ujson raises TypeError for unsupported types, and OverflowError for
        # values it can't encode (NaN / infinity, ints that are too large,
        # structures that are nested too deeply or reference themselves).
        # Both mean "not serializable" and shouldn't propagate to the caller.
        return False
    return True
|
||||
|
||||
|
||||
def get_raw_input(description, default=False):
|
||||
"""Get user input from the command line via raw_input / input.
|
||||
|
||||
|
@ -602,21 +634,6 @@ def from_disk(path, readers, exclude):
|
|||
return path
|
||||
|
||||
|
||||
def print_table(data, title=None):
|
||||
"""Print data in table format.
|
||||
|
||||
data (dict or list of tuples): Label/value pairs.
|
||||
title (unicode or None): Title, will be printed above.
|
||||
"""
|
||||
if isinstance(data, dict):
|
||||
data = list(data.items())
|
||||
tpl_row = " {:<15}" * len(data[0])
|
||||
table = "\n".join([tpl_row.format(l, unicode_(v)) for l, v in data])
|
||||
if title:
|
||||
print("\n \033[93m{}\033[0m".format(title))
|
||||
print("\n{}\n".format(table))
|
||||
|
||||
|
||||
def print_markdown(data, title=None):
|
||||
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
||||
|
||||
|
@ -638,44 +655,6 @@ def print_markdown(data, title=None):
|
|||
print("\n{}\n".format("\n".join(markdown)))
|
||||
|
||||
|
||||
def prints(*texts, **kwargs):
    """Print a formatted, text-wrapped message, using manual ANSI escape
    sequences for the colours to avoid a dependency.

    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
    **kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
    """
    exits = kwargs.get("exits", None)
    title = kwargs.get("title", None)
    headline = ""
    if title:
        headline = "\033[93m{}\033[0m\n".format(_wrap(title))
    paragraphs = [_wrap(text) for text in texts]
    print("\n{}{}\n".format(headline, "\n\n".join(paragraphs)))
    if exits is not None:
        # The value of the argument is used as the exit code
        sys.exit(exits)
|
||||
|
||||
|
||||
def _wrap(text, wrap_max=80, indent=4):
|
||||
"""Wrap text at given width using textwrap module.
|
||||
|
||||
text (unicode): Text to wrap. If it's a Path, it's converted to string.
|
||||
wrap_max (int): Maximum line length (indent is deducted).
|
||||
indent (int): Number of spaces for indentation.
|
||||
RETURNS (unicode): Wrapped text.
|
||||
"""
|
||||
indent = indent * " "
|
||||
wrap_width = wrap_max - len(indent)
|
||||
if isinstance(text, Path):
|
||||
text = path2str(text)
|
||||
return textwrap.fill(
|
||||
text,
|
||||
width=wrap_width,
|
||||
initial_indent=indent,
|
||||
subsequent_indent=indent,
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False,
|
||||
)
|
||||
|
||||
|
||||
def minify_html(html):
|
||||
"""Perform a template-specific, rudimentary HTML minification for displaCy.
|
||||
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||
|
|
|
@ -320,37 +320,6 @@ p
|
|||
+cell dict
|
||||
+cell Combined tokenizer exceptions.
|
||||
|
||||
|
||||
+h(3, "util.prints") util.prints
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
||||
p
|
||||
| Print a formatted, text-wrapped message with optional title. If a text
|
||||
| argument is a #[code Path], it's converted to a string. Should only
|
||||
| be used for interactive components like the command-line interface.
|
||||
|
||||
+aside-code("Example").
|
||||
data_path = Path('/some/path')
|
||||
if not path.exists():
|
||||
util.prints("Can't find the path.", data_path,
|
||||
title="Error", exits=1)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code *texts]
|
||||
+cell unicode
|
||||
+cell Texts to print. Each argument is rendered as paragraph.
|
||||
|
||||
+row
|
||||
+cell #[code **kwargs]
|
||||
+cell -
|
||||
+cell
|
||||
| #[code title] is rendered as coloured headline. #[code exits]
|
||||
| performs system exit after printing, using the value of the
|
||||
| argument as the exit code, e.g. #[code exits=1].
|
||||
|
||||
|
||||
+h(3, "util.minibatch") util.minibatch
|
||||
+tag function
|
||||
+tag-new(2)
|
||||
|
|
|
@ -257,10 +257,19 @@ p
|
|||
| to allow packaging the model using the
|
||||
| #[+api("cli#package") #[code package]] command.
|
||||
|
||||
+infobox("Changed in v2.1", "⚠️")
|
||||
| As of spaCy 2.1, the #[code --no-tagger], #[code --no-parser] and
|
||||
| #[code --no-entities] flags have been replaced by a #[code --pipeline]
|
||||
| option, which lets you define comma-separated names of pipeline
|
||||
| components to train. For example, #[code --pipeline tagger,parser] will
|
||||
| only train the tagger and parser.
|
||||
|
||||
+code(false, "bash", "$", false, false, true).
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
|
||||
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
|
||||
[--no-entities] [--gold-preproc] [--verbose]
|
||||
python -m spacy train [lang] [output_path] [train_path] [dev_path]
|
||||
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-examples] [--use-gpu]
|
||||
[--version] [--meta-path] [--init-tok2vec] [--parser-multitasks]
|
||||
[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens]
|
||||
[--verbose]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -269,34 +278,34 @@ p
|
|||
+cell Model language.
|
||||
|
||||
+row
|
||||
+cell #[code output_dir]
|
||||
+cell #[code output_path]
|
||||
+cell positional
|
||||
+cell Directory to store model in.
|
||||
+cell Directory to store model in. Will be created if it doesn't exist.
|
||||
|
||||
+row
|
||||
+cell #[code train_data]
|
||||
+cell #[code train_path]
|
||||
+cell positional
|
||||
+cell Location of JSON-formatted training data.
|
||||
|
||||
+row
|
||||
+cell #[code dev_data]
|
||||
+cell #[code dev_path]
|
||||
+cell positional
|
||||
+cell Location of JSON-formatted development data for evaluation.
|
||||
|
||||
+row
|
||||
+cell #[code --n-iter], #[code -n]
|
||||
+cell #[code --base-model], #[code -b]
|
||||
+cell option
|
||||
+cell Number of iterations (default: #[code 30]).
|
||||
+cell
|
||||
| Optional name of base model to update. Can be any loadable
|
||||
| spaCy model.
|
||||
|
||||
+row
|
||||
+cell #[code --n-sents], #[code -ns]
|
||||
+cell #[code --pipeline], #[code -p]
|
||||
+tag-new("2.1.0")
|
||||
+cell option
|
||||
+cell Number of sentences (default: #[code 0]).
|
||||
|
||||
+row
|
||||
+cell #[code --use-gpu], #[code -g]
|
||||
+cell option
|
||||
+cell Use GPU.
|
||||
+cell
|
||||
| Comma-separated names of pipeline components to train. Defaults
|
||||
| to #[code 'tagger,parser,ner'].
|
||||
|
||||
+row
|
||||
+cell #[code --vectors], #[code -v]
|
||||
|
@ -304,13 +313,21 @@ p
|
|||
+cell Model to load vectors from.
|
||||
|
||||
+row
|
||||
+cell #[code --meta-path], #[code -m]
|
||||
+cell #[code --n-iter], #[code -n]
|
||||
+cell option
|
||||
+cell Number of iterations (default: #[code 30]).
|
||||
|
||||
+row
|
||||
+cell #[code --n-examples], #[code -ns]
|
||||
+cell option
|
||||
+cell Number of examples to use (defaults to #[code 0] for all examples).
|
||||
|
||||
+row
|
||||
+cell #[code --use-gpu], #[code -g]
|
||||
+cell option
|
||||
+cell
|
||||
| #[+tag-new(2)] Optional path to model
|
||||
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
||||
| All relevant properties like #[code lang], #[code pipeline] and
|
||||
| #[code spacy_version] will be overwritten.
|
||||
| Whether to use GPU. Can be either #[code 0], #[code 1] or
|
||||
| #[code -1].
|
||||
|
||||
+row
|
||||
+cell #[code --version], #[code -V]
|
||||
|
@ -320,40 +337,69 @@ p
|
|||
| #[code meta.json] after training.
|
||||
|
||||
+row
|
||||
+cell #[code --no-tagger], #[code -T]
|
||||
+cell flag
|
||||
+cell Don't train tagger.
|
||||
+cell #[code --meta-path], #[code -m]
|
||||
+tag-new(2)
|
||||
+cell option
|
||||
+cell
|
||||
| Optional path to model
|
||||
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
||||
| All relevant properties like #[code lang], #[code pipeline] and
|
||||
| #[code spacy_version] will be overwritten.
|
||||
|
||||
+row
|
||||
+cell #[code --no-parser], #[code -P]
|
||||
+cell flag
|
||||
+cell Don't train parser.
|
||||
+cell #[code --init-tok2vec], #[code -t2v]
|
||||
+tag-new("2.1.0")
|
||||
+cell option
|
||||
+cell
|
||||
| Path to pretrained weights for the token-to-vector parts of the
|
||||
| models. See #[code spacy pretrain]. Experimental.
|
||||
|
||||
+row
|
||||
+cell #[code --no-entities], #[code -N]
|
||||
+cell flag
|
||||
+cell Don't train NER.
|
||||
+cell #[code --parser-multitasks], #[code -pt]
|
||||
+cell option
|
||||
+cell
|
||||
| Side objectives for parser CNN, e.g. #[code 'dep'] or
|
||||
| #[code 'dep,tag']
|
||||
|
||||
+row
|
||||
+cell #[code --entity-multitasks], #[code -et]
|
||||
+cell option
|
||||
+cell
|
||||
| Side objectives for NER CNN, e.g. #[code 'dep'] or
|
||||
| #[code 'dep,tag']
|
||||
|
||||
+row
|
||||
+cell #[code --noise-level], #[code -nl]
|
||||
+cell option
|
||||
+cell Float indicating the amount of corruption for data augmentation.
|
||||
|
||||
+row
|
||||
+cell #[code --gold-preproc], #[code -G]
|
||||
+cell flag
|
||||
+cell Use gold preprocessing.
|
||||
|
||||
+row
|
||||
+cell #[code --learn-tokens], #[code -T]
|
||||
+cell flag
|
||||
+cell
|
||||
| Make parser learn gold-standard tokenization by merging
|
||||
| subtokens. Typically used for languages like Chinese.
|
||||
|
||||
+row
|
||||
+cell #[code --verbose], #[code -VV]
|
||||
+tag-new("2.0.13")
|
||||
+cell flag
|
||||
+cell Show more detailed messages during training.
|
||||
|
||||
+row
|
||||
+cell #[code --help], #[code -h]
|
||||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+row
|
||||
+cell #[code --verbose]
|
||||
+tag-new("2.0.13")
|
||||
+cell flag
|
||||
+cell Show more detail message during training.
|
||||
|
||||
+row("foot")
|
||||
+cell creates
|
||||
+cell model, pickle
|
||||
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
|
||||
+cell A spaCy model on each epoch.
|
||||
|
||||
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
||||
+tag-new(2)
|
||||
|
|
Loading…
Reference in New Issue
Block a user